Source code for rdm.wrappers.wordification.wordification

#
# Author: matic.perovsek@ijs.si
#

from collections import defaultdict
from math import log
import time
import string, itertools
import multiprocessing


def chunks(l, n):
    """ Yield n successive chunks from l.
    """
    newn = int(1.0 * len(l) / n + 0.5)
    for i in xrange(0, n - 1):
        yield l[i * newn:i * newn + newn]
    yield l[n * newn - newn:]


def wordify_examples(
        (name_to_table, connecting_tables, context, index_by_value, target_table_name, word_att_length, ex_idxs)):
    cached_sentences = defaultdict(dict)

    return [
        wordify_example(name_to_table, connecting_tables, context, cached_sentences, index_by_value, target_table_name,
                        word_att_length, target_table_name, ex, set([])) for ex in
        name_to_table[target_table_name].getitemsref(ex_idxs)]


def wordify_example(name_to_table, connecting_tables, context, cached_sentences, index_by_value, target_table_name,
                    word_att_length, data_name, ex, searched_connections):
    """
    Recursively constructs the 'wordification' document for the given example.

        :param data: The given examples ExampleTable
        :param ex: Example for which the document is constructed
    """
    debug = False
    data_name = str(data_name)

    if debug:
        print "======================================"
        print "example:", ex
        print "table name:", data_name
        print "searched_connections:", len(searched_connections), searched_connections
        print "connecting_tables:", len(connecting_tables[data_name]), connecting_tables[data_name]

    ex_pkey_value = data_name in context.pkeys and ex[str(context.pkeys[data_name])]

    if not data_name in cached_sentences or not str(ex_pkey_value) in cached_sentences[data_name]:

        words = []  # word list for every example
        if debug:
            print "words:", len(words)
        # Construct words (tableName_attributeName_attributeValue) from the given table
        for att in name_to_table[data_name].domain.attributes:
            if not str(att.name) in context.pkeys[data_name] and not str(att.name) in context.fkeys[data_name]:
                words.append(att_to_s(data_name) + "_" + att_to_s(att.name) + "_" + att_to_s(ex[att]))

        # Words from pairs of attributes
        single_words = words[:]
        for comb_length in range(word_att_length + 1):
            if comb_length > 1:
                words.extend(["__".join(sorted(b)) for b in itertools.combinations(single_words, comb_length)])

        # Apply the wordification methodology recursively on all connecting tables
        for sec_t_name, sec_fkey, prim_fkey in connecting_tables[data_name]:
            sec_t = name_to_table[sec_t_name]
            if debug:
                print "------------------"
                print "(sec_t,sec_fkey,prim):", (sec_t_name, sec_fkey, prim_fkey)
                print "search this table:", not (sec_t_name,
                                                 sec_fkey) in searched_connections and sec_t_name != target_table_name
                print "search this table:", not prim_fkey or not (data_name,
                                                                  sec_fkey) in searched_connections  # and sec_t!=self.target_table
            if not (sec_t_name, sec_fkey) in searched_connections and sec_t_name != target_table_name and (
                not prim_fkey or not (data_name, sec_fkey) in searched_connections):
                example_indexes = index_by_value[sec_t_name][str(sec_fkey)][str(ex_pkey_value)] if not prim_fkey else \
                index_by_value[sec_t_name][str(prim_fkey)][str(ex[str(sec_fkey)])]

                for sec_ex_idx in example_indexes:
                    words += wordify_example(name_to_table, connecting_tables, context, cached_sentences,
                                             index_by_value, target_table_name, word_att_length, sec_t_name,
                                             sec_t[sec_ex_idx], searched_connections | set(
                            [(sec_t_name, sec_fkey), prim_fkey and (data_name, prim_fkey)]))

        cached_sentences[data_name][str(ex_pkey_value)] = words

    return cached_sentences[data_name][str(ex_pkey_value)]


def att_to_s(att):
    """
    Constructs a "wordification" word for the given attribute

        :param att: Orange attribute
        :rtype: str
    """
    return str(att).title().replace(' ', '').replace('_', '')


[docs]class Wordification(object):
[docs]    def __init__(self, target_table, other_tables, context, word_att_length=1, idf=None):
        """
        Wordification object constructor.
        
            :param target_table: Orange ExampleTable, representing the primary table
            :param other_tables: secondary tables, Orange ExampleTables
        """
        self.target_table = target_table
        self.other_tables = other_tables
        self.context = context
        self.word_att_length = word_att_length
        self.idf = idf

        self.connecting_tables = defaultdict(list)
        self.cached_sentences = defaultdict(dict)
        self.resulting_documents = []
        self.resulting_classes = []

        self.word_in_how_many_documents = defaultdict(int)
        self.tf_idfs = defaultdict(dict)
        self.name_to_table = {}

        # Finds table connections
        for primary_table in [target_table] + other_tables:
            self.name_to_table[primary_table.name] = primary_table
            for secondary_table in [target_table] + other_tables:
                if (primary_table.name, secondary_table.name) in self.context.connected:
                    for primary_key, foreign_key in self.context.connected[(primary_table.name, secondary_table.name)]:
                        if self.context.pkeys[primary_table.name] == primary_key:
                            self.connecting_tables[primary_table.name].append((secondary_table.name, foreign_key, None))

        self.index_by_value = {}
        for table in [target_table] + other_tables:
            self.index_by_value[table.name] = {}
        for sec_t_name, sec_fkey, prim_fkey in [item for sublist in self.connecting_tables.values() for item in
                                                sublist]:
            sec_t = self.name_to_table[sec_t_name]
            if not prim_fkey:
                self.index_by_value[sec_t.name][sec_fkey] = defaultdict(list)
                for i, ex in enumerate(sec_t):
                    self.index_by_value[sec_t.name][sec_fkey][str(ex[str(sec_fkey)])].append(i)
            else:
                if not prim_fkey in self.index_by_value[sec_t.name]:
                    self.index_by_value[sec_t.name][prim_fkey] = defaultdict(list)

                for i, ex in enumerate(sec_t):
                    self.index_by_value[sec_t.name][prim_fkey][str(ex[str(prim_fkey)])].append(i)

[docs]    def run(self, num_of_processes=multiprocessing.cpu_count()):
        """
        Applies the wordification methodology on the target table

            :param num_of_processes: number of processes
        """

        # class + wordification on every example of the main table

        p = multiprocessing.Pool(num_of_processes)

        indices = chunks(range(len(self.target_table)), num_of_processes)  # )

        for ex_idxs in indices:
            self.resulting_documents.extend(wordify_examples((self.name_to_table, self.connecting_tables, self.context,
                                                              self.index_by_value, self.target_table.name,
                                                              self.word_att_length, ex_idxs)))
        p.close()
        p.join()

        for i, ex in enumerate(self.target_table):
            self.resulting_classes.append(ex.get_class())

[docs]    def calculate_weights(self, measure='tfidf'):
        """
        Counts word frequency and calculates tf-idf values for words in every document.

            :param measure: example weights approach (can be one of ``tfidf, binary, tf``).
        """
        from math import log

        # TODO replace with spipy matrices (and calculate with scikit)

        if measure == 'tfidf':
            self.calculate_idf()

        for doc_idx, document in enumerate(self.resulting_documents):
            train_word_count = defaultdict(int)
            self.tf_idfs[doc_idx] = {}
            for word in document:
                train_word_count[word] += 1

            for word in document:
                if measure == "binary":
                    tf = 1
                    idf = 1
                else:
                    tf = train_word_count[word]
                    idf = 1 if measure == "tf" else (self.idf[word] if word in self.idf else None)

                if idf != None:
                    self.tf_idfs[doc_idx][word] = tf * idf

    def calculate_idf(self):
        if self.idf:
            return self.idf
        elif len(self.word_in_how_many_documents) != 0:
            raise Exception('Words in document occurence already calculated!')
        else:
            for document in self.resulting_documents:
                for word in set(document):
                    self.word_in_how_many_documents[word] += 1

            no_of_documents = len(self.resulting_documents)
            self.idf = {}
            for word, count in self.word_in_how_many_documents.items():
                self.idf[word] = log(no_of_documents / float(self.word_in_how_many_documents[word]))

[docs]    def to_arff(self):
        '''
        Returns the "wordified" representation in ARFF.

            :rtype: str
        '''
        arff_string = "@RELATION " + self.target_table.name + "\n\n"
        words = set()
        for document in self.resulting_documents:
            for word in document:
                words.add(word)

        for i, word in enumerate(words):
            arff_string += "@ATTRIBUTE\t'" + word.replace("'", "") + "'\tREAL\n"

        arff_string += "@ATTRIBUTE\tclass\t{" + string.join(set([str(a) for a in self.resulting_classes]),
                                                            ",") + "}\n\n@DATA\n"

        for doc_idx in range(len(self.resulting_documents)):
            features = []
            for word in words:
                if word in self.tf_idfs[doc_idx]:
                    features.append(str(self.tf_idfs[doc_idx][word]))
                else:
                    features.append("0")
            features.append(str(self.resulting_classes[doc_idx]))

            arff_string += string.join(features, ',')
            arff_string += "\n"

        return arff_string

[docs]    def prune(self, minimum_word_frequency_percentage=1):
        """
        Filter out words that occur less than minimum_word_frequency times.

            :param minimum_word_frequency_percentage: minimum frequency of words to keep
        """
        pruned_resulting_documents = []

        for document in self.resulting_documents:
            new_document = []
            for word in document:
                if self.word_in_how_many_documents[word] >= minimum_word_frequency_percentage / 100. * len(
                        self.resulting_documents):
                    new_document.append(word)
            pruned_resulting_documents.append(new_document)
        self.resulting_documents = pruned_resulting_documents

[docs]    def wordify(self):
        """
        Constructs string of all documents.

            :return: document representation of the dataset, one line per document
            :rtype: str
        """
        string_documents = []
        for klass, document in zip(self.resulting_classes, self.resulting_documents):
            string_documents.append("!" + str(klass) + " " + string.join(document, " "))
        return string.join(string_documents, "\n")

[docs]    def att_to_s(self, att):
        """
        Constructs a "wordification" word for the given attribute

            :param att: Orange attribute
        """
        return str(att).title().replace(' ', '').replace('_', '')