# Source code for rdm.wrappers.wordification.wordification

#
# Author: matic.perovsek@ijs.si
#

from collections import defaultdict
from math import log
import time
import string, itertools
import multiprocessing


def chunks(l, n):
    """ Yield n successive chunks from l.

    The chunk size is ``len(l) / n`` rounded half up. The first ``n - 1``
    chunks are slices of that size and the last chunk receives whatever
    remains, so chunks may be empty when ``l`` is short.

        :param l: sliceable sequence to split
        :param n: number of chunks to produce (``n == 0`` raises ZeroDivisionError)
    """
    chunk_size = int(1.0 * len(l) / n + 0.5)
    # ``range`` instead of the Python-2-only ``xrange`` keeps this usable on Python 3.
    for i in range(n - 1):
        start = i * chunk_size
        yield l[start:start + chunk_size]
    # The final chunk is open-ended so no trailing element is ever dropped.
    yield l[(n - 1) * chunk_size:]


def wordify_examples(args):
    """
    Builds the 'wordification' documents for a batch of target table examples.

    All inputs arrive packed in one tuple so the function can be handed a
    single argument.

        :param args: tuple ``(name_to_table, connecting_tables, context, index_by_value,
                     target_table_name, word_att_length, ex_idxs)``
        :return: one word list per example selected by ``ex_idxs``
        :rtype: list
    """
    # Unpack inside the body: tuple parameters in the signature are not valid on Python 3.
    (name_to_table, connecting_tables, context, index_by_value, target_table_name, word_att_length,
     ex_idxs) = args

    # The sentence cache is shared by all examples of this batch only.
    cached_sentences = defaultdict(dict)

    target_examples = name_to_table[target_table_name].getitemsref(ex_idxs)
    return [
        wordify_example(name_to_table, connecting_tables, context, cached_sentences, index_by_value,
                        target_table_name, word_att_length, target_table_name, ex, set())
        for ex in target_examples]


def _wordify_debug(*parts):
    """ Prints the given values separated by single spaces (valid on Python 2 and 3). """
    print(" ".join(str(part) for part in parts))


def wordify_example(name_to_table, connecting_tables, context, cached_sentences, index_by_value, target_table_name,
                    word_att_length, data_name, ex, searched_connections):
    """
    Recursively constructs the 'wordification' document for the given example.

        :param name_to_table: mapping from table name to table
        :param connecting_tables: mapping from table name to ``(sec_t_name, sec_fkey, prim_fkey)`` triples
        :param context: object exposing the ``pkeys`` and ``fkeys`` mappings, keyed by table name
        :param cached_sentences: two-level cache ``table name -> str(primary key value) -> words``;
                                 inner dicts must be created on demand (e.g. ``defaultdict(dict)``)
        :param index_by_value: ``table name -> key name -> str(value) -> list of row indexes``
        :param target_table_name: name of the target table, which is never descended into
        :param word_att_length: largest number of single words joined into one combined word
        :param data_name: name of the table that ``ex`` belongs to
        :param ex: Example for which the document is constructed
        :param searched_connections: set of ``(table name, key)`` pairs already followed on this path
        :return: list of words for ``ex`` (the cached list object itself)
        :rtype: list
    """
    debug = False
    data_name = str(data_name)

    if debug:
        _wordify_debug("======================================")
        _wordify_debug("example:", ex)
        _wordify_debug("table name:", data_name)
        _wordify_debug("searched_connections:", len(searched_connections), searched_connections)
        _wordify_debug("connecting_tables:", len(connecting_tables[data_name]), connecting_tables[data_name])

    # Tables without a registered primary key use False as their key value.
    ex_pkey_value = ex[str(context.pkeys[data_name])] if data_name in context.pkeys else False
    cache_key = str(ex_pkey_value)

    # NOTE(review): the cache is keyed by table and primary key only, while the words
    # built below also depend on searched_connections - confirm this is intended.
    if data_name not in cached_sentences or cache_key not in cached_sentences[data_name]:

        words = []  # word list for every example
        if debug:
            _wordify_debug("words:", len(words))

        # Construct words (tableName_attributeName_attributeValue) from the given table
        table_part = att_to_s(data_name)
        for att in name_to_table[data_name].domain.attributes:
            att_name = str(att.name)
            # NOTE(review): if context.pkeys[data_name] is a plain string this is a
            # substring test rather than an equality test - verify against the context class.
            if att_name not in context.pkeys[data_name] and att_name not in context.fkeys[data_name]:
                words.append(table_part + "_" + att_to_s(att.name) + "_" + att_to_s(ex[att]))

        # Words from pairs (and longer combinations) of attributes
        single_words = words[:]
        for comb_length in range(2, word_att_length + 1):
            words.extend("__".join(sorted(combination))
                         for combination in itertools.combinations(single_words, comb_length))

        # Apply the wordification methodology recursively on all connecting tables
        for sec_t_name, sec_fkey, prim_fkey in connecting_tables[data_name]:
            sec_t = name_to_table[sec_t_name]
            if debug:
                _wordify_debug("------------------")
                _wordify_debug("(sec_t,sec_fkey,prim):", (sec_t_name, sec_fkey, prim_fkey))
                _wordify_debug("search this table:",
                               (sec_t_name, sec_fkey) not in searched_connections
                               and sec_t_name != target_table_name)
                _wordify_debug("search this table:",
                               not prim_fkey or (data_name, sec_fkey) not in searched_connections)

            if (sec_t_name, sec_fkey) in searched_connections or sec_t_name == target_table_name:
                continue
            if prim_fkey and (data_name, sec_fkey) in searched_connections:
                continue

            if not prim_fkey:
                example_indexes = index_by_value[sec_t_name][str(sec_fkey)][cache_key]
            else:
                example_indexes = index_by_value[sec_t_name][str(prim_fkey)][str(ex[str(sec_fkey)])]

            # The extended path is the same for every connected row, so build it once.
            followed = set(searched_connections)
            followed.add((sec_t_name, sec_fkey))
            if prim_fkey:
                followed.add((data_name, prim_fkey))

            for sec_ex_idx in example_indexes:
                words.extend(wordify_example(name_to_table, connecting_tables, context, cached_sentences,
                                             index_by_value, target_table_name, word_att_length, sec_t_name,
                                             sec_t[sec_ex_idx], followed))

        cached_sentences[data_name][cache_key] = words

    return cached_sentences[data_name][cache_key]


def att_to_s(att):
    """
    Builds the "wordification" word fragment for an attribute.

    The value is converted to a string, title-cased, and stripped of all
    spaces and underscores.

        :param att: Orange attribute
        :rtype: str
    """
    titled = str(att).title()
    return "".join(char for char in titled if char not in (" ", "_"))


class Wordification(object):
    """
    Builds "wordified" documents (lists of words) for the examples of a target
    table, weights the words and exports the result as text or ARFF.
    """

    def __init__(self, target_table, other_tables, context, word_att_length=1, idf=None):
        """
        Wordification object constructor.

            :param target_table: Orange ExampleTable, representing the primary table
            :param other_tables: secondary tables, Orange ExampleTables
            :param context: object exposing ``connected`` and ``pkeys`` (``fkeys`` is read while wordifying)
            :param word_att_length: largest number of single words joined into one combined word
            :param idf: optional precomputed mapping ``word -> idf``; when given it is not recalculated
        """
        self.target_table = target_table
        self.other_tables = other_tables
        self.context = context
        self.word_att_length = word_att_length
        self.idf = idf

        self.connecting_tables = defaultdict(list)
        self.cached_sentences = defaultdict(dict)
        self.resulting_documents = []
        self.resulting_classes = []
        self.word_in_how_many_documents = defaultdict(int)
        self.tf_idfs = defaultdict(dict)
        self.name_to_table = {}

        all_tables = [target_table] + other_tables

        # Finds table connections
        for primary_table in all_tables:
            self.name_to_table[primary_table.name] = primary_table
            for secondary_table in all_tables:
                table_pair = (primary_table.name, secondary_table.name)
                if table_pair not in self.context.connected:
                    continue
                for primary_key, foreign_key in self.context.connected[table_pair]:
                    # Only connections that start at the primary key of the first table are followed.
                    if self.context.pkeys[primary_table.name] == primary_key:
                        self.connecting_tables[primary_table.name].append(
                            (secondary_table.name, foreign_key, None))

        # For every connection, index the rows of the secondary table by the value of the connecting key.
        self.index_by_value = {}
        for table in all_tables:
            self.index_by_value[table.name] = {}

        for connections in self.connecting_tables.values():
            for sec_t_name, sec_fkey, prim_fkey in connections:
                sec_t = self.name_to_table[sec_t_name]
                table_index = self.index_by_value[sec_t.name]
                if not prim_fkey:
                    table_index[sec_fkey] = defaultdict(list)
                    for i, ex in enumerate(sec_t):
                        table_index[sec_fkey][str(ex[str(sec_fkey)])].append(i)
                elif prim_fkey not in table_index:
                    # Every connection registered above has prim_fkey set to None,
                    # so this branch is not reached by them.
                    table_index[prim_fkey] = defaultdict(list)
                    for i, ex in enumerate(sec_t):
                        table_index[prim_fkey][str(ex[str(prim_fkey)])].append(i)

    def run(self, num_of_processes=multiprocessing.cpu_count()):
        """
        Applies the wordification methodology on the target table

            :param num_of_processes: number of chunks the target table is split into;
                                     the chunks are processed one after another in this process
        """
        # class + wordification on every example of the main table
        # No worker pool is created: the chunks were never dispatched to one,
        # so starting the worker processes only wasted resources.
        indices = chunks(list(range(len(self.target_table))), num_of_processes)

        for ex_idxs in indices:
            self.resulting_documents.extend(
                wordify_examples((self.name_to_table, self.connecting_tables, self.context,
                                  self.index_by_value, self.target_table.name,
                                  self.word_att_length, ex_idxs)))

        for ex in self.target_table:
            self.resulting_classes.append(ex.get_class())

    def calculate_weights(self, measure='tfidf'):
        """
        Counts word frequency and calculates tf-idf values for words in every document.

        The result is stored in ``self.tf_idfs`` as ``document index -> word -> weight``.

            :param measure: example weights approach (can be one of ``tfidf, binary, tf``).
        """
        # TODO replace with spipy matrices (and calculate with scikit)
        if measure == 'tfidf':
            self.calculate_idf()

        for doc_idx, document in enumerate(self.resulting_documents):
            word_counts = defaultdict(int)
            for word in document:
                word_counts[word] += 1

            weights = {}
            for word, count in word_counts.items():
                if measure == "binary":
                    weights[word] = 1
                elif measure == "tf":
                    weights[word] = count
                else:
                    # Words without a known idf get no weight at all.
                    idf = self.idf[word] if word in self.idf else None
                    if idf is not None:
                        weights[word] = count * idf
            self.tf_idfs[doc_idx] = weights

    def calculate_idf(self):
        """
        Calculates the inverse document frequency of every word in the resulting documents.

        A non-empty ``self.idf`` is reused as is. Otherwise the document frequencies are
        counted into ``self.word_in_how_many_documents`` and ``self.idf`` is rebuilt from them.

            :return: mapping ``word -> idf``
            :rtype: dict
            :raises Exception: if document frequencies exist although no idf is available
        """
        if self.idf:
            return self.idf
        if len(self.word_in_how_many_documents) != 0:
            raise Exception('Words in document occurence already calculated!')

        for document in self.resulting_documents:
            # set(): a word counts once per document, however often it occurs in it.
            for word in set(document):
                self.word_in_how_many_documents[word] += 1

        no_of_documents = len(self.resulting_documents)
        self.idf = {}
        for word, count in self.word_in_how_many_documents.items():
            self.idf[word] = log(no_of_documents / float(count))
        return self.idf

    def to_arff(self):
        '''
        Returns the "wordified" representation in ARFF.

        Word attributes and class values are written in sorted order so that the
        output is deterministic.

            :rtype: str
        '''
        words = sorted(set(word for document in self.resulting_documents for word in document))
        class_values = sorted(set(str(a) for a in self.resulting_classes))

        parts = ["@RELATION " + self.target_table.name + "\n\n"]
        for word in words:
            parts.append("@ATTRIBUTE\t'" + word.replace("'", "") + "'\tREAL\n")
        parts.append("@ATTRIBUTE\tclass\t{" + ",".join(class_values) + "}\n\n@DATA\n")

        for doc_idx in range(len(self.resulting_documents)):
            weights = self.tf_idfs[doc_idx]
            features = [str(weights[word]) if word in weights else "0" for word in words]
            features.append(str(self.resulting_classes[doc_idx]))
            parts.append(",".join(features) + "\n")

        return "".join(parts)

    def prune(self, minimum_word_frequency_percentage=1):
        """
        Filter out words that occur in less than the given percentage of documents.

            :param minimum_word_frequency_percentage: minimum percentage of documents
                                                      a word has to occur in to be kept
        """
        document_frequency = self.word_in_how_many_documents
        if not document_frequency:
            # Frequencies are only filled in by calculate_idf(). Count them locally when
            # they are missing, leaving the attribute untouched so that a later
            # calculate_idf() still works.
            document_frequency = defaultdict(int)
            for document in self.resulting_documents:
                for word in set(document):
                    document_frequency[word] += 1

        threshold = minimum_word_frequency_percentage / 100. * len(self.resulting_documents)
        self.resulting_documents = [
            [word for word in document if document_frequency.get(word, 0) >= threshold]
            for document in self.resulting_documents]

    def wordify(self):
        """
        Constructs string of all documents.

        Every line has the form ``!<class> <word> <word> ...``.

            :return: document representation of the dataset, one line per document
            :rtype: str
        """
        string_documents = []
        for klass, document in zip(self.resulting_classes, self.resulting_documents):
            string_documents.append("!" + str(klass) + " " + " ".join(document))
        return "\n".join(string_documents)

    def att_to_s(self, att):
        """
        Constructs a "wordification" word for the given attribute

            :param att: Orange attribute
            :rtype: str
        """
        return str(att).title().replace(' ', '').replace('_', '')