Example use case¶

import orange

from rdm.db import DBVendor, DBConnection, DBContext, RSDConverter, mapper
from rdm.wrappers import RSD
from rdm.validation import cv_split
from rdm.helpers import arff_to_orange_table

# Provide connection information
connection = DBConnection(
    'ilp',  # User
    'ilp123',  # Password
    'ged.ijs.si',  # Host
    'imdb_top',  # Database
    vendor=DBVendor.MySQL
)

# Define learning context
context = DBContext(connection, target_table='movies', target_att='quality')

# Cross-validation loop
predictions = []
folds = 10
for train_context, test_context in cv_split(context, folds=folds, random_seed=0):
    # Find features on the train set
    conv = RSDConverter(train_context)
    rsd = RSD()
    features, train_arff, _ = rsd.induce(
        conv.background_knowledge(),   # Background knowledge
        examples=conv.all_examples(),  # Training examples
        cn2sd=False                    # Disable built-in subgroup discovery
    )

    # Train the classifier on the *train set*
    train_data = arff_to_orange_table(train_arff)
    tree_classifier = orange.TreeLearner(train_data, max_depth=5)

    # Map the *test set* using the features from the train set
    test_arff = mapper.domain_map(features, 'rsd', train_context, test_context, format='arff')

    # Classify the test set
    test_data = arff_to_orange_table(test_arff)
    fold_predictions = [(ex[-1], tree_classifier(ex)) for ex in test_data]
    predictions.append(fold_predictions)

acc = 0
for fold_predictions in predictions:
    acc += sum([1.0 for actual, predicted in fold_predictions if actual == predicted])/len(fold_predictions)
acc = 100 * acc/folds

print 'Estimated predictive accuracy: {0:.2f}%'.format(acc)