5. Putting it all together

5.1. Pipelining

We have seen that some estimators can transform data, and some estimators can predict variables. We can create combined estimators:

_images/pca_digits_spectrum.png
>>> from scikits.learn import linear_model, decomposition, datasets

>>> logistic = linear_model.LogisticRegression()
>>> pca = decomposition.PCA()
>>> from scikits.learn.pipeline import Pipeline
>>> pipe = Pipeline(steps=[('pca', pca), ('logistic', logistic)])

>>> digits = datasets.load_digits()
>>> X_digits = digits.data
>>> y_digits = digits.target
>>> pca.fit(X_digits, y_digits)
>>> pl.plot(pca.explained_variance_)

Parameters of pipelines can be set using ‘__’ separated parameter names:

>>> # 'pca__n_components' addresses the n_components parameter of the step
>>> # named 'pca'; the pca object itself reflects the new value below.
>>> pipe._set_params(pca__n_components=30)
Pipeline(steps=[('pca', PCA(copy=True, n_components=30, whiten=False)), ('logistic', LogisticRegression(C=1.0, intercept_scaling=1, dual=False, fit_intercept=True,
      penalty='l2', tol=0.0001))])
>>> pca.n_components
30

>>> from scikits.learn.grid_search import GridSearchCV
>>> n_components = [10, 15, 20, 30, 40, 50, 64]
>>> Cs = np.logspace(-4, 4, 16)
>>> estimator = GridSearchCV(pipe,
...                          dict(pca__n_components=n_components,
...                               logistic_C=logistic_Cs),
...                          n_jobs=-1)
>>> estimator.fit(X_digits, y_digits)

5.2. Face recognition with eigenfaces

The dataset used in this example is a preprocessed excerpt of the “Labeled Faces in the Wild” dataset, aka LFW:

http://vis-www.cs.umass.edu/lfw/lfw-funneled.tgz (233MB)
from scikits.learn.cross_val import StratifiedKFold
from scikits.learn.datasets import fetch_lfw_people
from scikits.learn.grid_search import GridSearchCV
from scikits.learn.decomposition import RandomizedPCA
from scikits.learn.svm import SVC

# Fetch the LFW people data (downloaded only if not already on disk) and
# load it as numpy arrays
lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)

# flatten every face so that the data takes the traditional
# (n_samples, n_features) shape
faces = lfw_people.data
n_samples, h, w = faces.shape
n_features = h * w
X = faces.reshape((n_samples, n_features))

# the prediction target is the id of the person
target_names = lfw_people.target_names
y = lfw_people.target

# split into a training and testing set, keeping only the first of the
# 4 stratified folds
# the next() builtin is used instead of the iterator's .next() method so
# the line runs on Python 2.6+ and Python 3 alike
train, test = next(iter(StratifiedKFold(y, k=4)))
X_train, X_test = X[train], X[test]
y_train, y_test = y[train], y[test]

# Unsupervised feature extraction / dimensionality reduction: fit a PCA on
# the training faces (treated as an unlabeled dataset); its components,
# reshaped back to images, are the eigenfaces
n_components = 150
pca = RandomizedPCA(n_components=n_components, whiten=True)
pca = pca.fit(X_train)
eigenfaces = pca.components_.reshape((n_components, h, w))

# express both splits in the PCA basis
X_train_pca, X_test_pca = pca.transform(X_train), pca.transform(X_test)

# Train a SVM classification model: grid-search C and gamma of an RBF-kernel
# SVC on the PCA-transformed training data
param_grid = dict(C=[1, 5, 10, 50, 100],
                  gamma=[0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1])
clf = GridSearchCV(SVC(kernel='rbf'), param_grid,
                   fit_params={'class_weight': 'auto'},
                   verbose=1)
clf = clf.fit(X_train_pca, y_train)
# single-argument print(...) prints the same text on Python 2 and Python 3
print(clf.best_estimator)

# Quantitative evaluation of the model quality on the test set
from scikits.learn import metrics
y_pred = clf.predict(X_test_pca)
# single-argument print(...) prints the same text on Python 2 and Python 3
print(metrics.classification_report(y_test, y_pred,
                                    target_names=target_names))
# labels is passed as a concrete list so it is a real sequence on Python 3
# too (range() already returns a list on Python 2)
print(metrics.confusion_matrix(y_test, y_pred,
                               labels=list(range(len(target_names)))))


# Plot the results: the first 8 test faces on a 2 x 4 grid, each titled
# with its true and predicted label
import pylab as pl
first_faces = zip(X_test[:8], y_test[:8], y_pred[:8])
for index, (img, label_true, label_pred) in enumerate(first_faces):
    axes = pl.subplot(2, 4, index + 1)
    axes.imshow(img.reshape(h, w), cmap=pl.cm.gray)
    pl.title('%s, prediction: %s' % (label_true, label_pred))
prediction eigenfaces
Prediction Eigenfaces

Expected results for the top 5 most represented people in the dataset:

                   precision    recall  f1-score   support

Gerhard_Schroeder       0.91      0.75      0.82        28
  Donald_Rumsfeld       0.84      0.82      0.83        33
       Tony_Blair       0.65      0.82      0.73        34
     Colin_Powell       0.78      0.88      0.83        58
    George_W_Bush       0.93      0.86      0.90       129

      avg / total       0.86      0.84      0.85       282

5.3. Open problem: stock market structure

Can we predict the variation in stock prices for Google?

import datetime
from matplotlib import finance
import numpy as np

################################################################################
# Manual toggle: with `if 1`, the quotes are downloaded and the results are
# cached in 'variation.npy' / 'names.npy'; change it to `if 0` to reload the
# cached arrays instead of downloading again.
if 1:
    # Choose a reasonably calm time period: before the 2008 crash
    # and after Google's start
    d1 = datetime.datetime(2004, 8, 19)
    # plain 1 rather than 01: leading-zero literals are octal on Python 2
    # and a syntax error on Python 3
    d2 = datetime.datetime(2008, 1, 1)

    # ticker symbol -> display name
    symbol_dict = {
            'TOT'  : 'Total',
            'XOM'  : 'Exxon',
            'CVX'  : 'Chevron',
            'COP'  : 'ConocoPhillips',
            'VLO'  : 'Valero Energy',
            'MSFT' : 'Microsoft',
            'B'    : 'Barnes group',
            'EK'   : 'Eastman kodak',
            'IBM'  : 'IBM',
            'TWX'  : 'Time Warner',
            'CMCSA': 'Comcast',
            'CVC'  : 'Cablevision',
            'YHOO' : 'Yahoo',
            'DELL' : 'Dell',
            'DE'   : 'Deere and company',
            'HPQ'  : 'Hewlett-Packard',
            'AMZN' : 'Amazon',
            'TM'   : 'Toyota',
            'CAJ'  : 'Canon',
            'MTU'  : 'Mitsubishi',
            'SNE'  : 'Sony',
            'EMR'  : 'Emerson electric',
            'F'    : 'Ford',
            'HMC'  : 'Honda',
            'NAV'  : 'Navistar',
            'NOC'  : 'Northrop Grumman',
            'BA'   : 'Boeing',
            'KO'   : 'Coca Cola',
            'MMM'  : '3M',
            'MCD'  : 'Mc Donalds',
            'PEP'  : 'Pepsi',
            'KFT'  : 'Kraft Foods',
            'K'    : 'Kellogg',
            'VOD'  : 'Vodaphone',
            'UN'   : 'Unilever',
            'MAR'  : 'Marriott',
            'PG'   : 'Procter Gamble',
            'CL'   : 'Colgate-Palmolive',
            'NWS'  : 'News Corporation',
            'GE'   : 'General Electrics',
            'WFC'  : 'Wells Fargo',
            'JPM'  : 'JPMorgan Chase',
            'AIG'  : 'AIG',
            'AXP'  : 'American express',
            'BAC'  : 'Bank of America',
            'GS'   : 'Goldman Sachs',
            'PMI'  : 'PMI group',
            'AAPL' : 'Apple',
            'SAP'  : 'SAP',
            'CSCO' : 'Cisco',
            'QCOM' : 'Qualcomm',
            'HAL'  : 'Haliburton',
            'HTCH' : 'Hutchinson',
            'JDSU' : 'JDS uniphase',
            'TXN'  : 'Texas instruments',
            'O'    : 'Reality income',
            'UPS'  : 'UPS',
            'BP'   : 'BP',
            'L'    : 'Loews corporation',
            'M'    : "Macy's",
            'S'    : 'Sprint nextel',
            'XRX'  : 'Xerox',
            'WYNN' : 'Wynn resorts',
            'DIS'  : 'Walt disney',
            'WFR'  : 'MEMC electronic materials',
            'UTX'  : 'United Technology corp',
            'X'    : 'United States Steel corp',
            'LMT'  : 'Lookheed Martin',
            'WMT'  : 'Wal-Mart',
            'WAG'  : 'Walgreen',
            'HD'   : 'Home Depot',
            'GSK'  : 'GlaxoSmithKline',
            'PFE'  : 'Pfizer',
            'SNY'  : 'Sanofi-Aventis',
            'NVS'  : 'Novartis',
            'KMB'  : 'Kimberly-Clark',
            'R'    : 'Ryder',
            'GD'   : 'General Dynamics',
            'RTN'  : 'Raytheon',
            'CVS'  : 'CVS',
            'CAT'  : 'Caterpillar',
            'DD'   : 'DuPont de Nemours',
            'MON'  : 'Monsanto',
            'CLF'  : 'Cliffs natural ressources',
            'BTU'  : 'Peabody energy',
            'ACI'  : 'Arch Coal',
            # 'Patriot coal corp' used to be listed here under the key
            # 'BTU' as well, which silently replaced the 'Peabody energy'
            # name above.  Its own ticker is not given in this file, so the
            # entry is dropped rather than guessed.
            # NOTE(review): the name 'PPC' below looks like a typo for
            # 'PPG' -- confirm before changing the label.
            'PPG'  : 'PPC',
            'CMI'  : 'Cummins common stock',
            'JNJ'  : 'Johnson and johnson',
            'ABT'  : 'Abbott laboratories',
            'MRK'  : 'Merck and co',
            'T'    : 'AT and T',
            'VZ'   : 'Verizon',
            'FTR'  : 'Frontiers communication',
            'CTL'  : 'Centurylink',
            'MO'   : 'Altria group',
            'NLY'  : 'Annaly capital management',
            'QQQ'  : 'Powershares',
            'BMY'  : 'Bristal-myers squibb',
            'LLY'  : 'Eli lilly and co',
            'C'    : 'Citigroup',
            'MS'   : 'Morgan Stanley',
            'TGT'  : 'Target corporation',
            'SCHW' : 'Charles schwad',
            'ETFC' : 'E*Trade',
            'AMTD' : 'TD ameritrade holding',
            'INTC' : 'Intel',
            'AMD'  : 'AMD',
            'NOK'  : 'Nokia',
            'MU'   : 'Micron technologies',
            'NVDA' : 'Nvidia',
            'MRVL' : 'Marvel technology group',
            'SNDK' : 'Sandisk',
            'RIMM' : 'Research in mention',
            'EMC'  : 'EMC',
            'ORCL' : 'Oracle',
            'LOW'  : "Lowe's",
            'BBY'  : 'Best buy',
            'FDX'  : 'Fedex',
            'FE'   : 'First energy',
            'JNPR' : 'Juniper',
            'GOOG' : 'Google',
            'AMAT' : 'Applied material',
            '^DJI' : 'Dow Jones Industrial average',
            '^DJA' : 'Dow Jones Composite average',
            '^DJT' : 'Dow Jones Transportation average',
            '^DJU' : 'Dow Jones Utility average',
            '^IXIC': 'Nasdaq composite',
            #'^FCHI': 'CAC40',
        }

    # list(...) so that the (symbol, name) pairs form a real sequence on
    # Python 3 as well, where dict.items() is a view
    symbols, names = np.array(list(symbol_dict.items())).T

    quotes = [finance.quotes_historical_yahoo(symbol, d1, d2, asobject=True)
              for symbol in symbols]

    #volumes = np.array([q.volume for q in quotes]).astype(np.float)
    # The arrays are not called `open` / `close` so that the builtin open()
    # is not shadowed; one row per symbol, which assumes every symbol
    # returned the same number of quotes.
    # astype(float) replaces the np.float alias, which newer NumPy removed.
    open_price = np.array([q.open for q in quotes]).astype(float)
    close_price = np.array([q.close for q in quotes]).astype(float)
    variation = close_price - open_price
    np.save('variation.npy', variation)
    np.save('names.npy', names)
else:
    names = np.load('names.npy')
    variation = np.load('variation.npy')
################################################################################
# Separate Google (the target y) from all the other symbols (the inputs X,
# transposed so that each symbol becomes a column); n holds the names of
# the symbols kept in X
is_google = names == 'Google'
y = variation[is_google].squeeze()
X = variation[~is_google].T
n = names[~is_google]