############################################################
# plot_measuring_performance.py
############################################################
"""
Measuring Decision Tree performance
====================================
Demonstrates overfitting when testing on the training set.
"""
############################################################
# Get the data
from sklearn.datasets import load_boston
data = load_boston()
############################################################
# Train and test a model
from sklearn.tree import DecisionTreeRegressor
clf = DecisionTreeRegressor().fit(data.data, data.target)
predicted = clf.predict(data.data)
expected = data.target
############################################################
# Plot predicted as a function of expected
from matplotlib import pyplot as plt
plt.figure(figsize=(4, 3))
plt.scatter(expected, predicted)
plt.plot([0, 50], [0, 50], '--k')
plt.axis('tight')
plt.xlabel('True price ($1000s)')
plt.ylabel('Predicted price ($1000s)')
plt.tight_layout()
############################################################
# Pretty much no errors!
#
# This is too good to be true: we are testing the model on the training
# data, which is not a measure of generalization.
#
# **The results are not valid**
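############################################################
# For contrast, a minimal sketch (an addition, not part of the original
# example) of an honest evaluation: cross-validation holds out data
# before scoring.
from sklearn.model_selection import cross_val_score
scores = cross_val_score(DecisionTreeRegressor(), data.data, data.target, cv=5)
print("Cross-validated R^2: %.2f" % scores.mean())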
############################################################
# plot_pca.py
############################################################
"""
===============
Demo PCA in 2D
===============
"""
############################################################
# Load the iris data
from sklearn import datasets
iris = datasets.load_iris()
X = iris.data
y = iris.target
############################################################
# Fit a PCA
from sklearn.decomposition import PCA
pca = PCA(n_components=2, whiten=True)
pca.fit(X)
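############################################################
# As a quick sanity check (an addition, not in the original example),
# the fraction of total variance carried by each of the two components:
print(pca.explained_variance_ratio_)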
############################################################
# Project the data in 2D
X_pca = pca.transform(X)
############################################################
# Visualize the data
target_ids = range(len(iris.target_names))
from matplotlib import pyplot as plt
plt.figure(figsize=(6, 5))
for i, c, label in zip(target_ids, 'rgbcmykw', iris.target_names):
    plt.scatter(X_pca[y == i, 0], X_pca[y == i, 1],
                c=c, label=label)
plt.legend()
plt.show()
############################################################
# plot_linear_regression.py
############################################################
"""
A simple linear regression
===========================
"""
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
# x from 0 to 30
x = 30 * np.random.random((20, 1))
# y = a*x + b with noise
y = 0.5 * x + 1.0 + np.random.normal(size=x.shape)
# create a linear regression model
model = LinearRegression()
model.fit(x, y)
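# as a sketch (an addition): the fitted parameters should be close to the
# true slope 0.5 and intercept 1.0 used to generate the data
print("slope: %.3f, intercept: %.3f" % (model.coef_[0, 0], model.intercept_[0]))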
# predict y from the data
x_new = np.linspace(0, 30, 100)
y_new = model.predict(x_new[:, np.newaxis])
# plot the results
plt.figure(figsize=(4, 3))
ax = plt.axes()
ax.scatter(x, y)
ax.plot(x_new, y_new)
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.axis('tight')
plt.show()
############################################################
# plot_iris_scatter.py
############################################################
"""
Plot 2D views of the iris dataset
=================================
Plot a simple scatter plot of 2 features of the iris dataset.
Note that more elaborate visualization of this dataset is detailed
in the :ref:`statistics` chapter.
"""
# Load the data
from sklearn.datasets import load_iris
iris = load_iris()
from matplotlib import pyplot as plt
# The indices of the features that we are plotting
x_index = 0
y_index = 1
# this formatter will label the colorbar with the correct target names
formatter = plt.FuncFormatter(lambda i, *args: iris.target_names[int(i)])
plt.figure(figsize=(5, 4))
plt.scatter(iris.data[:, x_index], iris.data[:, y_index], c=iris.target)
plt.colorbar(ticks=[0, 1, 2], format=formatter)
plt.xlabel(iris.feature_names[x_index])
plt.ylabel(iris.feature_names[y_index])
plt.tight_layout()
plt.show()
############################################################
# plot_tsne.py
############################################################
"""
==========================
tSNE to visualize digits
==========================
Here we use :class:`sklearn.manifold.TSNE` to visualize the digits
dataset. Indeed, the digits are vectors in an 8*8 = 64 dimensional space.
We want to project them in 2D for visualization. tSNE is often a good
solution, as it groups and separates data points based on their local
relationships.
"""
############################################################
# Load the digits data
from sklearn import datasets
digits = datasets.load_digits()
# Take the first 500 data points: it's hard to see all 1797 points
X = digits.data[:500]
y = digits.target[:500]
############################################################
# Fit and transform with a TSNE
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, random_state=0)
############################################################
# Project the data in 2D
X_2d = tsne.fit_transform(X)
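############################################################
# Note (an addition): unlike PCA, TSNE has no separate transform step for
# new data; fit_transform is the only way to embed points.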
############################################################
# Visualize the data
target_ids = range(len(digits.target_names))
from matplotlib import pyplot as plt
plt.figure(figsize=(6, 5))
colors = 'r', 'g', 'b', 'c', 'm', 'y', 'k', 'w', 'orange', 'purple'
for i, c, label in zip(target_ids, colors, digits.target_names):
    plt.scatter(X_2d[y == i, 0], X_2d[y == i, 1], c=c, label=label)
plt.legend()
plt.show()
############################################################
# plot_linear_model_cv.py
############################################################
"""
================================================================
Use the RidgeCV and LassoCV to set the regularization parameter
================================================================
"""
############################################################
# Load the diabetes dataset
from sklearn.datasets import load_diabetes
data = load_diabetes()
X, y = data.data, data.target
print(X.shape)
############################################################
# Compute the cross-validation score with the default hyper-parameters
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Ridge, Lasso
for Model in [Ridge, Lasso]:
    model = Model()
    print('%s: %s' % (Model.__name__,
                      cross_val_score(model, X, y).mean()))
############################################################
# We compute the cross-validation score as a function of alpha, the
# strength of the regularization for Lasso and Ridge
import numpy as np
from matplotlib import pyplot as plt
alphas = np.logspace(-3, -1, 30)
plt.figure(figsize=(5, 3))
for Model in [Lasso, Ridge]:
    scores = [cross_val_score(Model(alpha), X, y, cv=3).mean()
              for alpha in alphas]
    plt.plot(alphas, scores, label=Model.__name__)
plt.legend(loc='lower left')
plt.xlabel('alpha')
plt.ylabel('cross validation score')
plt.tight_layout()
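############################################################
# The cross-validated estimators named in the title choose alpha
# automatically; a minimal sketch (an addition to the original code):
from sklearn.linear_model import RidgeCV, LassoCV
for Model in [RidgeCV, LassoCV]:
    model = Model(alphas=alphas).fit(X, y)
    print('%s chose alpha = %.4f' % (Model.__name__, model.alpha_))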
plt.show()
############################################################
# plot_variance_linear_regr.py
############################################################
"""
==================================================
Plot variance and regularization in linear models
==================================================
"""
import numpy as np
# Smaller figures
from matplotlib import pyplot as plt
plt.rcParams['figure.figsize'] = (3, 2)
############################################################
# We consider the situation where we have only 2 data points
X = np.c_[ .5, 1].T
y = [.5, 1]
X_test = np.c_[ 0, 2].T
############################################################
# Without noise, linear regression fits the data perfectly
from sklearn import linear_model
regr = linear_model.LinearRegression()
regr.fit(X, y)
plt.plot(X, y, 'o')
plt.plot(X_test, regr.predict(X_test))
############################################################
# In real-life situations, we have noise (e.g. measurement noise) in our data:
np.random.seed(0)
for _ in range(6):
    noisy_X = X + np.random.normal(loc=0, scale=.1, size=X.shape)
    plt.plot(noisy_X, y, 'o')
    regr.fit(noisy_X, y)
    plt.plot(X_test, regr.predict(X_test))
############################################################
# As we can see, our linear model captures and amplifies the noise in the
# data. It displays a lot of variance.
#
# We can use another linear estimator that uses regularization, the
# :class:`~sklearn.linear_model.Ridge` estimator. This estimator
# regularizes the coefficients by shrinking them towards zero, under the
# assumption that very high correlations are often spurious. The alpha
# parameter controls the amount of shrinkage used.
regr = linear_model.Ridge(alpha=.1)
np.random.seed(0)
for _ in range(6):
    noisy_X = X + np.random.normal(loc=0, scale=.1, size=X.shape)
    plt.plot(noisy_X, y, 'o')
    regr.fit(noisy_X, y)
    plt.plot(X_test, regr.predict(X_test))
plt.show()
############################################################
# plot_separator.py
############################################################
"""
Simple picture of the formal problem of machine learning
=========================================================
This example generates simple synthetic data points and shows a
separating hyperplane on them.
"""
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import SGDClassifier
from sklearn.datasets import make_blobs
# we create 50 separable synthetic points
X, Y = make_blobs(n_samples=50, centers=2,
                  random_state=0, cluster_std=0.60)
# fit the model
clf = SGDClassifier(loss="hinge", alpha=0.01,
n_iter=200, fit_intercept=True)
clf.fit(X, Y)
# plot the line, the points, and the nearest vectors to the plane
xx = np.linspace(-1, 5, 10)
yy = np.linspace(-1, 5, 10)
X1, X2 = np.meshgrid(xx, yy)
Z = np.empty(X1.shape)
for (i, j), val in np.ndenumerate(X1):
    x1 = val
    x2 = X2[i, j]
    p = clf.decision_function([[x1, x2]])
    Z[i, j] = p[0]
plt.figure(figsize=(4, 3))
ax = plt.axes()
ax.contour(X1, X2, Z, [-1.0, 0.0, 1.0], colors='k',
           linestyles=['dashed', 'solid', 'dashed'])
ax.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired)
ax.axis('tight')
plt.show()
############################################################
# plot_compare_classifiers.py
############################################################
"""
Compare classifiers on the digits data
=======================================
Compare the performance of a variety of classifiers on a test set for the
digits data.
"""
from sklearn import model_selection, datasets, metrics
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
digits = datasets.load_digits()
X = digits.data
y = digits.target
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.25, random_state=0)
for Model in [LinearSVC, GaussianNB, KNeighborsClassifier]:
    clf = Model().fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print('%s: %s' %
          (Model.__name__, metrics.f1_score(y_test, y_pred, average="macro")))
print('------------------')
# test LinearSVC loss functions
for loss in ['hinge', 'squared_hinge']:
    clf = LinearSVC(loss=loss).fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print("LinearSVC(loss='{0}'): {1}".format(loss,
          metrics.f1_score(y_test, y_pred, average="macro")))
print('-------------------')
# test the number of neighbors
for n_neighbors in range(1, 11):
    clf = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print("KNeighbors(n_neighbors={0}): {1}".format(n_neighbors,
          metrics.f1_score(y_test, y_pred, average="macro")))
############################################################
# plot_polynomial_regression.py
############################################################
"""
Plot fitting a 9th order polynomial
====================================
Fits data generated from a 9th order polynomial with models of 4th and
9th order, to demonstrate that simpler models are often to be preferred.
"""
import numpy as np
from matplotlib import pyplot as plt
from sklearn import linear_model
rng = np.random.RandomState(0)
x = 2*rng.rand(100) - 1
f = lambda t: 1.2 * t**2 + .1 * t**3 - .4 * t**5 - .5 * t**9
y = f(x) + .4 * rng.normal(size=100)
x_test = np.linspace(-1, 1, 100)
###########################################################################
# The data
plt.figure(figsize=(6, 4))
plt.scatter(x, y, s=4)
###########################################################################
# Fitting 4th and 9th order polynomials
#
# For this we need to engineer features: the n_th powers of x:
plt.figure(figsize=(6, 4))
plt.scatter(x, y, s=4)
X = np.array([x**i for i in range(5)]).T
X_test = np.array([x_test**i for i in range(5)]).T
regr = linear_model.LinearRegression()
regr.fit(X, y)
plt.plot(x_test, regr.predict(X_test), label='4th order')
X = np.array([x**i for i in range(10)]).T
X_test = np.array([x_test**i for i in range(10)]).T
regr = linear_model.LinearRegression()
regr.fit(X, y)
plt.plot(x_test, regr.predict(X_test), label='9th order')
plt.legend(loc='best')
plt.axis('tight')
plt.title('Fitting a 4th and a 9th order polynomial')
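###########################################################################
# An aside (an addition, not in the original): NumPy can build the same
# power-feature matrix directly with np.vander
assert np.allclose(np.vander(x, 5, increasing=True),
                   np.array([x**i for i in range(5)]).T)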
###########################################################################
# Ground truth
plt.figure(figsize=(6, 4))
plt.scatter(x, y, s=4)
plt.plot(x_test, f(x_test), label="truth")
plt.axis('tight')
plt.title('Ground truth (9th order polynomial)')
plt.show()
############################################################
# plot_boston_prediction.py
############################################################
"""
A simple regression analysis on the Boston housing data
========================================================
Here we perform a simple regression analysis on the Boston housing
data, exploring two types of regressors.
"""
from sklearn.datasets import load_boston
data = load_boston()
##############################################################################
# Plot a histogram of the quantity to predict: price
import matplotlib.pyplot as plt
plt.figure(figsize=(4, 3))
plt.hist(data.target)
plt.xlabel('price ($1000s)')
plt.ylabel('count')
plt.tight_layout()
##############################################################################
# Plot price as a function of each feature
for index, feature_name in enumerate(data.feature_names):
    plt.figure(figsize=(4, 3))
    plt.scatter(data.data[:, index], data.target)
    plt.ylabel('Price', size=15)
    plt.xlabel(feature_name, size=15)
    plt.tight_layout()
##############################################################################
# Simple prediction
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target)
from sklearn.linear_model import LinearRegression
clf = LinearRegression()
clf.fit(X_train, y_train)
predicted = clf.predict(X_test)
expected = y_test
plt.figure(figsize=(4, 3))
plt.scatter(expected, predicted)
plt.plot([0, 50], [0, 50], '--k')
plt.axis('tight')
plt.xlabel('True price ($1000s)')
plt.ylabel('Predicted price ($1000s)')
plt.tight_layout()
##############################################################################
# Prediction with gradient boosted tree
from sklearn.ensemble import GradientBoostingRegressor
clf = GradientBoostingRegressor()
clf.fit(X_train, y_train)
predicted = clf.predict(X_test)
expected = y_test
plt.figure(figsize=(4, 3))
plt.scatter(expected, predicted)
plt.plot([0, 50], [0, 50], '--k')
plt.axis('tight')
plt.xlabel('True price ($1000s)')
plt.ylabel('Predicted price ($1000s)')
plt.tight_layout()
##############################################################################
# Print the RMS error
import numpy as np
print("RMS: %r " % np.sqrt(np.mean((predicted - expected) ** 2)))
plt.show()
############################################################
# plot_iris_knn.py
############################################################
"""
Nearest-neighbor prediction on iris
====================================
Plot the decision boundary of nearest neighbor decision on iris, first
with a single nearest neighbor, and then using 3 nearest neighbors.
"""
import numpy as np
from matplotlib import pyplot as plt
from sklearn import neighbors, datasets
from matplotlib.colors import ListedColormap
# Create color maps for 3-class classification problem, as with iris
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
iris = datasets.load_iris()
X = iris.data[:, :2]  # we only take the first two features. We could
                      # avoid this ugly slicing by using a two-dim dataset
y = iris.target
knn = neighbors.KNeighborsClassifier(n_neighbors=1)
knn.fit(X, y)
x_min, x_max = X[:, 0].min() - .1, X[:, 0].max() + .1
y_min, y_max = X[:, 1].min() - .1, X[:, 1].max() + .1
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
                     np.linspace(y_min, y_max, 100))
Z = knn.predict(np.c_[xx.ravel(), yy.ravel()])
###############################################################################
# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure()
plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
# Plot also the training points
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)
plt.xlabel('sepal length (cm)')
plt.ylabel('sepal width (cm)')
plt.axis('tight')
###############################################################################
# And now, redo the analysis with 3 neighbors
knn = neighbors.KNeighborsClassifier(n_neighbors=3)
knn.fit(X, y)
Z = knn.predict(np.c_[xx.ravel(), yy.ravel()])
# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure()
plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
# Plot also the training points
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)
plt.xlabel('sepal length (cm)')
plt.ylabel('sepal width (cm)')
plt.axis('tight')
plt.show()
############################################################
# plot_digits_simple_classif.py
############################################################
"""
Simple visualization and classification of the digits dataset
=============================================================
Plot the first few samples of the digits dataset and a 2D representation
built using PCA, then do a simple classification
"""
from sklearn.datasets import load_digits
digits = load_digits()
###############################################################################
# Plot the data: images of digits
# -------------------------------
#
# Each data point is an 8x8 image
from matplotlib import pyplot as plt
fig = plt.figure(figsize=(6, 6)) # figure size in inches
fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)
for i in range(64):
    ax = fig.add_subplot(8, 8, i + 1, xticks=[], yticks=[])
    ax.imshow(digits.images[i], cmap=plt.cm.binary, interpolation='nearest')
    # label the image with the target value
    ax.text(0, 7, str(digits.target[i]))
###############################################################################
# Plot a projection on the first 2 principal axes
# ------------------------------------------------
plt.figure()
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
proj = pca.fit_transform(digits.data)
plt.scatter(proj[:, 0], proj[:, 1], c=digits.target)
plt.colorbar()
###############################################################################
# Classify with Gaussian naive Bayes
# ----------------------------------
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
# split the data into training and validation sets
X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target)
# train the model
clf = GaussianNB()
clf.fit(X_train, y_train)
# use the model to predict the labels of the test data
predicted = clf.predict(X_test)
expected = y_test
# Plot the prediction
fig = plt.figure(figsize=(6, 6)) # figure size in inches
fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)
# plot the digits: each image is 8x8 pixels
for i in range(64):
    ax = fig.add_subplot(8, 8, i + 1, xticks=[], yticks=[])
    ax.imshow(X_test.reshape(-1, 8, 8)[i], cmap=plt.cm.binary,
              interpolation='nearest')
    # label the image with the target value
    if predicted[i] == expected[i]:
        ax.text(0, 7, str(predicted[i]), color='green')
    else:
        ax.text(0, 7, str(predicted[i]), color='red')
###############################################################################
# Quantify the performance
# ------------------------
#
# First print the number of correct matches
matches = (predicted == expected)
print(matches.sum())
###############################################################################
# The total number of data points
print(len(matches))
###############################################################################
# And now, the ratio of correct predictions
print(matches.sum() / float(len(matches)))
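###############################################################################
# The same ratio is available directly as a metric (an addition):
from sklearn import metrics
print(metrics.accuracy_score(expected, predicted))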
###############################################################################
# Print the classification report
from sklearn import metrics
print(metrics.classification_report(expected, predicted))
###############################################################################
# Print the confusion matrix
print(metrics.confusion_matrix(expected, predicted))
plt.show()
############################################################
# plot_eigenfaces.py
############################################################
"""
The eigenfaces example: chaining PCA and SVMs
=============================================
The goal of this example is to show how an unsupervised method and a
supervised one can be chained for better prediction. It starts with a
didactic but lengthy way of doing things, and finishes with the
idiomatic approach to pipelining in scikit-learn.
Here we'll take a look at a simple facial recognition example. Ideally,
we would use a dataset consisting of a subset of the `Labeled Faces in
the Wild <http://vis-www.cs.umass.edu/lfw/>`__ data that is available
with :func:`sklearn.datasets.fetch_lfw_people`. However, this is a
relatively large download (~200MB) so we will do the tutorial on a
simpler, less rich dataset. Feel free to explore the LFW dataset.
"""
from sklearn import datasets
faces = datasets.fetch_olivetti_faces()
faces.data.shape
############################################################
# Let's visualize these faces to see what we're working with
from matplotlib import pyplot as plt
fig = plt.figure(figsize=(8, 6))
# plot several images
for i in range(15):
    ax = fig.add_subplot(3, 5, i + 1, xticks=[], yticks=[])
    ax.imshow(faces.images[i], cmap=plt.cm.bone)
############################################################
# .. tip::
#
# Note that these faces have already been localized and scaled to a
# common size. This is an important preprocessing piece for facial
# recognition, and is a process that can require a large collection of
# training data. This can be done in scikit-learn, but the challenge is
# gathering a sufficient amount of training data for the algorithm to work.
# Fortunately, this piece is common enough that it has been done. One good
# resource is
# `OpenCV <https://opencv.org>`__, the
# *Open Computer Vision Library*.
#
# We'll perform a Support Vector classification of the images. We'll do a
# typical train-test split on the images:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    faces.data, faces.target, random_state=0)
print(X_train.shape, X_test.shape)
############################################################
# Preprocessing: Principal Component Analysis
# -------------------------------------------
#
# 4096 dimensions (64x64 pixels) is a lot for an SVM. We can use PCA to
# reduce these 4096 features to a manageable size, while maintaining most
# of the information in the dataset.
from sklearn import decomposition
pca = decomposition.PCA(n_components=150, whiten=True)
pca.fit(X_train)
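############################################################
# A quick check (an addition, not in the original): the fraction of total
# variance retained by the 150 components
print(pca.explained_variance_ratio_.sum())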
############################################################
# One interesting part of PCA is that it computes the "mean" face, which
# can be interesting to examine:
plt.imshow(pca.mean_.reshape(faces.images[0].shape),
           cmap=plt.cm.bone)
############################################################
# The principal components measure deviations about this mean along
# orthogonal axes.
print(pca.components_.shape)
############################################################
# It is also interesting to visualize these principal components:
fig = plt.figure(figsize=(16, 6))
for i in range(30):
    ax = fig.add_subplot(3, 10, i + 1, xticks=[], yticks=[])
    ax.imshow(pca.components_[i].reshape(faces.images[0].shape),
              cmap=plt.cm.bone)
############################################################
# The components ("eigenfaces") are ordered by their importance from
# top-left to bottom-right. We see that the first few components seem to
# primarily take care of lighting conditions; the remaining components
# pull out certain identifying features: the nose, eyes, eyebrows, etc.
#
# With this projection computed, we can now project our original training
# and test data onto the PCA basis:
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
print(X_train_pca.shape)
############################################################
print(X_test_pca.shape)
############################################################
# These projected components correspond to factors in a linear combination
# of component images such that the combination approaches the original
# face.
#
# Doing the Learning: Support Vector Machines
# -------------------------------------------
#
# Now we'll perform support-vector-machine classification on this reduced
# dataset:
from sklearn import svm
clf = svm.SVC(C=5., gamma=0.001)
clf.fit(X_train_pca, y_train)
############################################################
# Finally, we can evaluate how well this classification did. First, we
# might plot a few of the test-cases with the labels learned from the
# training set:
import numpy as np
fig = plt.figure(figsize=(8, 6))
for i in range(15):
    ax = fig.add_subplot(3, 5, i + 1, xticks=[], yticks=[])
    ax.imshow(X_test[i].reshape(faces.images[0].shape),
              cmap=plt.cm.bone)
    y_pred = clf.predict(X_test_pca[i, np.newaxis])[0]
    color = ('black' if y_pred == y_test[i] else 'red')
    # title each panel with the predicted subject id
    ax.set_title(str(y_pred),
                 fontsize='small', color=color)
############################################################
# The classifier is correct on an impressive number of images given the
# simplicity of its learning model! Using a kernel SVM on only 150
# features derived from the pixel-level data, the algorithm correctly
# identifies a large number of the people in the images.
#
# Again, we can quantify this effectiveness using one of several measures
# from :mod:`sklearn.metrics`. First we can do the classification
# report, which shows the precision, recall and other measures of the
# "goodness" of the classification:
from sklearn import metrics
y_pred = clf.predict(X_test_pca)
print(metrics.classification_report(y_test, y_pred))
############################################################
# Another interesting metric is the *confusion matrix*, which indicates
# how often any two items are mixed-up. The confusion matrix of a perfect
# classifier would only have nonzero entries on the diagonal, with zeros
# on the off-diagonal:
print(metrics.confusion_matrix(y_test, y_pred))
############################################################
# Pipelining
# ----------
#
# Above we used PCA as a pre-processing step before applying our support
# vector machine classifier. Plugging the output of one estimator directly
# into the input of a second estimator is a commonly used pattern; for
# this reason scikit-learn provides a ``Pipeline`` object which automates
# this process. The above problem can be re-expressed as a pipeline as
# follows:
from sklearn.pipeline import Pipeline
clf = Pipeline([('pca', decomposition.PCA(n_components=150, whiten=True)),
                ('svm', svm.LinearSVC(C=1.0))])
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(metrics.confusion_matrix(y_test, y_pred))
plt.show()
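############################################################
# As a sketch (an addition): the pipeline exposes its steps'
# hyper-parameters for grid search with the step-name double-underscore
# convention, e.g. the SVM regularization parameter:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(clf, {'svm__C': [0.1, 1.0, 10.0]}, cv=3)
grid.fit(X_train, y_train)
print(grid.best_params_)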
############################################################
# A Note on Facial Recognition
# ----------------------------
#
# Here we have used PCA "eigenfaces" as a pre-processing step for facial
# recognition. The reason we chose this is because PCA is a
# broadly-applicable technique, which can be useful for a wide array of
# data types. Research in the field of facial recognition in particular,
# however, has shown that other, more specific feature extraction methods
# can be much more effective.
############################################################
# plot_svm_non_linear.py
############################################################
"""
Example of linear and non-linear models
========================================
This is an example plot from the tutorial which accompanies an explanation
of the support vector machine GUI.
"""
import numpy as np
from matplotlib import pyplot as plt
from sklearn import svm
##############################################################################
# data that is linearly separable
def linear_model(rseed=42, n_samples=30):
    """Generate data according to a linear model"""
    np.random.seed(rseed)
    data = np.random.normal(0, 10, (n_samples, 2))
    data[:n_samples // 2] -= 15
    data[n_samples // 2:] += 15
    labels = np.ones(n_samples)
    labels[:n_samples // 2] = -1
    return data, labels
X, y = linear_model()
clf = svm.SVC(kernel='linear')
clf.fit(X, y)
plt.figure(figsize=(6, 4))
ax = plt.subplot(111, xticks=[], yticks=[])
ax.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.bone)
ax.scatter(clf.support_vectors_[:, 0],
           clf.support_vectors_[:, 1],
           s=80, edgecolors="k", facecolors="none")
delta = 1
y_min, y_max = -50, 50
x_min, x_max = -50, 50
x = np.arange(x_min, x_max + delta, delta)
y = np.arange(y_min, y_max + delta, delta)
X1, X2 = np.meshgrid(x, y)
Z = clf.decision_function(np.c_[X1.ravel(), X2.ravel()])
Z = Z.reshape(X1.shape)
ax.contour(X1, X2, Z, [-1.0, 0.0, 1.0], colors='k',
           linestyles=['dashed', 'solid', 'dashed'])
##############################################################################
# data with a non-linear separation
def nonlinear_model(rseed=42, n_samples=30):
    """Generate data with a non-linear (circular) separation"""
    np.random.seed(rseed)  # the rseed argument was previously unused
    radius = 40 * np.random.random(n_samples)
    far_pts = radius > 20
    radius[far_pts] *= 1.2
    radius[~far_pts] *= 1.1
    theta = np.random.random(n_samples) * np.pi * 2
    data = np.empty((n_samples, 2))
    data[:, 0] = radius * np.cos(theta)
    data[:, 1] = radius * np.sin(theta)
    labels = np.ones(n_samples)
    labels[far_pts] = -1
    return data, labels
X, y = nonlinear_model()
clf = svm.SVC(kernel='rbf', gamma=0.001, coef0=0, degree=3)
clf.fit(X, y)
plt.figure(figsize=(6, 4))
ax = plt.subplot(1, 1, 1, xticks=[], yticks=[])
ax.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.bone, zorder=2)
ax.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1],
           s=80, edgecolors="k", facecolors="none")
delta = 1
y_min, y_max = -50, 50
x_min, x_max = -50, 50
x = np.arange(x_min, x_max + delta, delta)
y = np.arange(y_min, y_max + delta, delta)
X1, X2 = np.meshgrid(x, y)
Z = clf.decision_function(np.c_[X1.ravel(), X2.ravel()])
Z = Z.reshape(X1.shape)
ax.contour(X1, X2, Z, [-1.0, 0.0, 1.0], colors='k',
           linestyles=['dashed', 'solid', 'dashed'], zorder=1)
plt.show()
############################################################
# plot_bias_variance.py
############################################################
"""
====================================
Bias and variance of polynomial fit
====================================
Demo overfitting, underfitting, and validation and learning curves with
polynomial regression.
Fit polynomials of different degrees to a dataset: for too small a degree,
the model *underfits*, while for too large a degree, it overfits.
"""
import numpy as np
import matplotlib.pyplot as plt
def generating_func(x, err=0.5):
    return np.random.normal(10 - 1. / (x + 0.1), err)
############################################################
# A polynomial regression
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
############################################################
# A simple figure to illustrate the problem
n_samples = 8
np.random.seed(0)
x = 10 ** np.linspace(-2, 0, n_samples)
y = generating_func(x)
x_test = np.linspace(-0.2, 1.2, 1000)
titles = ['d = 1 (under-fit; high bias)',
          'd = 2',
          'd = 6 (over-fit; high variance)']
degrees = [1, 2, 6]
fig = plt.figure(figsize=(9, 3.5))
fig.subplots_adjust(left=0.06, right=0.98, bottom=0.15, top=0.85, wspace=0.05)
for i, d in enumerate(degrees):
    ax = fig.add_subplot(131 + i, xticks=[], yticks=[])
    ax.scatter(x, y, marker='x', c='k', s=50)
    model = make_pipeline(PolynomialFeatures(d), LinearRegression())
    model.fit(x[:, np.newaxis], y)
    ax.plot(x_test, model.predict(x_test[:, np.newaxis]), '-b')
    ax.set_xlim(-0.2, 1.2)
    ax.set_ylim(0, 12)
    ax.set_xlabel('house size')
    if i == 0:
        ax.set_ylabel('price')
    ax.set_title(titles[i])
############################################################
# Generate a larger dataset
from sklearn.model_selection import train_test_split
n_samples = 200
test_size = 0.4
error = 1.0
# randomly sample the data
np.random.seed(1)
x = np.random.random(n_samples)
y = generating_func(x, error)
# split into training, validation, and testing sets.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size)
# show the training and validation sets
plt.figure(figsize=(6, 4))
plt.scatter(x_train, y_train, color='red', label='Training set')
plt.scatter(x_test, y_test, color='blue', label='Test set')
plt.title('The data')
plt.legend(loc='best')
############################################################
# Plot a validation curve
from sklearn.model_selection import validation_curve
degrees = np.arange(1, 21)
model = make_pipeline(PolynomialFeatures(), LinearRegression())
# The parameter to vary is the "degrees" on the pipeline step
# "polynomialfeatures"
train_scores, validation_scores = validation_curve(
    model, x[:, np.newaxis], y,
    param_name='polynomialfeatures__degree',
    param_range=degrees)
# Plot the mean train error and validation error across folds
plt.figure(figsize=(6, 4))
plt.plot(degrees, validation_scores.mean(axis=1), lw=2,
         label='cross-validation')
plt.plot(degrees, train_scores.mean(axis=1), lw=2, label='training')
plt.legend(loc='best')
plt.xlabel('degree of fit')
plt.ylabel('explained variance')
plt.title('Validation curve')
plt.tight_layout()
############################################################
# Learning curves
############################################################
#
# Plot train and test error with an increasing number of samples
# A learning curve for d=1, 5, 15
from sklearn.model_selection import learning_curve
for d in [1, 5, 15]:
    model = make_pipeline(PolynomialFeatures(degree=d), LinearRegression())
    train_sizes, train_scores, validation_scores = learning_curve(
        model, x[:, np.newaxis], y,
        train_sizes=np.logspace(-1, 0, 20))
    # Plot the mean train error and validation error across folds
    plt.figure(figsize=(6, 4))
    plt.plot(train_sizes, validation_scores.mean(axis=1),
             lw=2, label='cross-validation')
    plt.plot(train_sizes, train_scores.mean(axis=1),
             lw=2, label='training')
    plt.ylim(-0.1, 1)
    plt.legend(loc='best')
    plt.xlabel('number of train samples')
    plt.ylabel('explained variance')
    plt.title('Learning curve (degree=%i)' % d)
    plt.tight_layout()
plt.show()
############################################################
# plot_ML_flow_chart.py
############################################################
"""
Tutorial Diagrams
-----------------
This script plots the flow-charts used in the scikit-learn tutorials.
"""
import numpy as np
from matplotlib import pyplot as pl
from matplotlib.patches import Circle, Rectangle, Polygon, Arrow, FancyArrow
def create_base(box_bg='#CCCCCC',
                arrow1='#88CCFF',
                arrow2='#88FF88',
                supervised=True):
    fig = pl.figure(figsize=(9, 6), facecolor='w')
    ax = pl.axes((0, 0, 1, 1),
                 xticks=[], yticks=[], frameon=False)
    ax.set_xlim(0, 9)
    ax.set_ylim(0, 6)
    patches = [Rectangle((0.3, 3.6), 1.5, 1.8, zorder=1, fc=box_bg),
               Rectangle((0.5, 3.8), 1.5, 1.8, zorder=2, fc=box_bg),
               Rectangle((0.7, 4.0), 1.5, 1.8, zorder=3, fc=box_bg),
               Rectangle((2.9, 3.6), 0.2, 1.8, fc=box_bg),
               Rectangle((3.1, 3.8), 0.2, 1.8, fc=box_bg),
               Rectangle((3.3, 4.0), 0.2, 1.8, fc=box_bg),
               Rectangle((0.3, 0.2), 1.5, 1.8, fc=box_bg),
               Rectangle((2.9, 0.2), 0.2, 1.8, fc=box_bg),
               Circle((5.5, 3.5), 1.0, fc=box_bg),
               Polygon([[5.5, 1.7],
                        [6.1, 1.1],
                        [5.5, 0.5],
                        [4.9, 1.1]], fc=box_bg),
               FancyArrow(2.3, 4.6, 0.35, 0, fc=arrow1,
                          width=0.25, head_width=0.5, head_length=0.2),
               FancyArrow(3.75, 4.2, 0.5, -0.2, fc=arrow1,
                          width=0.25, head_width=0.5, head_length=0.2),
               FancyArrow(5.5, 2.4, 0, -0.4, fc=arrow1,
                          width=0.25, head_width=0.5, head_length=0.2),
               FancyArrow(2.0, 1.1, 0.5, 0, fc=arrow2,
                          width=0.25, head_width=0.5, head_length=0.2),
               FancyArrow(3.3, 1.1, 1.3, 0, fc=arrow2,
                          width=0.25, head_width=0.5, head_length=0.2),
               FancyArrow(6.2, 1.1, 0.8, 0, fc=arrow2,
                          width=0.25, head_width=0.5, head_length=0.2)]
    if supervised:
        patches += [Rectangle((0.3, 2.4), 1.5, 0.5, zorder=1, fc=box_bg),
                    Rectangle((0.5, 2.6), 1.5, 0.5, zorder=2, fc=box_bg),
                    Rectangle((0.7, 2.8), 1.5, 0.5, zorder=3, fc=box_bg),
                    FancyArrow(2.3, 2.9, 2.0, 0, fc=arrow1,
                               width=0.25, head_width=0.5, head_length=0.2),
                    Rectangle((7.3, 0.85), 1.5, 0.5, fc=box_bg)]
    else:
        patches += [Rectangle((7.3, 0.2), 1.5, 1.8, fc=box_bg)]
    for p in patches:
        ax.add_patch(p)
    pl.text(1.45, 4.9, "Training\nText,\nDocuments,\nImages,\netc.",
            ha='center', va='center', fontsize=14)
    pl.text(3.6, 4.9, "Feature\nVectors",
            ha='left', va='center', fontsize=14)
    pl.text(5.5, 3.5, "Machine\nLearning\nAlgorithm",
            ha='center', va='center', fontsize=14)
    pl.text(1.05, 1.1, "New Text,\nDocument,\nImage,\netc.",
            ha='center', va='center', fontsize=14)
    pl.text(3.3, 1.7, "Feature\nVector",
            ha='left', va='center', fontsize=14)
    pl.text(5.5, 1.1, "Predictive\nModel",
            ha='center', va='center', fontsize=12)
    if supervised:
        pl.text(1.45, 3.05, "Labels",
                ha='center', va='center', fontsize=14)
        pl.text(8.05, 1.1, "Expected\nLabel",
                ha='center', va='center', fontsize=14)
        pl.text(8.8, 5.8, "Supervised Learning Model",
                ha='right', va='top', fontsize=18)
    else:
        pl.text(8.05, 1.1,
                "Likelihood\nor Cluster ID\nor Better\nRepresentation",
                ha='center', va='center', fontsize=12)
        pl.text(8.8, 5.8, "Unsupervised Learning Model",
                ha='right', va='top', fontsize=18)
def plot_supervised_chart(annotate=False):
    create_base(supervised=True)
    if annotate:
        fontdict = dict(color='r', weight='bold', size=14)
        pl.text(1.9, 4.55, 'X = vec.fit_transform(input)',
                fontdict=fontdict,
                rotation=20, ha='left', va='bottom')
        pl.text(3.7, 3.2, 'clf.fit(X, y)',
                fontdict=fontdict,
                rotation=20, ha='left', va='bottom')
        pl.text(1.7, 1.5, 'X_new = vec.transform(input)',
                fontdict=fontdict,
                rotation=20, ha='left', va='bottom')
        pl.text(6.1, 1.5, 'y_new = clf.predict(X_new)',
                fontdict=fontdict,
                rotation=20, ha='left', va='bottom')

def plot_unsupervised_chart():
    create_base(supervised=False)

if __name__ == '__main__':
    plot_supervised_chart(False)
    plot_supervised_chart(True)
    plot_unsupervised_chart()
    pl.show()