# TD 1 - 14 septembre 2015

### Introduction Python, numpy, matplotlib

tao.lri.fr/tiki-index.php?page=Courses : Module Apprentissage, TD 1

Python est un langage :

# III - Scikit-learn

## Machine Learning in Python

http://scikit-learn.org/

In [66]:
import sklearn # if not installed : pip install sklearn


## 1) Decision Tree

http://scikit-learn.org/stable/modules/tree.html

In [67]:
# Minimal toy dataset: two 2-D samples with binary labels.
X = [[0, 0], [1, 1]]  # X: n_samples x n_features — the data matrix
y = [0, 1]            # y: n_samples — one label per sample

In [68]:
from sklearn import tree

# Docs: http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
# clf is an instance of the DecisionTreeClassifier class.
clf = tree.DecisionTreeClassifier()

In [69]:
clf = clf.fit(X, y) # fit: trains the classifier on the data (returns the fitted estimator)

In [70]:
clf.predict([[2., 2.]]) # predict: predicts the label of a new sample

Out[70]:
array([1])
In [71]:
clf.predict_proba([[2., 2.]]) # predict_proba: gives the probability distribution over the classes

Out[71]:
array([[ 0.,  1.]])
In [72]:
clf.score(X,y) # evaluates the classifier's accuracy on (X, y)
# equivalently: score = list(clf.predict(X) == y).count(True) / len(y)

Out[72]:
1.0
In [73]:
clf.feature_importances_  # relative importance of each feature learned by the tree (sums to 1)

Out[73]:
array([ 1.,  0.])

### a) Regression

data : 1 dimension

target : scalaire

In [74]:
# Build a noisy 1-D regression dataset: y = sin(x) on [0, 5),
# with outliers injected at every 5th sample.
rng = np.random.RandomState(1)        # fixed seed for reproducibility
samples = 5 * rng.rand(80, 1)         # 80 points drawn uniformly in [0, 5)
X = np.sort(samples, axis=0)
y = np.sin(X).ravel()
y[::5] += 3 * (0.5 - rng.rand(16))    # perturb 16 of the 80 targets

In [75]:
from sklearn import tree

# Fit an unconstrained regression tree: with no depth limit it can
# memorize the training set (train score will be 1.0).
clf_1 = tree.DecisionTreeRegressor()
clf_1.fit(X, y)
score_train = clf_1.score(X, y)

# Predict on a dense grid; the noiseless sin(x) serves as ground truth
# for the test score.
X_test = np.arange(0.0, 5.0, 0.01)[:, np.newaxis]
y_1 = clf_1.predict(X_test)
score_test = clf_1.score(X_test, np.sin(X_test))

# Plot the results
plt.figure()
plt.scatter(X, y, c="k", label="data")
plt.plot(X_test, y_1, c="g", label="", linewidth=2)
plt.xlabel("data")
plt.ylabel("target")
title = ("Decision Tree Regression. score train = "
         + str(np.round(score_train, 2))
         + " score test = " + str(np.round(score_test, 2)))
plt.title(title)
plt.legend()
plt.show()

In [76]:
# BUG FIX: the loop body lost its indentation during notebook export,
# which makes this cell a syntax error. Reconstructed: everything after
# the `for` header belongs inside the loop (one fit + one figure per depth,
# scores accumulated for the learning-curve plot in the next cell).
list_score_test = []
list_score_train = []
for max_depth in np.arange(1, 10):

    # Fit regression model with a capped depth
    clf_1 = tree.DecisionTreeRegressor(max_depth=max_depth)
    clf_1.fit(X, y)
    score_train = clf_1.score(X, y)
    list_score_train.append(score_train)

    # Predict on a dense grid; the noiseless sin(x) is the ground truth
    X_test = np.arange(0.0, 5.0, 0.01)[:, np.newaxis]
    y_1 = clf_1.predict(X_test)
    score_test = clf_1.score(X_test, np.sin(X_test))
    list_score_test.append(score_test)

    # Plot the results
    plt.figure()
    plt.scatter(X, y, c="k", label="data")
    plt.plot(X_test, y_1, c="g", label="max_depth=" + str(max_depth), linewidth=2)
    plt.xlabel("data")
    plt.ylabel("target")
    plt.title("Decision Tree Regression. score train = " + str(np.round(score_train, 2)) +
              " score test = " + str(np.round(score_test, 2)))
    plt.legend()
    plt.show()

In [77]:
# Learning curves vs depth: the train score keeps rising while the test
# score stalls once the tree starts overfitting the noisy targets.
plt.plot(list_score_train, label='train set')
plt.plot(list_score_test, label='test set')
plt.xlabel('max_depth')
plt.ylabel('score')
plt.legend()

Out[77]:
<matplotlib.legend.Legend at 0x7f3dc1d94f60>

### b) Classification

In [78]:
# Two overlapping diagonal clouds of 10 points each, offset by 3 on x.
# NOTE: the rng.rand calls are kept in the original order so the data
# is identical for the fixed seed.
rng = np.random.RandomState(13)
d1 = np.asarray((np.arange(10) + rng.rand(10), np.arange(10) + rng.rand(10))).T
d2 = np.asarray((np.arange(3, 13) + rng.rand(10), np.arange(10) + rng.rand(10))).T
X = np.vstack((d1, d2))
y = [0] * d1.shape[0] + [1] * d2.shape[0]
plt.scatter(X[:, 0], X[:, 1], c=y, s=50)

Out[78]:
<matplotlib.collections.PathCollection at 0x7f3dc1f5f780>
In [79]:
# Do not dwell on this function for now.
# BUG FIX: the function body lost its indentation during notebook export,
# making the def a syntax error. Reconstructed below.
def plot_boundary(X, label=None):
    """Draw the decision regions of the classifier over the data.

    NOTE(review): relies on the module-level `clf` fitted in a previous
    cell — it is not a parameter. X is an (n_samples, 2) array; label,
    if given, colors the scattered points.
    """
    plt.figure()
    plot_step = 0.02
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    # Dense grid covering the data, then one prediction per grid cell.
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    cs = plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)
    plt.scatter(X[:, 0], X[:, 1], c=label)

In [80]:
# Vary max_depth until the tree separates the training data perfectly.
# BUG FIX: the loop body lost its indentation during notebook export
# (syntax error as-is). Reconstructed below.
for max_depth in np.arange(1, 10):
    clf = tree.DecisionTreeClassifier(max_depth=max_depth)
    clf = clf.fit(X, y)
    plot_boundary(X, y)
    plt.title('score = ' + str(clf.score(X,y)))
    # Stop as soon as the training set is perfectly classified.
    if clf.score(X,y) == 1.:
        break


### Jouer avec les paramètres de : DecisionTreeClassifier

http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

In [90]:
# Every DecisionTreeClassifier hyper-parameter spelled out at its default
# value — a starting point for experimenting with each one.
clf = tree.DecisionTreeClassifier(
    criterion='gini',
    splitter='best',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features=None,
    random_state=None,
    max_leaf_nodes=None,
)

In [ ]:


In [ ]:


In [ ]:


In [ ]:



### Autre data

In [81]:
from sklearn.datasets import make_moons

# 300 points in two interleaving half-circles with mild Gaussian noise;
# fixed random_state makes the dataset reproducible.
X, y = make_moons(n_samples=300, noise=0.1, random_state=12)

/home/thomas/anaconda3/lib/python3.4/site-packages/sklearn/datasets/samples_generator.py:612: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
y = np.hstack([np.zeros(n_samples_in, dtype=np.intp),

In [82]:
plt.scatter(X[:,0],X[:,1], c = y, s=90)  # visualize the two moons, colored by class label

Out[82]:
<matplotlib.collections.PathCollection at 0x7f3dc0d18780>
In [83]:
# Vary max_depth until the tree separates the moons perfectly.
# BUG FIX: the loop body lost its indentation during notebook export
# (syntax error as-is). Reconstructed below.
for max_depth in np.arange(1, 10):
    clf = tree.DecisionTreeClassifier(max_depth=max_depth)
    clf = clf.fit(X, y)
    plot_boundary(X, y)
    plt.title('score = ' + str(clf.score(X,y)))
    # Stop as soon as the training set is perfectly classified.
    if clf.score(X,y) == 1.:
        break


### Multi class

In [ ]:


In [84]:
# Three overlapping diagonal clouds of 10 points each (labels 0, 1, 2),
# offset by 0, 7 and 3 on the x axis.
# NOTE: the rng.rand calls are kept in the original order so the data
# is identical for the fixed seed.
rng = np.random.RandomState(13)
d1 = np.asarray((np.arange(10) + rng.rand(10), np.arange(10) + rng.rand(10))).T
d2 = np.asarray((np.arange(7, 17) + rng.rand(10), np.arange(10) + rng.rand(10))).T
d3 = np.asarray((np.arange(3, 13) + rng.rand(10), np.arange(10) + rng.rand(10))).T
X = np.vstack((d1, d2, d3))
y = [0] * d1.shape[0] + [1] * d2.shape[0] + [2] * d3.shape[0]
plt.scatter(X[:, 0], X[:, 1], c=y, s=50)

Out[84]:
<matplotlib.collections.PathCollection at 0x7f3dc1a94588>
In [85]:
# Vary max_depth until the tree separates the three classes perfectly.
# BUG FIX: the loop body lost its indentation during notebook export
# (syntax error as-is). Reconstructed below.
for max_depth in np.arange(1, 10):
    clf = tree.DecisionTreeClassifier(max_depth=max_depth)
    clf = clf.fit(X, y)
    plot_boundary(X, y)
    plt.title('score = ' + str(clf.score(X,y)))
    # Stop as soon as the training set is perfectly classified.
    if clf.score(X,y) == 1.:
        break

In [ ]:


In [ ]: