diff --git a/feature_selection/find_signature.py b/feature_selection/find_signature.py
index c01a1f2111a..4e63f4d2967 100644
--- a/feature_selection/find_signature.py
+++ b/feature_selection/find_signature.py
@@ -19,8 +19,8 @@
 ### remainder go into training)
 ### feature matrices changed to dense representations for compatibility with
 ### classifier functions in versions 0.15.2 and earlier
-from sklearn import cross_validation
-features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(word_data, authors, test_size=0.1, random_state=42)
+from sklearn import model_selection
+features_train, features_test, labels_train, labels_test = model_selection.train_test_split(word_data, authors, test_size=0.1, random_state=42)
 
 from sklearn.feature_extraction.text import TfidfVectorizer
 vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
diff --git a/outliers/outlier_removal_regression.py b/outliers/outlier_removal_regression.py
index d509cd9f22f..789eb7c139a 100644
--- a/outliers/outlier_removal_regression.py
+++ b/outliers/outlier_removal_regression.py
@@ -20,7 +20,7 @@
 ### and n_columns is the number of features
 ages = numpy.reshape( numpy.array(ages), (len(ages), 1))
 net_worths = numpy.reshape( numpy.array(net_worths), (len(net_worths), 1))
-from sklearn.cross_validation import train_test_split
+from sklearn.model_selection import train_test_split
 ages_train, ages_test, net_worths_train, net_worths_test = train_test_split(ages, net_worths, test_size=0.1, random_state=42)
 
 ### fill in a regression here! Name the regression object reg so that
diff --git a/pca/eigenfaces.py b/pca/eigenfaces.py
index 074b860a253..b9ad1ccb9f2 100644
--- a/pca/eigenfaces.py
+++ b/pca/eigenfaces.py
@@ -23,12 +23,12 @@
 import pylab as pl
 import numpy as np
 
-from sklearn.cross_validation import train_test_split
+from sklearn.model_selection import train_test_split
 from sklearn.datasets import fetch_lfw_people
-from sklearn.grid_search import GridSearchCV
+from sklearn.model_selection import GridSearchCV
 from sklearn.metrics import classification_report
 from sklearn.metrics import confusion_matrix
-from sklearn.decomposition import RandomizedPCA
+from sklearn.decomposition import PCA
 from sklearn.svm import SVC
 
 # Display progress logs on stdout
@@ -70,7 +70,10 @@
 
 print "Extracting the top %d eigenfaces from %d faces" % (n_components, X_train.shape[0])
 t0 = time()
-pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)
+pca = PCA(
+    n_components=n_components,
+    svd_solver='randomized',
+    whiten=True).fit(X_train)
 print "done in %0.3fs" % (time() - t0)
 
 eigenfaces = pca.components_.reshape((n_components, h, w))
diff --git a/tools/startup.py b/tools/startup.py
index 4638e0d115e..c3d2bc521ab 100644
--- a/tools/startup.py
+++ b/tools/startup.py
@@ -32,7 +32,22 @@
 print "download will complete at about 423 MB"
 import urllib
 url = "https://www.cs.cmu.edu/~./enron/enron_mail_20150507.tar.gz"
-urllib.urlretrieve(url, filename="../enron_mail_20150507.tar.gz")
+filename = "../enron_mail_20150507.tar.gz"
+try:
+    urllib.urlretrieve(url, filename=filename)
+except IOError as socket_error:
+    expected_error = (
+        "IOError('socket error', SSLError(1, u'[SSL: DH_KEY_TOO_SMALL]"+
+        " dh key too small (_ssl.c:727)'))"
+    )
+    if repr(socket_error) == expected_error:
+        import ssl
+        cipher = "ECDHE-RSA-AES128-GCM-SHA256"
+        context = ssl.create_default_context()
+        context.set_ciphers(cipher)
+        urllib.urlretrieve(url, filename=filename, context=context)
+    else:
+        raise socket_error
 print "download complete!"
 
 