# This code is supporting material for the book # Building Machine Learning Systems with Python # by Willi Richert and Luis Pedro Coelho # published by PACKT Publishing # # It is made available under the MIT License # # This script trains tries to tweak hyperparameters to improve P/R AUC # import time start_time = time.time() import numpy as np from sklearn.metrics import precision_recall_curve, roc_curve, auc from sklearn.cross_validation import ShuffleSplit from utils import plot_pr from utils import load_sanders_data from utils import tweak_labels from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.pipeline import Pipeline from sklearn.grid_search import GridSearchCV from sklearn.metrics import f1_score from sklearn.naive_bayes import MultinomialNB phase = "02" def create_ngram_model(params=None): tfidf_ngrams = TfidfVectorizer(ngram_range=(1, 3), analyzer="word", binary=False) clf = MultinomialNB() pipeline = Pipeline([('vect', tfidf_ngrams), ('clf', clf)]) if params: pipeline.set_params(**params) return pipeline def grid_search_model(clf_factory, X, Y): cv = ShuffleSplit( n=len(X), n_iter=10, test_size=0.3, random_state=0) param_grid = dict(vect__ngram_range=[(1, 1), (1, 2), (1, 3)], vect__min_df=[1, 2], vect__stop_words=[None, "english"], vect__smooth_idf=[False, True], vect__use_idf=[False, True], vect__sublinear_tf=[False, True], vect__binary=[False, True], clf__alpha=[0, 0.01, 0.05, 0.1, 0.5, 1], ) grid_search = GridSearchCV(clf_factory(), param_grid=param_grid, cv=cv, score_func=f1_score, verbose=10) grid_search.fit(X, Y) clf = grid_search.best_estimator_ print(clf) return clf def train_model(clf, X, Y, name="NB ngram", plot=False): # create it again for plotting cv = ShuffleSplit( n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0) train_errors = [] test_errors = [] scores = [] pr_scores = [] precisions, recalls, thresholds = [], [], [] for train, test in cv: X_train, y_train = X[train], Y[train] X_test, y_test = X[test], Y[test] clf.fit(X_train, y_train) train_score = clf.score(X_train, y_train) test_score = clf.score(X_test, y_test) train_errors.append(1 - train_score) test_errors.append(1 - test_score) scores.append(test_score) proba = clf.predict_proba(X_test) fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1]) precision, recall, pr_thresholds = precision_recall_curve( y_test, proba[:, 1]) pr_scores.append(auc(recall, precision)) precisions.append(precision) recalls.append(recall) thresholds.append(pr_thresholds) if plot: scores_to_sort = pr_scores median = np.argsort(scores_to_sort)[len(scores_to_sort) / 2] plot_pr(pr_scores[median], name, phase, precisions[median], recalls[median], label=name) summary = (np.mean(scores), np.std(scores), np.mean(pr_scores), np.std(pr_scores)) print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary) return np.mean(train_errors), np.mean(test_errors) def print_incorrect(clf, X, Y): Y_hat = clf.predict(X) wrong_idx = Y_hat != Y X_wrong = X[wrong_idx] Y_wrong = Y[wrong_idx] Y_hat_wrong = Y_hat[wrong_idx] for idx in range(len(X_wrong)): print("clf.predict('%s')=%i instead of %i" % (X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx])) def get_best_model(): best_params = dict(vect__ngram_range=(1, 2), vect__min_df=1, vect__stop_words=None, vect__smooth_idf=False, vect__use_idf=False, vect__sublinear_tf=True, vect__binary=False, clf__alpha=0.01, ) best_clf = create_ngram_model(best_params) return best_clf if __name__ == "__main__": X_orig, Y_orig = load_sanders_data() classes = np.unique(Y_orig) for c in classes: print("#%s: %i" % (c, sum(Y_orig == c))) print("== Pos vs. neg ==") pos_neg = np.logical_or(Y_orig == "positive", Y_orig == "negative") X = X_orig[pos_neg] Y = Y_orig[pos_neg] Y = tweak_labels(Y, ["positive"]) train_model(get_best_model(), X, Y, name="pos vs neg", plot=True) print("== Pos/neg vs. irrelevant/neutral ==") X = X_orig Y = tweak_labels(Y_orig, ["positive", "negative"]) # best_clf = grid_search_model(create_ngram_model, X, Y, name="sent vs # rest", plot=True) train_model(get_best_model(), X, Y, name="pos vs neg", plot=True) print("== Pos vs. rest ==") X = X_orig Y = tweak_labels(Y_orig, ["positive"]) train_model(get_best_model(), X, Y, name="pos vs rest", plot=True) print("== Neg vs. rest ==") X = X_orig Y = tweak_labels(Y_orig, ["negative"]) train_model(get_best_model(), X, Y, name="neg vs rest", plot=True) print("time spent:", time.time() - start_time)