{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from matplotlib import pyplot as plt\n",
    "import numpy as np\n",
    "\n",
    "# We load the data with load_iris from sklearn\n",
    "from sklearn.datasets import load_iris\n",
    "\n",
    "data = load_iris()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# load_iris returns an object with several fields.\n",
    "# The full data set keeps its own names (`features_all`, `target`) so that\n",
    "# later cells can derive subsets without overwriting what is plotted here.\n",
    "features_all = data.data\n",
    "feature_names = data.feature_names\n",
    "target = data.target\n",
    "target_names = data.target_names\n",
    "\n",
    "# One (colour, marker) pair per class index 0, 1, 2\n",
    "fig, ax = plt.subplots()\n",
    "for t, (c, marker) in enumerate(zip('rgb', '>ox')):\n",
    "    in_class = (target == t)\n",
    "    ax.scatter(features_all[in_class, 0],\n",
    "               features_all[in_class, 1],\n",
    "               marker=marker,\n",
    "               c=c,\n",
    "               label=target_names[t])\n",
    "\n",
    "# Label the axes with the names of the two plotted features\n",
    "ax.set_xlabel(feature_names[0])\n",
    "ax.set_ylabel(feature_names[1])\n",
    "ax.legend()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Maximum of setosa: 1.9.\n",
      "Minimum of others: 3.0.\n"
     ]
    }
   ],
   "source": [
    "# We use NumPy fancy indexing to get an array of strings:\n",
    "labels_all = target_names[target]\n",
    "\n",
    "# The petal length is the feature at position 2\n",
    "plength = features_all[:, 2]\n",
    "\n",
    "# Build an array of booleans:\n",
    "is_setosa = (labels_all == 'setosa')\n",
    "\n",
    "# This is the important step:\n",
    "max_setosa = plength[is_setosa].max()\n",
    "min_non_setosa = plength[~is_setosa].min()\n",
    "print('Maximum of setosa: {0}.'.format(max_setosa))\n",
    "print('Minimum of others: {0}.'.format(min_non_setosa))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# ~ is the boolean negation operator.\n",
    "# The two-class subset is derived from the full arrays instead of\n",
    "# overwriting them, so this cell gives the same result every time it is\n",
    "# run and the cells above keep working afterwards.\n",
    "features = features_all[~is_setosa]\n",
    "labels = labels_all[~is_setosa]\n",
    "# Build a new target variable, is_virginica\n",
    "is_virginica = (labels == 'virginica')"
]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Best acc: 0.94, fi: 3, t: 1.6, reverse: False\n"
     ]
    }
   ],
   "source": [
    "# Check which feature + threshold results in best classification\n",
    "# accuracy\n",
    "\n",
    "# Initialize best_acc to impossibly low value; the other best_* names\n",
    "# start as None so the final print works even if nothing is evaluated.\n",
    "best_acc = -1.0\n",
    "best_fi = best_t = best_reverse = None\n",
    "\n",
    "for fi in range(features.shape[1]):\n",
    "    # Get the vector for feature `fi` once; it does not change while\n",
    "    # its thresholds are scanned.\n",
    "    feature_i = features[:, fi]\n",
    "\n",
    "    # We are going to test all possible thresholds: every observed\n",
    "    # value of this feature, in data order.\n",
    "    for t in feature_i:\n",
    "        # apply threshold `t`\n",
    "        pred = (feature_i > t)\n",
    "        acc = (pred == is_virginica).mean()\n",
    "        # Accuracy of the inverted rule (virginica when value <= t)\n",
    "        rev_acc = (pred == ~is_virginica).mean()\n",
    "\n",
    "        if rev_acc > acc:\n",
    "            reverse = True\n",
    "            acc = rev_acc\n",
    "        else:\n",
    "            reverse = False\n",
    "\n",
    "        # Strict comparison: the first candidate reaching a given\n",
    "        # accuracy is the one that is kept.\n",
    "        if acc > best_acc:\n",
    "            best_acc = acc\n",
    "            best_fi = fi\n",
    "            best_t = t\n",
    "            best_reverse = reverse\n",
    "\n",
    "print(\"Best acc: %s, fi: %s, t: %s, reverse: %s\"\n",
    "      % (best_acc, best_fi, best_t, best_reverse))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# And now we have our classification method\n",
    "def is_virginica_test(fi, t, reverse, example):\n",
    "    \"\"\"Apply threshold model to a new example.\n",
    "\n",
    "    fi      -- index of the feature to compare\n",
    "    t       -- threshold the feature value is compared against\n",
    "    reverse -- if true, invert the outcome of the comparison\n",
    "    example -- indexable collection of feature values for one sample\n",
    "\n",
    "    Returns a plain Python bool: `example[fi] > t`, negated when\n",
    "    `reverse` is true.\n",
    "    \"\"\"\n",
    "    test = example[fi] > t\n",
    "    if reverse:\n",
    "        test = not test\n",
    "    # `>` on a NumPy scalar yields numpy.bool_ while `not` yields bool;\n",
    "    # normalise so both branches return the same type.\n",
    "    return bool(test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter":
"python", "pygments_lexer": "ipython2", "version": "2.7.11" } }, "nbformat": 4, "nbformat_minor": 0 }