scikit-learn-contrib · pjcpjc · Jul 5, 2017
diff --git a/boruta/examples/boruta_with_more.ipynb b/boruta/examples/boruta_with_more.ipynb
@@ -0,0 +1,649 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "from sklearn.ensemble import RandomForestClassifier\n",
+    "from boruta import BorutaPy\n",
+    "\n",
+    "# Read the feature matrix and the target vector from the example CSV files.\n",
+    "# NOTE: BorutaPy accepts numpy arrays only, hence the .values attribute.\n",
+    "X = pd.read_csv('examples/test_X.csv', index_col=0).values\n",
+    "# flatten the target values into a 1-D vector in the same expression\n",
+    "y = pd.read_csv('examples/test_y.csv', header=None, index_col=0).values.ravel()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# Simple encapsulating class to demonstrate the concept: feature selection\n",
+    "# followed by a model that is trained and queried on the selected features only.\n",
+    "# BorutaPy would be more sklearn-consistent if it implemented these routines directly.\n",
+    "class BorutaWithMore(BorutaPy):\n",
+    "    \"\"\"BorutaPy selector that also fits, scores and predicts with its estimator.\"\"\"\n",
+    "\n",
+    "    def fit(self, X, y):\n",
+    "        \"\"\"Run the Boruta selection on (X, y), then refit the estimator.\n",
+    "\n",
+    "        Returns whatever ``BorutaPy.fit`` returns. Raises ``ValueError`` when\n",
+    "        the selection keeps no columns, since there is nothing to train on.\n",
+    "        \"\"\"\n",
+    "        rtn = super(BorutaWithMore, self).fit(X, y)\n",
+    "        X_selected = self.transform(X)\n",
+    "        if X_selected.shape[1] == 0:\n",
+    "            # fail with a clear message instead of an obscure estimator error\n",
+    "            raise ValueError('Boruta selected no features; cannot fit the estimator.')\n",
+    "        # refit on the selected columns so score/predict see the same feature set\n",
+    "        self.estimator.fit(X_selected, y)\n",
+    "        return rtn\n",
+    "\n",
+    "    def score(self, X, y, sample_weight=None):\n",
+    "        \"\"\"Return the estimator's score computed on the selected columns of X.\"\"\"\n",
+    "        return self.estimator.score(self.transform(X), y, sample_weight)\n",
+    "\n",
+    "    def predict(self, X):\n",
+    "        \"\"\"Predict with the estimator using only the selected columns of X.\"\"\"\n",
+    "        return self.estimator.predict(self.transform(X))\n",
+    "\n",
+    "\n",
+    "# 'balanced' is the replacement for the deprecated class_weight='auto' mode\n",
+    "rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)\n",
+    "\n",
+    "# define Boruta feature selection method\n",
+    "feat_selector = BorutaWithMore(rf, n_estimators='auto', verbose=2, random_state=1)\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "from sklearn.model_selection import train_test_split\n",
+    "\n",
+    "# hold out 25% of the rows for evaluation; the fixed seed makes the split\n",
+    "# (and therefore the reported score) repeatable, like the selector's random_state\n",
+    "train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.25, random_state=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Iteration: \t1 / 100\n",
+      "Confirmed: \t0\n",
+      "Tentative: \t10\n",
+      "Rejected: \t0\n",
+      "Iteration: \t2 / 100\n",
+      "Confirmed: \t0\n",
+      "Tentative: \t10\n",
+      "Rejected: \t0\n",
+      "Iteration: \t3 / 100\n",
+      "Confirmed: \t0\n",
+      "Tentative: \t10\n",
+      "Rejected: \t0\n",
+      "Iteration: \t4 / 100\n",
+      "Confirmed: \t0\n",
+      "Tentative: \t10\n",
+      "Rejected: \t0\n",
+      "Iteration: \t5 / 100\n",
+      "Confirmed: \t0\n",
+      "Tentative: \t10\n",
+      "Rejected: \t0\n",
+      "Iteration: \t6 / 100\n",
+      "Confirmed: \t0\n",
+      "Tentative: \t10\n",
+      "Rejected: \t0\n",
+      "Iteration: \t7 / 100\n",
+      "Confirmed: \t0\n",
+      "Tentative: \t10\n",
+      "Rejected: \t0\n",
+      "Iteration: \t8 / 100\n",
+      "Confirmed: \t5\n",
+      "Tentative: \t2\n",
+      "Rejected: \t3\n",
+      "Iteration: \t9 / 100\n",
+      "Confirmed: \t5\n",
+      "Tentative: \t2\n",
+      "Rejected: \t3\n",
+      "Iteration: \t10 / 100\n",
+      "Confirmed: \t5\n",
+      "Tentative: \t2\n",
+      "Rejected: \t3\n",
+      "Iteration: \t11 / 100\n",
+      "Confirmed: \t5\n",
+      "Tentative: \t2\n",
+      "Rejected: \t3\n",
+      "Iteration: \t12 / 100\n",
+      "Confirmed: \t5\n",
+      "Tentative: \t2\n",
+      "Rejected: \t3\n",
+      "Iteration: \t13 / 100\n",
+      "Confirmed: \t5\n",
+      "Tentative: \t2\n",
+      "Rejected: \t3\n",
+      "Iteration: \t14 / 100\n",
+      "Confirmed: \t5\n",
+      "Tentative: \t2\n",
+      "Rejected: \t3\n",
+      "Iteration: \t15 / 100\n",
+      "Confirmed: \t5\n",
+      "Tentative: \t2\n",
+      "Rejected: \t3\n",
+      "Iteration: \t16 / 100\n",
+      "Confirmed: \t5\n",
+      "Tentative: \t1\n",
+      "Rejected: \t4\n",
+      "Iteration: \t17 / 100\n",
+      "Confirmed: \t5\n",
+      "Tentative: \t1\n",
+      "Rejected: \t4\n",
+      "Iteration: \t18 / 100\n",
+      "Confirmed: \t5\n",
+      "Tentative: \t1\n",
+      "Rejected: \t4\n",
+      "Iteration: \t19 / 100\n",
+      "Confirmed: \t5\n",
+      "Tentative: \t1\n",
+      "Rejected: \t4\n",
+      "Iteration: \t20 / 100\n",
+      "Confirmed: \t5\n",
+      "Tentative: \t1\n",
+      "Rejected: \t4\n",
+      "Iteration: \t21 / 100\n",
+      "Confirmed: \t5\n",
+      "Tentative: \t1\n",
+      "Rejected: \t4\n",
+      "Iteration: \t22 / 100\n",
+      "Confirmed: \t5\n",
+      "Tentative: \t1\n",
+      "Rejected: \t4\n",
+      "Iteration: \t23 / 100\n",
+      "Confirmed: \t5\n",
+      "Tentative: \t1\n",
+      "Rejected: \t4\n",
+      "Iteration: \t24 / 100\n",
+      "Confirmed: \t5\n",
+      "Tentative: \t1\n",
+      "Rejected: \t4\n",
+      "Iteration: \t25 / 100\n",
+      "Confirmed: \t5\n",
+      "Tentative: \t1\n",
+      "Rejected: \t4\n",
+      "Iteration: \t26 / 100\n",
+      "Confirmed: \t5\n",
+      "Tentative: \t1\n",
+      "Rejected: \t4\n",
+      "Iteration: \t27 / 100\n",
+      "Confirmed: \t5\n",
+      "Tentative: \t1\n",
+      "Rejected: \t4\n",
+      "Iteration: \t28 / 100\n",
+      "Confirmed: \t5\n",
+      "Tentative: \t1\n",
+      "Rejected: \t4\n",
+      "Iteration: \t29 / 100\n",
+      "Confirmed: \t5\n",
+      "Tentative: \t1\n",
+      "Rejected: \t4\n",
+      "Iteration: \t30 / 100\n",
+      "Confirmed: \t5\n",
+      "Tentative: \t1\n",
+      "Rejected: \t4\n",
+      "Iteration: \t31 / 100\n",
+      "Confirmed: \t5\n",
+      "Tentative: \t1\n",
+      "Rejected: \t4\n",
+      "Iteration: \t32 / 100\n",
+      "Confirmed: \t5\n",
+      "Tentative: \t1\n",
+      "Rejected: \t4\n",
+      "Iteration: \t33 / 100\n",
+      "Confirmed: \t5\n",
+      "Tentative: \t1\n",
+      "Rejected: \t4\n",
+      "Iteration: \t34 / 100\n",
+      "Confirmed: \t5\n",
+      "Tentative: \t0\n",
+      "Rejected: \t5\n",
+      "\n",
+      "\n",
+      "BorutaPy finished running.\n",
+      "\n",
+      "Iteration: \t35 / 100\n",
+      "Confirmed: \t5\n",
+      "Tentative: \t0\n",
+      "Rejected: \t5\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/petercacioppi/anaconda/lib/python2.7/site-packages/boruta/boruta_py.py:418: RuntimeWarning: invalid value encountered in greater\n",
+      "  hits = np.where(cur_imp[0] > imp_sha_max)[0]\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "BorutaWithMore(alpha=0.05,\n",
+       "        estimator=RandomForestClassifier(bootstrap=True, class_weight='auto', criterion='gini',\n",
+       "            max_depth=5, max_features='auto', max_leaf_nodes=None,\n",
+       "            min_impurity_split=1e-07, min_samples_leaf=1,\n",
+       "            min_samples_split=2, min_weight_fraction_leaf=0.0,\n",
+       "            n_estimators=69, n_jobs=-1, oob_score=False,\n",
+       "            random_state=<mtrand.RandomState object at 0x1156183c0>,\n",
+       "            verbose=0, warm_start=False),\n",
+       "        max_iter=100, n_estimators='auto', perc=100,\n",
+       "        random_state=<mtrand.RandomState object at 0x1156183c0>,\n",
+       "        two_step=True, verbose=2)"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "feat_selector.fit(train_X, train_y)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.98799999999999999"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "feat_selector.score(test_X, test_y)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1,\n",
+       "       1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0,\n",
+       "       1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1,\n",
+       "       1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1,\n",
+       "       1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0,\n",
+       "       1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,\n",
+       "       0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1,\n",
+       "       1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,\n",
+       "       1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1,\n",
+       "       0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0,\n",
+       "       1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1])"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "feat_selector.predict(test_X)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Iteration: \t1 / 100\n",
+      "Confirmed: \t0\n",
+      "Tentative: \t10\n",
+      "Rejected: \t0\n",
+      "Iteration: \t2 / 100\n",
+      "Confirmed: \t0\n",
+      "Tentative: \t10\n",
+      "Rejected: \t0\n",
+      "Iteration: \t3 / 100\n",
+      "Confirmed: \t0\n",
+      "Tentative: \t10\n",
+      "Rejected: \t0\n",
+      "Iteration: \t4 / 100\n",
+      "Confirmed: \t0\n",
+      "Tentative: \t10\n",
+      "Rejected: \t0\n",
+      "Iteration: \t5 / 100\n",
+      "Confirmed: \t0\n",
+      "Tentative: \t10\n",
+      "Rejected: \t0\n",
+      "Iteration: \t6 / 100\n",
+      "Confirmed: \t0\n",
+      "Tentative: \t10\n",
+      "Rejected: \t0\n",
+      "Iteration: \t7 / 100\n",
+      "Confirmed: \t0\n",
+      "Tentative: \t10\n",
+      "Rejected: \t0\n",
+      "Iteration: \t8 / 100\n",
+      "Confirmed: \t5\n",
+      "Tentative: \t0\n",
+      "Rejected: \t5\n",
+      "\n",
+      "\n",
+      "BorutaPy finished running.\n",
+      "\n",
+      "Iteration: \t9 / 100\n",
+      "Confirmed: \t5\n",
+      "Tentative: \t0\n",
+      "Rejected: \t5\n",
+      "Iteration: \t1 / 100\n",
+      "Confirmed: \t0\n",
+      "Tentative: \t10\n",
+      "Rejected: \t0\n",
+      "Iteration: \t2 / 100\n",
+      "Confirmed: \t0\n",
+      "Tentative: \t10\n",
+      "Rejected: \t0\n",
+      "Iteration: \t3 / 100\n",
+      "Confirmed: \t0\n",
+      "Tentative: \t10\n",
+      "Rejected: \t0\n",
+      "Iteration: \t4 / 100\n",
+      "Confirmed: \t0\n",
+      "Tentative: \t10\n",
+      "Rejected: \t0\n",
+      "Iteration: \t5 / 100\n",
+      "Confirmed: \t0\n",
+      "Tentative: \t10\n",
+      "Rejected: \t0\n",
+      "Iteration: \t6 / 100\n",
+      "Confirmed: \t0\n",
+      "Tentative: \t10\n",
+      "Rejected: \t0\n",
+      "Iteration: \t7 / 100\n",
+      "Confirmed: \t0\n",
+      "Tentative: \t10\n",
+      "Rejected: \t0\n",
+      "Iteration: \t8 / 100\n",
+      "Confirmed: \t5\n",
+      "Tentative: \t2\n",
+      "Rejected: \t3\n",
+      "Iteration: \t9 / 100\n",
+      "Confirmed: \t5\n",
+      "Tentative: \t2\n",
+      "Rejected: \t3\n",
+      "Iteration: \t10 / 100\n",
+      "Confirmed: \t5\n",
+      "Tentative: \t2\n",
+      "Rejected: \t3\n",
+      "Iteration: \t11 / 100\n",
+      "Confirmed: \t5\n",
+      "Tentative: \t2\n",
+      "Rejected: \t3\n",
+      "Iteration: \t12 / 100\n",
+      "Confirmed: \t5\n",
+      "Tentative: \t0\n",
+      "Rejected: \t5\n",
+      "\n",
+      "\n",
+      "BorutaPy finished running.\n",
+      "\n",
+      "Iteration: \t13 / 100\n",
+      "Confirmed: \t5\n",
+      "Tentative: \t0\n",
+      "Rejected: \t5\n",
+      "Iteration: \t1 / 100\n",
+      "Confirmed: \t0\n",
+      "Tentative: \t10\n",
+      "Rejected: \t0\n",
+      "Iteration: \t2 / 100\n",
+      "Confirmed: \t0\n",
+      "Tentative: \t10\n",
+      "Rejected: \t0\n",
+      "Iteration: \t3 / 100\n",
+      "Confirmed: \t0\n",
+      "Tentative: \t10\n",
+      "Rejected: \t0\n",
+      "Iteration: \t4 / 100\n",
+      "Confirmed: \t0\n",
+      "Tentative: \t10\n",
+      "Rejected: \t0\n",
+      "Iteration: \t5 / 100\n",
+      "Confirmed: \t0\n",
+      "Tentative: \t10\n",
+      "Rejected: \t0\n",
+      "Iteration: \t6 / 100\n",
+      "Confirmed: \t0\n",
+      "Tentative: \t10\n",
+      "Rejected: \t0\n",
+      "Iteration: \t7 / 100\n",
+      "Confirmed: \t0\n",
+      "Tentative: \t10\n",
+      "Rejected: \t0\n",
+      "Iteration: \t8 / 100\n",
+      "Confirmed: \t5\n",
+      "Tentative: \t0\n",
+      "Rejected: \t5\n",
+      "\n",
+      "\n",
+      "BorutaPy finished running.\n",
+      "\n",
+      "Iteration: \t9 / 100\n",
+      "Confirmed: \t5\n",
+      "Tentative: \t0\n",
+      "Rejected: \t5\n",
+      "Iteration: \t1 / 100\n",
+      "Confirmed: \t0\n",
+      "Tentative: \t10\n",
+      "Rejected: \t0\n",
+      "Iteration: \t2 / 100\n",
+      "Confirmed: \t0\n",
+      "Tentative: \t10\n",
+      "Rejected: \t0\n",
+      "Iteration: \t3 / 100\n",
+      "Confirmed: \t0\n",
+      "Tentative: \t10\n",
+      "Rejected: \t0\n",
+      "Iteration: \t4 / 100\n",
+      "Confirmed: \t0\n",
+      "Tentative: \t10\n",
+      "Rejected: \t0\n",
+      "Iteration: \t5 / 100\n",
+      "Confirmed: \t0\n",
+      "Tentative: \t10\n",
+      "Rejected: \t0\n",
+      "Iteration: \t6 / 100\n",
+      "Confirmed: \t0\n",
+      "Tentative: \t10\n",
+      "Rejected: \t0\n",
+      "Iteration: \t7 / 100\n",
+      "Confirmed: \t0\n",
+      "Tentative: \t10\n",
+      "Rejected: \t0\n",
+      "Iteration: \t8 / 100\n",
+      "Confirmed: \t5\n",
+      "Tentative: \t1\n",
+      "Rejected: \t4\n",
+      "Iteration: \t9 / 100\n",
+      "Confirmed: \t5\n",
+      "Tentative: \t1\n",
+      "Rejected: \t4\n",
+      "Iteration: \t10 / 100\n",
+      "Confirmed: \t5\n",
+      "Tentative: \t1\n",
+      "Rejected: \t4\n",
+      "Iteration: \t11 / 100\n",
+      "Confirmed: \t5\n",
+      "Tentative: \t1\n",
+      "Rejected: \t4\n",
+      "Iteration: \t12 / 100\n",
+      "Confirmed: \t5\n",
+      "Tentative: \t1\n",
+      "Rejected: \t4\n",
+      "Iteration: \t13 / 100\n",
+      "Confirmed: \t5\n",
+      "Tentative: \t1\n",
+      "Rejected: \t4\n",
+      "Iteration: \t14 / 100\n",
+      "Confirmed: \t5\n",
+      "Tentative: \t1\n",
+      "Rejected: \t4\n",
+      "Iteration: \t15 / 100\n",
+      "Confirmed: \t5\n",
+      "Tentative: \t1\n",
+      "Rejected: \t4\n",
+      "Iteration: \t16 / 100\n",
+      "Confirmed: \t5\n",
+      "Tentative: \t0\n",
+      "Rejected: \t5\n",
+      "\n",
+      "\n",
+      "BorutaPy finished running.\n",
+      "\n",
+      "Iteration: \t17 / 100\n",
+      "Confirmed: \t5\n",
+      "Tentative: \t0\n",
+      "Rejected: \t5\n",
+      "Iteration: \t1 / 100\n",
+      "Confirmed: \t0\n",
+      "Tentative: \t10\n",
+      "Rejected: \t0\n",
+      "Iteration: \t2 / 100\n",
+      "Confirmed: \t0\n",
+      "Tentative: \t10\n",
+      "Rejected: \t0\n",
+      "Iteration: \t3 / 100\n",
+      "Confirmed: \t0\n",
+      "Tentative: \t10\n",
+      "Rejected: \t0\n",
+      "Iteration: \t4 / 100\n",
+      "Confirmed: \t0\n",
+      "Tentative: \t10\n",
+      "Rejected: \t0\n",
+      "Iteration: \t5 / 100\n",
+      "Confirmed: \t0\n",
+      "Tentative: \t10\n",
+      "Rejected: \t0\n",
+      "Iteration: \t6 / 100\n",
+      "Confirmed: \t0\n",
+      "Tentative: \t10\n",
+      "Rejected: \t0\n",
+      "Iteration: \t7 / 100\n",
+      "Confirmed: \t0\n",
+      "Tentative: \t10\n",
+      "Rejected: \t0\n",
+      "Iteration: \t8 / 100\n",
+      "Confirmed: \t5\n",
+      "Tentative: \t1\n",
+      "Rejected: \t4\n",
+      "Iteration: \t9 / 100\n",
+      "Confirmed: \t5\n",
+      "Tentative: \t1\n",
+      "Rejected: \t4\n",
+      "Iteration: \t10 / 100\n",
+      "Confirmed: \t5\n",
+      "Tentative: \t1\n",
+      "Rejected: \t4\n",
+      "Iteration: \t11 / 100\n",
+      "Confirmed: \t5\n",
+      "Tentative: \t1\n",
+      "Rejected: \t4\n",
+      "Iteration: \t12 / 100\n",
+      "Confirmed: \t5\n",
+      "Tentative: \t1\n",
+      "Rejected: \t4\n",
+      "Iteration: \t13 / 100\n",
+      "Confirmed: \t5\n",
+      "Tentative: \t1\n",
+      "Rejected: \t4\n",
+      "Iteration: \t14 / 100\n",
+      "Confirmed: \t5\n",
+      "Tentative: \t1\n",
+      "Rejected: \t4\n",
+      "Iteration: \t15 / 100\n",
+      "Confirmed: \t5\n",
+      "Tentative: \t1\n",
+      "Rejected: \t4\n",
+      "Iteration: \t16 / 100\n",
+      "Confirmed: \t5\n",
+      "Tentative: \t0\n",
+      "Rejected: \t5\n",
+      "\n",
+      "\n",
+      "BorutaPy finished running.\n",
+      "\n",
+      "Iteration: \t17 / 100\n",
+      "Confirmed: \t5\n",
+      "Tentative: \t0\n",
+      "Rejected: \t5\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn import model_selection\n",
+    "cv_score = model_selection.cross_val_score(feat_selector, X=X, y=y, cv=5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([ 1.   ,  0.985,  0.98 ,  0.99 ,  0.985])"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "cv_score"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 2",
+   "language": "python",
+   "name": "python2"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}