From b43cac50b0d26655a92dd2ce2b681890cc55a2af Mon Sep 17 00:00:00 2001 From: Pete Cacioppi Date: Tue, 4 Jul 2017 17:29:47 -0700 Subject: [PATCH] simple demo-the-concept notebook My point is that with a few simple code additions, BorutaPy could implement nearly the same interface as RFE (and the other predictors) and thus be more sklearn consistent. --- boruta/examples/boruta_with_more.ipynb | 649 +++++++++++++++++++++++++ 1 file changed, 649 insertions(+) create mode 100644 boruta/examples/boruta_with_more.ipynb diff --git a/boruta/examples/boruta_with_more.ipynb b/boruta/examples/boruta_with_more.ipynb new file mode 100644 index 0000000..6c7dc7a --- /dev/null +++ b/boruta/examples/boruta_with_more.ipynb @@ -0,0 +1,649 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from boruta import BorutaPy\n", + "\n", + "# load X and y\n", + "# NOTE BorutaPy accepts numpy arrays only, hence the .values attribute\n", + "X = pd.read_csv('examples/test_X.csv', index_col=0).values\n", + "y = pd.read_csv('examples/test_y.csv', header=None, index_col=0).values\n", + "y = y.ravel()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# simple encapsulating class to demonstrate the concept\n", + "# BorutaPy would be more sklearn consistent if it implemented these routines directly\n", + "class BorutaWithMore(BorutaPy):\n", + " def fit(self, X, y):\n", + " rtn = super(BorutaWithMore, self).fit(X, y)\n", + " self.estimator.fit(self.transform(X), y)\n", + " return rtn\n", + " def score(self, X, y, sample_weight=None):\n", + " return self.estimator.score(self.transform(X), y, sample_weight)\n", + " def predict(self, X):\n", + " return self.estimator.predict(self.transform(X))\n", + " \n", + "\n", + "rf = RandomForestClassifier(n_jobs=-1, class_weight='auto', max_depth=5)\n", + "\n", + "# define Boruta feature selection method\n", + "feat_selector = BorutaWithMore(rf, n_estimators='auto', verbose=2, random_state=1)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.25)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Iteration: \t1 / 100\n", + "Confirmed: \t0\n", + "Tentative: \t10\n", + "Rejected: \t0\n", + "Iteration: \t2 / 100\n", + "Confirmed: \t0\n", + "Tentative: \t10\n", + "Rejected: \t0\n", + "Iteration: \t3 / 100\n", + "Confirmed: \t0\n", + "Tentative: \t10\n", + "Rejected: \t0\n", + "Iteration: \t4 / 100\n", + "Confirmed: \t0\n", + "Tentative: \t10\n", + "Rejected: \t0\n", + "Iteration: \t5 / 100\n", + "Confirmed: \t0\n", + "Tentative: \t10\n", + "Rejected: \t0\n", + "Iteration: \t6 / 100\n", + "Confirmed: \t0\n", + "Tentative: \t10\n", + "Rejected: \t0\n", + "Iteration: \t7 / 100\n", + "Confirmed: \t0\n", + "Tentative: \t10\n", + "Rejected: \t0\n", + "Iteration: \t8 / 100\n", + "Confirmed: \t5\n", + "Tentative: \t2\n", + "Rejected: \t3\n", + "Iteration: \t9 / 100\n", + "Confirmed: \t5\n", + "Tentative: \t2\n", + "Rejected: \t3\n", + "Iteration: \t10 / 100\n", + "Confirmed: \t5\n", + "Tentative: \t2\n", + "Rejected: \t3\n", + "Iteration: \t11 / 100\n", + "Confirmed: \t5\n", + "Tentative: \t2\n", + "Rejected: \t3\n", + "Iteration: \t12 / 100\n", + "Confirmed: \t5\n", + "Tentative: \t2\n", + "Rejected: \t3\n", + "Iteration: \t13 / 100\n", + "Confirmed: \t5\n", + "Tentative: \t2\n", + "Rejected: \t3\n", + "Iteration: \t14 / 100\n", + "Confirmed: \t5\n", + "Tentative: \t2\n", + "Rejected: \t3\n", + "Iteration: \t15 / 100\n", + "Confirmed: \t5\n", + "Tentative: \t2\n", + "Rejected: \t3\n", + "Iteration: \t16 / 100\n", + "Confirmed: \t5\n", + "Tentative: \t1\n", + "Rejected: \t4\n", + "Iteration: \t17 / 100\n", + "Confirmed: \t5\n", + "Tentative: \t1\n", + "Rejected: \t4\n", + "Iteration: \t18 / 100\n", + "Confirmed: \t5\n", + "Tentative: \t1\n", + "Rejected: \t4\n", + "Iteration: \t19 / 100\n", + "Confirmed: \t5\n", + "Tentative: \t1\n", + "Rejected: \t4\n", + "Iteration: \t20 / 100\n", + "Confirmed: \t5\n", + "Tentative: \t1\n", + "Rejected: \t4\n", + "Iteration: \t21 / 100\n", + "Confirmed: \t5\n", + "Tentative: \t1\n", + "Rejected: \t4\n", + "Iteration: \t22 / 100\n", + "Confirmed: \t5\n", + "Tentative: \t1\n", + "Rejected: \t4\n", + "Iteration: \t23 / 100\n", + "Confirmed: \t5\n", + "Tentative: \t1\n", + "Rejected: \t4\n", + "Iteration: \t24 / 100\n", + "Confirmed: \t5\n", + "Tentative: \t1\n", + "Rejected: \t4\n", + "Iteration: \t25 / 100\n", + "Confirmed: \t5\n", + "Tentative: \t1\n", + "Rejected: \t4\n", + "Iteration: \t26 / 100\n", + "Confirmed: \t5\n", + "Tentative: \t1\n", + "Rejected: \t4\n", + "Iteration: \t27 / 100\n", + "Confirmed: \t5\n", + "Tentative: \t1\n", + "Rejected: \t4\n", + "Iteration: \t28 / 100\n", + "Confirmed: \t5\n", + "Tentative: \t1\n", + "Rejected: \t4\n", + "Iteration: \t29 / 100\n", + "Confirmed: \t5\n", + "Tentative: \t1\n", + "Rejected: \t4\n", + "Iteration: \t30 / 100\n", + "Confirmed: \t5\n", + "Tentative: \t1\n", + "Rejected: \t4\n", + "Iteration: \t31 / 100\n", + "Confirmed: \t5\n", + "Tentative: \t1\n", + "Rejected: \t4\n", + "Iteration: \t32 / 100\n", + "Confirmed: \t5\n", + "Tentative: \t1\n", + "Rejected: \t4\n", + "Iteration: \t33 / 100\n", + "Confirmed: \t5\n", + "Tentative: \t1\n", + "Rejected: \t4\n", + "Iteration: \t34 / 100\n", + "Confirmed: \t5\n", + "Tentative: \t0\n", + "Rejected: \t5\n", + "\n", + "\n", + "BorutaPy finished running.\n", + "\n", + "Iteration: \t35 / 100\n", + "Confirmed: \t5\n", + "Tentative: \t0\n", + "Rejected: \t5\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/petercacioppi/anaconda/lib/python2.7/site-packages/boruta/boruta_py.py:418: RuntimeWarning: invalid value encountered in greater\n", + " hits = np.where(cur_imp[0] > imp_sha_max)[0]\n" + ] + }, + { + "data": { + "text/plain": [ + "BorutaWithMore(alpha=0.05,\n", + " estimator=RandomForestClassifier(bootstrap=True, class_weight='auto', criterion='gini',\n", + " max_depth=5, max_features='auto', max_leaf_nodes=None,\n", + " min_impurity_split=1e-07, min_samples_leaf=1,\n", + " min_samples_split=2, min_weight_fraction_leaf=0.0,\n", + " n_estimators=69, n_jobs=-1, oob_score=False,\n", + " random_state=,\n", + " verbose=0, warm_start=False),\n", + " max_iter=100, n_estimators='auto', perc=100,\n", + " random_state=,\n", + " two_step=True, verbose=2)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "feat_selector.fit(train_X, train_y)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.98799999999999999" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "feat_selector.score(test_X, test_y)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1,\n", + " 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0,\n", + " 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1,\n", + " 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1,\n", + " 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0,\n", + " 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,\n", + " 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1,\n", + " 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,\n", + " 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1,\n", + " 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0,\n", + " 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1])" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "feat_selector.predict(test_X)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Iteration: \t1 / 100\n", + "Confirmed: \t0\n", + "Tentative: \t10\n", + "Rejected: \t0\n", + "Iteration: \t2 / 100\n", + "Confirmed: \t0\n", + "Tentative: \t10\n", + "Rejected: \t0\n", + "Iteration: \t3 / 100\n", + "Confirmed: \t0\n", + "Tentative: \t10\n", + "Rejected: \t0\n", + "Iteration: \t4 / 100\n", + "Confirmed: \t0\n", + "Tentative: \t10\n", + "Rejected: \t0\n", + "Iteration: \t5 / 100\n", + "Confirmed: \t0\n", + "Tentative: \t10\n", + "Rejected: \t0\n", + "Iteration: \t6 / 100\n", + "Confirmed: \t0\n", + "Tentative: \t10\n", + "Rejected: \t0\n", + "Iteration: \t7 / 100\n", + "Confirmed: \t0\n", + "Tentative: \t10\n", + "Rejected: \t0\n", + "Iteration: \t8 / 100\n", + "Confirmed: \t5\n", + "Tentative: \t0\n", + "Rejected: \t5\n", + "\n", + "\n", + "BorutaPy finished running.\n", + "\n", + "Iteration: \t9 / 100\n", + "Confirmed: \t5\n", + "Tentative: \t0\n", + "Rejected: \t5\n", + "Iteration: \t1 / 100\n", + "Confirmed: \t0\n", + "Tentative: \t10\n", + "Rejected: \t0\n", + "Iteration: \t2 / 100\n", + "Confirmed: \t0\n", + "Tentative: \t10\n", + "Rejected: \t0\n", + "Iteration: \t3 / 100\n", + "Confirmed: \t0\n", + "Tentative: \t10\n", + "Rejected: \t0\n", + "Iteration: \t4 / 100\n", + "Confirmed: \t0\n", + "Tentative: \t10\n", + "Rejected: \t0\n", + "Iteration: \t5 / 100\n", + "Confirmed: \t0\n", + "Tentative: \t10\n", + "Rejected: \t0\n", + "Iteration: \t6 / 100\n", + "Confirmed: \t0\n", + "Tentative: \t10\n", + "Rejected: \t0\n", + "Iteration: \t7 / 100\n", + "Confirmed: \t0\n", + "Tentative: \t10\n", + "Rejected: \t0\n", + "Iteration: \t8 / 100\n", + "Confirmed: \t5\n", + "Tentative: \t2\n", + "Rejected: \t3\n", + "Iteration: \t9 / 100\n", + "Confirmed: \t5\n", + "Tentative: \t2\n", + "Rejected: \t3\n", + "Iteration: \t10 / 100\n", + "Confirmed: \t5\n", + "Tentative: \t2\n", + "Rejected: \t3\n", + "Iteration: \t11 / 100\n", + "Confirmed: \t5\n", + "Tentative: \t2\n", + "Rejected: \t3\n", + "Iteration: \t12 / 100\n", + "Confirmed: \t5\n", + "Tentative: \t0\n", + "Rejected: \t5\n", + "\n", + "\n", + "BorutaPy finished running.\n", + "\n", + "Iteration: \t13 / 100\n", + "Confirmed: \t5\n", + "Tentative: \t0\n", + "Rejected: \t5\n", + "Iteration: \t1 / 100\n", + "Confirmed: \t0\n", + "Tentative: \t10\n", + "Rejected: \t0\n", + "Iteration: \t2 / 100\n", + "Confirmed: \t0\n", + "Tentative: \t10\n", + "Rejected: \t0\n", + "Iteration: \t3 / 100\n", + "Confirmed: \t0\n", + "Tentative: \t10\n", + "Rejected: \t0\n", + "Iteration: \t4 / 100\n", + "Confirmed: \t0\n", + "Tentative: \t10\n", + "Rejected: \t0\n", + "Iteration: \t5 / 100\n", + "Confirmed: \t0\n", + "Tentative: \t10\n", + "Rejected: \t0\n", + "Iteration: \t6 / 100\n", + "Confirmed: \t0\n", + "Tentative: \t10\n", + "Rejected: \t0\n", + "Iteration: \t7 / 100\n", + "Confirmed: \t0\n", + "Tentative: \t10\n", + "Rejected: \t0\n", + "Iteration: \t8 / 100\n", + "Confirmed: \t5\n", + "Tentative: \t0\n", + "Rejected: \t5\n", + "\n", + "\n", + "BorutaPy finished running.\n", + "\n", + "Iteration: \t9 / 100\n", + "Confirmed: \t5\n", + "Tentative: \t0\n", + "Rejected: \t5\n", + "Iteration: \t1 / 100\n", + "Confirmed: \t0\n", + "Tentative: \t10\n", + "Rejected: \t0\n", + "Iteration: \t2 / 100\n", + "Confirmed: \t0\n", + "Tentative: \t10\n", + "Rejected: \t0\n", + "Iteration: \t3 / 100\n", + "Confirmed: \t0\n", + "Tentative: \t10\n", + "Rejected: \t0\n", + "Iteration: \t4 / 100\n", + "Confirmed: \t0\n", + "Tentative: \t10\n", + "Rejected: \t0\n", + "Iteration: \t5 / 100\n", + "Confirmed: \t0\n", + "Tentative: \t10\n", + "Rejected: \t0\n", + "Iteration: \t6 / 100\n", + "Confirmed: \t0\n", + "Tentative: \t10\n", + "Rejected: \t0\n", + "Iteration: \t7 / 100\n", + "Confirmed: \t0\n", + "Tentative: \t10\n", + "Rejected: \t0\n", + "Iteration: \t8 / 100\n", + "Confirmed: \t5\n", + "Tentative: \t1\n", + "Rejected: \t4\n", + "Iteration: \t9 / 100\n", + "Confirmed: \t5\n", + "Tentative: \t1\n", + "Rejected: \t4\n", + "Iteration: \t10 / 100\n", + "Confirmed: \t5\n", + "Tentative: \t1\n", + "Rejected: \t4\n", + "Iteration: \t11 / 100\n", + "Confirmed: \t5\n", + "Tentative: \t1\n", + "Rejected: \t4\n", + "Iteration: \t12 / 100\n", + "Confirmed: \t5\n", + "Tentative: \t1\n", + "Rejected: \t4\n", + "Iteration: \t13 / 100\n", + "Confirmed: \t5\n", + "Tentative: \t1\n", + "Rejected: \t4\n", + "Iteration: \t14 / 100\n", + "Confirmed: \t5\n", + "Tentative: \t1\n", + "Rejected: \t4\n", + "Iteration: \t15 / 100\n", + "Confirmed: \t5\n", + "Tentative: \t1\n", + "Rejected: \t4\n", + "Iteration: \t16 / 100\n", + "Confirmed: \t5\n", + "Tentative: \t0\n", + "Rejected: \t5\n", + "\n", + "\n", + "BorutaPy finished running.\n", + "\n", + "Iteration: \t17 / 100\n", + "Confirmed: \t5\n", + "Tentative: \t0\n", + "Rejected: \t5\n", + "Iteration: \t1 / 100\n", + "Confirmed: \t0\n", + "Tentative: \t10\n", + "Rejected: \t0\n", + "Iteration: \t2 / 100\n", + "Confirmed: \t0\n", + "Tentative: \t10\n", + "Rejected: \t0\n", + "Iteration: \t3 / 100\n", + "Confirmed: \t0\n", + "Tentative: \t10\n", + "Rejected: \t0\n", + "Iteration: \t4 / 100\n", + "Confirmed: \t0\n", + "Tentative: \t10\n", + "Rejected: \t0\n", + "Iteration: \t5 / 100\n", + "Confirmed: \t0\n", + "Tentative: \t10\n", + "Rejected: \t0\n", + "Iteration: \t6 / 100\n", + "Confirmed: \t0\n", + "Tentative: \t10\n", + "Rejected: \t0\n", + "Iteration: \t7 / 100\n", + "Confirmed: \t0\n", + "Tentative: \t10\n", + "Rejected: \t0\n", + "Iteration: \t8 / 100\n", + "Confirmed: \t5\n", + "Tentative: \t1\n", + "Rejected: \t4\n", + "Iteration: \t9 / 100\n", + "Confirmed: \t5\n", + "Tentative: \t1\n", + "Rejected: \t4\n", + "Iteration: \t10 / 100\n", + "Confirmed: \t5\n", + "Tentative: \t1\n", + "Rejected: \t4\n", + "Iteration: \t11 / 100\n", + "Confirmed: \t5\n", + "Tentative: \t1\n", + "Rejected: \t4\n", + "Iteration: \t12 / 100\n", + "Confirmed: \t5\n", + "Tentative: \t1\n", + "Rejected: \t4\n", + "Iteration: \t13 / 100\n", + "Confirmed: \t5\n", + "Tentative: \t1\n", + "Rejected: \t4\n", + "Iteration: \t14 / 100\n", + "Confirmed: \t5\n", + "Tentative: \t1\n", + "Rejected: \t4\n", + "Iteration: \t15 / 100\n", + "Confirmed: \t5\n", + "Tentative: \t1\n", + "Rejected: \t4\n", + "Iteration: \t16 / 100\n", + "Confirmed: \t5\n", + "Tentative: \t0\n", + "Rejected: \t5\n", + "\n", + "\n", + "BorutaPy finished running.\n", + "\n", + "Iteration: \t17 / 100\n", + "Confirmed: \t5\n", + "Tentative: \t0\n", + "Rejected: \t5\n" + ] + } + ], + "source": [ + "from sklearn import model_selection\n", + "cv_score = model_selection.cross_val_score(feat_selector, X=X, y=y, cv=5)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([ 1. , 0.985, 0.98 , 0.99 , 0.985])" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cv_score" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.11" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}