Skip to content

simple demo-the-concept notebook #27

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
649 changes: 649 additions & 0 deletions boruta/examples/boruta_with_more.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,649 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from boruta import BorutaPy\n",
"\n",
"# Read the features (X) and the target (y) from the example CSV files.\n",
"# BorutaPy accepts numpy arrays only, so each DataFrame is unwrapped with .values.\n",
"X = pd.read_csv('examples/test_X.csv', index_col=0).values\n",
"\n",
"# ravel() flattens the target into a 1-D array\n",
"y = (\n",
"    pd.read_csv('examples/test_y.csv', header=None, index_col=0)\n",
"    .values\n",
"    .ravel()\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"class BorutaWithMore(BorutaPy):\n",
"    \"\"\"BorutaPy selector that can also score and predict.\n",
"\n",
"    Simple encapsulating class to demonstrate the concept: BorutaPy would be\n",
"    more consistent with scikit-learn if it implemented these routines itself.\n",
"    After the Boruta selection has run, the wrapped estimator is refit on the\n",
"    selected features, so score() and predict() can be called on the selector.\n",
"    \"\"\"\n",
"\n",
"    def fit(self, X, y):\n",
"        \"\"\"Run the Boruta selection, then refit the estimator on the kept features.\n",
"\n",
"        Returns whatever BorutaPy.fit returns.\n",
"        Raises ValueError if the selection keeps no feature at all.\n",
"        \"\"\"\n",
"        rtn = super(BorutaWithMore, self).fit(X, y)\n",
"        X_selected = self.transform(X)\n",
"        # fail early with a clear message rather than an opaque estimator error\n",
"        if X_selected.shape[1] == 0:\n",
"            raise ValueError('Boruta did not confirm any feature; '\n",
"                             'the estimator cannot be fit on an empty feature set.')\n",
"        self.estimator.fit(X_selected, y)\n",
"        return rtn\n",
"\n",
"    def score(self, X, y, sample_weight=None):\n",
"        \"\"\"Score the refit estimator on the selected features of X.\"\"\"\n",
"        return self.estimator.score(self.transform(X), y, sample_weight=sample_weight)\n",
"\n",
"    def predict(self, X):\n",
"        \"\"\"Predict with the refit estimator from the selected features of X.\"\"\"\n",
"        return self.estimator.predict(self.transform(X))\n",
"\n",
"\n",
"# 'balanced' replaces class_weight='auto', which scikit-learn deprecated and later removed\n",
"rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)\n",
"\n",
"# define Boruta feature selection method\n",
"feat_selector = BorutaWithMore(rf, n_estimators='auto', verbose=2, random_state=1)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"# hold out 25% of the samples; the fixed seed makes the split reproducible across runs\n",
"train_X, test_X, train_y, test_y = train_test_split(\n",
"    X, y, test_size=0.25, random_state=1\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Iteration: \t1 / 100\n",
"Confirmed: \t0\n",
"Tentative: \t10\n",
"Rejected: \t0\n",
"Iteration: \t2 / 100\n",
"Confirmed: \t0\n",
"Tentative: \t10\n",
"Rejected: \t0\n",
"Iteration: \t3 / 100\n",
"Confirmed: \t0\n",
"Tentative: \t10\n",
"Rejected: \t0\n",
"Iteration: \t4 / 100\n",
"Confirmed: \t0\n",
"Tentative: \t10\n",
"Rejected: \t0\n",
"Iteration: \t5 / 100\n",
"Confirmed: \t0\n",
"Tentative: \t10\n",
"Rejected: \t0\n",
"Iteration: \t6 / 100\n",
"Confirmed: \t0\n",
"Tentative: \t10\n",
"Rejected: \t0\n",
"Iteration: \t7 / 100\n",
"Confirmed: \t0\n",
"Tentative: \t10\n",
"Rejected: \t0\n",
"Iteration: \t8 / 100\n",
"Confirmed: \t5\n",
"Tentative: \t2\n",
"Rejected: \t3\n",
"Iteration: \t9 / 100\n",
"Confirmed: \t5\n",
"Tentative: \t2\n",
"Rejected: \t3\n",
"Iteration: \t10 / 100\n",
"Confirmed: \t5\n",
"Tentative: \t2\n",
"Rejected: \t3\n",
"Iteration: \t11 / 100\n",
"Confirmed: \t5\n",
"Tentative: \t2\n",
"Rejected: \t3\n",
"Iteration: \t12 / 100\n",
"Confirmed: \t5\n",
"Tentative: \t2\n",
"Rejected: \t3\n",
"Iteration: \t13 / 100\n",
"Confirmed: \t5\n",
"Tentative: \t2\n",
"Rejected: \t3\n",
"Iteration: \t14 / 100\n",
"Confirmed: \t5\n",
"Tentative: \t2\n",
"Rejected: \t3\n",
"Iteration: \t15 / 100\n",
"Confirmed: \t5\n",
"Tentative: \t2\n",
"Rejected: \t3\n",
"Iteration: \t16 / 100\n",
"Confirmed: \t5\n",
"Tentative: \t1\n",
"Rejected: \t4\n",
"Iteration: \t17 / 100\n",
"Confirmed: \t5\n",
"Tentative: \t1\n",
"Rejected: \t4\n",
"Iteration: \t18 / 100\n",
"Confirmed: \t5\n",
"Tentative: \t1\n",
"Rejected: \t4\n",
"Iteration: \t19 / 100\n",
"Confirmed: \t5\n",
"Tentative: \t1\n",
"Rejected: \t4\n",
"Iteration: \t20 / 100\n",
"Confirmed: \t5\n",
"Tentative: \t1\n",
"Rejected: \t4\n",
"Iteration: \t21 / 100\n",
"Confirmed: \t5\n",
"Tentative: \t1\n",
"Rejected: \t4\n",
"Iteration: \t22 / 100\n",
"Confirmed: \t5\n",
"Tentative: \t1\n",
"Rejected: \t4\n",
"Iteration: \t23 / 100\n",
"Confirmed: \t5\n",
"Tentative: \t1\n",
"Rejected: \t4\n",
"Iteration: \t24 / 100\n",
"Confirmed: \t5\n",
"Tentative: \t1\n",
"Rejected: \t4\n",
"Iteration: \t25 / 100\n",
"Confirmed: \t5\n",
"Tentative: \t1\n",
"Rejected: \t4\n",
"Iteration: \t26 / 100\n",
"Confirmed: \t5\n",
"Tentative: \t1\n",
"Rejected: \t4\n",
"Iteration: \t27 / 100\n",
"Confirmed: \t5\n",
"Tentative: \t1\n",
"Rejected: \t4\n",
"Iteration: \t28 / 100\n",
"Confirmed: \t5\n",
"Tentative: \t1\n",
"Rejected: \t4\n",
"Iteration: \t29 / 100\n",
"Confirmed: \t5\n",
"Tentative: \t1\n",
"Rejected: \t4\n",
"Iteration: \t30 / 100\n",
"Confirmed: \t5\n",
"Tentative: \t1\n",
"Rejected: \t4\n",
"Iteration: \t31 / 100\n",
"Confirmed: \t5\n",
"Tentative: \t1\n",
"Rejected: \t4\n",
"Iteration: \t32 / 100\n",
"Confirmed: \t5\n",
"Tentative: \t1\n",
"Rejected: \t4\n",
"Iteration: \t33 / 100\n",
"Confirmed: \t5\n",
"Tentative: \t1\n",
"Rejected: \t4\n",
"Iteration: \t34 / 100\n",
"Confirmed: \t5\n",
"Tentative: \t0\n",
"Rejected: \t5\n",
"\n",
"\n",
"BorutaPy finished running.\n",
"\n",
"Iteration: \t35 / 100\n",
"Confirmed: \t5\n",
"Tentative: \t0\n",
"Rejected: \t5\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/petercacioppi/anaconda/lib/python2.7/site-packages/boruta/boruta_py.py:418: RuntimeWarning: invalid value encountered in greater\n",
" hits = np.where(cur_imp[0] > imp_sha_max)[0]\n"
]
},
{
"data": {
"text/plain": [
"BorutaWithMore(alpha=0.05,\n",
" estimator=RandomForestClassifier(bootstrap=True, class_weight='auto', criterion='gini',\n",
" max_depth=5, max_features='auto', max_leaf_nodes=None,\n",
" min_impurity_split=1e-07, min_samples_leaf=1,\n",
" min_samples_split=2, min_weight_fraction_leaf=0.0,\n",
" n_estimators=69, n_jobs=-1, oob_score=False,\n",
" random_state=<mtrand.RandomState object at 0x1156183c0>,\n",
" verbose=0, warm_start=False),\n",
" max_iter=100, n_estimators='auto', perc=100,\n",
" random_state=<mtrand.RandomState object at 0x1156183c0>,\n",
" two_step=True, verbose=2)"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# run the Boruta selection on the training split; BorutaWithMore.fit then refits\n",
"# the wrapped estimator on the selected features\n",
"feat_selector.fit(X=train_X, y=train_y)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0.98799999999999999"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# score the refit estimator on the held-out split, using only the selected features\n",
"feat_selector.score(X=test_X, y=test_y)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1,\n",
" 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0,\n",
" 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1,\n",
" 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1,\n",
" 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0,\n",
" 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,\n",
" 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1,\n",
" 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,\n",
" 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1,\n",
" 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0,\n",
" 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1])"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# predictions for the held-out split, computed from the selected features\n",
"feat_selector.predict(X=test_X)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Iteration: \t1 / 100\n",
"Confirmed: \t0\n",
"Tentative: \t10\n",
"Rejected: \t0\n",
"Iteration: \t2 / 100\n",
"Confirmed: \t0\n",
"Tentative: \t10\n",
"Rejected: \t0\n",
"Iteration: \t3 / 100\n",
"Confirmed: \t0\n",
"Tentative: \t10\n",
"Rejected: \t0\n",
"Iteration: \t4 / 100\n",
"Confirmed: \t0\n",
"Tentative: \t10\n",
"Rejected: \t0\n",
"Iteration: \t5 / 100\n",
"Confirmed: \t0\n",
"Tentative: \t10\n",
"Rejected: \t0\n",
"Iteration: \t6 / 100\n",
"Confirmed: \t0\n",
"Tentative: \t10\n",
"Rejected: \t0\n",
"Iteration: \t7 / 100\n",
"Confirmed: \t0\n",
"Tentative: \t10\n",
"Rejected: \t0\n",
"Iteration: \t8 / 100\n",
"Confirmed: \t5\n",
"Tentative: \t0\n",
"Rejected: \t5\n",
"\n",
"\n",
"BorutaPy finished running.\n",
"\n",
"Iteration: \t9 / 100\n",
"Confirmed: \t5\n",
"Tentative: \t0\n",
"Rejected: \t5\n",
"Iteration: \t1 / 100\n",
"Confirmed: \t0\n",
"Tentative: \t10\n",
"Rejected: \t0\n",
"Iteration: \t2 / 100\n",
"Confirmed: \t0\n",
"Tentative: \t10\n",
"Rejected: \t0\n",
"Iteration: \t3 / 100\n",
"Confirmed: \t0\n",
"Tentative: \t10\n",
"Rejected: \t0\n",
"Iteration: \t4 / 100\n",
"Confirmed: \t0\n",
"Tentative: \t10\n",
"Rejected: \t0\n",
"Iteration: \t5 / 100\n",
"Confirmed: \t0\n",
"Tentative: \t10\n",
"Rejected: \t0\n",
"Iteration: \t6 / 100\n",
"Confirmed: \t0\n",
"Tentative: \t10\n",
"Rejected: \t0\n",
"Iteration: \t7 / 100\n",
"Confirmed: \t0\n",
"Tentative: \t10\n",
"Rejected: \t0\n",
"Iteration: \t8 / 100\n",
"Confirmed: \t5\n",
"Tentative: \t2\n",
"Rejected: \t3\n",
"Iteration: \t9 / 100\n",
"Confirmed: \t5\n",
"Tentative: \t2\n",
"Rejected: \t3\n",
"Iteration: \t10 / 100\n",
"Confirmed: \t5\n",
"Tentative: \t2\n",
"Rejected: \t3\n",
"Iteration: \t11 / 100\n",
"Confirmed: \t5\n",
"Tentative: \t2\n",
"Rejected: \t3\n",
"Iteration: \t12 / 100\n",
"Confirmed: \t5\n",
"Tentative: \t0\n",
"Rejected: \t5\n",
"\n",
"\n",
"BorutaPy finished running.\n",
"\n",
"Iteration: \t13 / 100\n",
"Confirmed: \t5\n",
"Tentative: \t0\n",
"Rejected: \t5\n",
"Iteration: \t1 / 100\n",
"Confirmed: \t0\n",
"Tentative: \t10\n",
"Rejected: \t0\n",
"Iteration: \t2 / 100\n",
"Confirmed: \t0\n",
"Tentative: \t10\n",
"Rejected: \t0\n",
"Iteration: \t3 / 100\n",
"Confirmed: \t0\n",
"Tentative: \t10\n",
"Rejected: \t0\n",
"Iteration: \t4 / 100\n",
"Confirmed: \t0\n",
"Tentative: \t10\n",
"Rejected: \t0\n",
"Iteration: \t5 / 100\n",
"Confirmed: \t0\n",
"Tentative: \t10\n",
"Rejected: \t0\n",
"Iteration: \t6 / 100\n",
"Confirmed: \t0\n",
"Tentative: \t10\n",
"Rejected: \t0\n",
"Iteration: \t7 / 100\n",
"Confirmed: \t0\n",
"Tentative: \t10\n",
"Rejected: \t0\n",
"Iteration: \t8 / 100\n",
"Confirmed: \t5\n",
"Tentative: \t0\n",
"Rejected: \t5\n",
"\n",
"\n",
"BorutaPy finished running.\n",
"\n",
"Iteration: \t9 / 100\n",
"Confirmed: \t5\n",
"Tentative: \t0\n",
"Rejected: \t5\n",
"Iteration: \t1 / 100\n",
"Confirmed: \t0\n",
"Tentative: \t10\n",
"Rejected: \t0\n",
"Iteration: \t2 / 100\n",
"Confirmed: \t0\n",
"Tentative: \t10\n",
"Rejected: \t0\n",
"Iteration: \t3 / 100\n",
"Confirmed: \t0\n",
"Tentative: \t10\n",
"Rejected: \t0\n",
"Iteration: \t4 / 100\n",
"Confirmed: \t0\n",
"Tentative: \t10\n",
"Rejected: \t0\n",
"Iteration: \t5 / 100\n",
"Confirmed: \t0\n",
"Tentative: \t10\n",
"Rejected: \t0\n",
"Iteration: \t6 / 100\n",
"Confirmed: \t0\n",
"Tentative: \t10\n",
"Rejected: \t0\n",
"Iteration: \t7 / 100\n",
"Confirmed: \t0\n",
"Tentative: \t10\n",
"Rejected: \t0\n",
"Iteration: \t8 / 100\n",
"Confirmed: \t5\n",
"Tentative: \t1\n",
"Rejected: \t4\n",
"Iteration: \t9 / 100\n",
"Confirmed: \t5\n",
"Tentative: \t1\n",
"Rejected: \t4\n",
"Iteration: \t10 / 100\n",
"Confirmed: \t5\n",
"Tentative: \t1\n",
"Rejected: \t4\n",
"Iteration: \t11 / 100\n",
"Confirmed: \t5\n",
"Tentative: \t1\n",
"Rejected: \t4\n",
"Iteration: \t12 / 100\n",
"Confirmed: \t5\n",
"Tentative: \t1\n",
"Rejected: \t4\n",
"Iteration: \t13 / 100\n",
"Confirmed: \t5\n",
"Tentative: \t1\n",
"Rejected: \t4\n",
"Iteration: \t14 / 100\n",
"Confirmed: \t5\n",
"Tentative: \t1\n",
"Rejected: \t4\n",
"Iteration: \t15 / 100\n",
"Confirmed: \t5\n",
"Tentative: \t1\n",
"Rejected: \t4\n",
"Iteration: \t16 / 100\n",
"Confirmed: \t5\n",
"Tentative: \t0\n",
"Rejected: \t5\n",
"\n",
"\n",
"BorutaPy finished running.\n",
"\n",
"Iteration: \t17 / 100\n",
"Confirmed: \t5\n",
"Tentative: \t0\n",
"Rejected: \t5\n",
"Iteration: \t1 / 100\n",
"Confirmed: \t0\n",
"Tentative: \t10\n",
"Rejected: \t0\n",
"Iteration: \t2 / 100\n",
"Confirmed: \t0\n",
"Tentative: \t10\n",
"Rejected: \t0\n",
"Iteration: \t3 / 100\n",
"Confirmed: \t0\n",
"Tentative: \t10\n",
"Rejected: \t0\n",
"Iteration: \t4 / 100\n",
"Confirmed: \t0\n",
"Tentative: \t10\n",
"Rejected: \t0\n",
"Iteration: \t5 / 100\n",
"Confirmed: \t0\n",
"Tentative: \t10\n",
"Rejected: \t0\n",
"Iteration: \t6 / 100\n",
"Confirmed: \t0\n",
"Tentative: \t10\n",
"Rejected: \t0\n",
"Iteration: \t7 / 100\n",
"Confirmed: \t0\n",
"Tentative: \t10\n",
"Rejected: \t0\n",
"Iteration: \t8 / 100\n",
"Confirmed: \t5\n",
"Tentative: \t1\n",
"Rejected: \t4\n",
"Iteration: \t9 / 100\n",
"Confirmed: \t5\n",
"Tentative: \t1\n",
"Rejected: \t4\n",
"Iteration: \t10 / 100\n",
"Confirmed: \t5\n",
"Tentative: \t1\n",
"Rejected: \t4\n",
"Iteration: \t11 / 100\n",
"Confirmed: \t5\n",
"Tentative: \t1\n",
"Rejected: \t4\n",
"Iteration: \t12 / 100\n",
"Confirmed: \t5\n",
"Tentative: \t1\n",
"Rejected: \t4\n",
"Iteration: \t13 / 100\n",
"Confirmed: \t5\n",
"Tentative: \t1\n",
"Rejected: \t4\n",
"Iteration: \t14 / 100\n",
"Confirmed: \t5\n",
"Tentative: \t1\n",
"Rejected: \t4\n",
"Iteration: \t15 / 100\n",
"Confirmed: \t5\n",
"Tentative: \t1\n",
"Rejected: \t4\n",
"Iteration: \t16 / 100\n",
"Confirmed: \t5\n",
"Tentative: \t0\n",
"Rejected: \t5\n",
"\n",
"\n",
"BorutaPy finished running.\n",
"\n",
"Iteration: \t17 / 100\n",
"Confirmed: \t5\n",
"Tentative: \t0\n",
"Rejected: \t5\n"
]
}
],
"source": [
"from sklearn import model_selection\n",
"\n",
"# 5-fold cross-validation of the selector-plus-estimator wrapper on the full data set\n",
"cv_score = model_selection.cross_val_score(\n",
"    feat_selector,\n",
"    X=X,\n",
"    y=y,\n",
"    cv=5,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([ 1. , 0.985, 0.98 , 0.99 , 0.985])"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# display the per-fold scores computed by cross_val_score (cv=5)\n",
"cv_score"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
}
},
"nbformat": 4,
"nbformat_minor": 0
}