Commit 8b52a5b

Author shreeshan committed: initial commit - woe and iv
0 parents  commit 8b52a5b

File tree

4 files changed: +273 -0 lines changed

.gitignore

+141
@@ -0,0 +1,141 @@
# Created by .ignore support plugin (hsz.mobi)
### macOS template
# General
.DS_Store
.AppleDouble
.LSOverride

# Icon must end with two \r
Icon

# Thumbnails
._*

# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent

# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk
### Example user template template
### Example user template

# IntelliJ project files
.idea
*.iml
out
gen### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

algorithms_src/__init__.py

+13
@@ -0,0 +1,13 @@
# -*- coding: utf-8 -*-
"""
@created on: 27/03/19,
@author: Shreesha N,
@version: v0.0.1

Description:

Sphinx Documentation Status:

..todo::

"""

algorithms_src/woe_iv_score.md

Whitespace-only changes.

algorithms_src/woe_iv_score.py

+119
@@ -0,0 +1,119 @@
# -*- coding: utf-8 -*-
"""
@created on: 27/03/19,
@author: Shreesha N,
@version: v0.0.1

Description:

Sphinx Documentation Status:

..todo::

"""

import numpy as np
import pandas as pd
import traceback

pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)


def encode_continuous_column(data_column, category_count=10):
    """
    Converts a continuous column into a categorical one based on the category_count value
    :param data_column: structure containing continuous data
    :param category_count: number of buckets to create
    :return: column encoded from continuous to categorical
    """
    encoded_column = pd.cut(data_column, category_count, labels=['cat_' + str(x) for x in range(category_count)])
    return encoded_column


def calculate_woe(data, independent_var, dependent_var, is_continuous=None, category_count=10):
    """
    Calculates the weight of evidence of an independent variable against a dependent variable
    :param data: dataframe containing the independent and dependent variables
    :param independent_var: variable whose woe needs to be calculated
    :param dependent_var: target variable
    :param is_continuous: Default None; boolean indicating whether the independent_var passed is categorical or continuous
    :param category_count: Default 10; if the independent variable is continuous, the number of categories to derive from it
    :return: dictionary containing the woe and iv scores of the independent variable under the keys 'woe' and 'iv'
    """
    # calculate the total number of positive and negative samples in the data
    total_bads = data[dependent_var].sum()
    total_goods = len(data) - total_bads
    if total_bads == 0 or total_goods == 0:
        raise ValueError('Target variable does not contain two classes.')

    # if the column is continuous (flagged explicitly or float-typed), bucketize it into categories
    if is_continuous or pd.api.types.is_float_dtype(data[independent_var]):
        data[independent_var] = encode_continuous_column(data[independent_var], category_count=category_count)

    # cross-tabulate the independent variable against the target to get counts of goods (0) and bads (1)
    pivot = pd.crosstab(data[independent_var], data[dependent_var])
    feature_uniques = data[independent_var].unique()

    # dictionary to hold the values required for iv calculation
    values = {'category': [], 'goods_count': [], 'bads_count': [], 'goods_percentage': [], 'bads_percentage': [],
              'woe': [], 'iv': []}

    # iterate over all the unique categories in the independent variable
    for f in feature_uniques:
        values['category'].append(f)

        goods_count = pivot.loc[f, 0]
        values['goods_count'].append(goods_count)

        bads_count = pivot.loc[f, 1]
        values['bads_count'].append(bads_count)

        goods_percentage = goods_count / total_goods
        values['goods_percentage'].append(goods_percentage)

        bads_percentage = bads_count / total_bads
        values['bads_percentage'].append(bads_percentage)

        woe = np.log(goods_percentage / bads_percentage)
        values['woe'].append(woe)

        iv = woe * (goods_percentage - bads_percentage)
        values['iv'].append(iv)
    return values


def calculate_iv(data, independent_var, dependent_var, is_continuous=None, category_count=10):
    """
    This function assumes the data passed has been treated for null values and any other irregularities.
    Calculates the information value of an independent variable against a dependent variable
    :param data: dataframe containing the independent and dependent variables
    :param independent_var: variable whose IV needs to be calculated
    :param dependent_var: target variable
    :param is_continuous: Default None; boolean indicating whether the independent_var passed is categorical or continuous
    :param category_count: Default 10; if the independent variable is continuous, the number of categories to derive from it
    :return: iv score of the independent variable
    """
    try:
        values = calculate_woe(data, independent_var, dependent_var, is_continuous, category_count)
        df = pd.DataFrame(values)
        return df['iv'].sum()
    except Exception:
        traceback.print_exc()


if __name__ == '__main__':
    iv_scores = {}
    csv_filepath = ''
    data = pd.read_csv(csv_filepath)
    data = data.fillna(0)
    target_column = 'Actual Label'
    id_column = 'Customer_ID'
    cols_to_calculate_iv = [x for x in data.columns if x not in [target_column, id_column]]
    for col in cols_to_calculate_iv:
        print(col)
        iv_score = calculate_iv(data, col, target_column)
        iv_scores[col] = iv_score
    print(iv_scores)
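
A minimal usage sketch for the module above, assuming the algorithms_src package is importable from the repository root; the frame and column names below ('segment', 'income', 'default') are synthetic illustrations, not part of the committed files:

import numpy as np
import pandas as pd

from algorithms_src.woe_iv_score import calculate_iv, calculate_woe

# build a small synthetic dataset: one categorical feature, one continuous feature,
# and a binary target where 1 marks a "bad" (event) and 0 a "good" (non-event)
rng = np.random.default_rng(0)
demo = pd.DataFrame({
    'segment': rng.choice(['A', 'B', 'C'], size=500),
    'income': rng.normal(50000, 15000, size=500),
    'default': rng.integers(0, 2, size=500),
})

# WOE per category: ln(%goods / %bads); IV contribution per category: (%goods - %bads) * WOE
woe_table = pd.DataFrame(calculate_woe(demo, 'segment', 'default'))
print(woe_table[['category', 'woe', 'iv']])

# IV of a continuous feature; it is bucketized into 5 bins before scoring
print(calculate_iv(demo, 'income', 'default', is_continuous=True, category_count=5))

calculate_woe returns the per-category breakdown (counts, percentages, WOE, and IV contribution), while calculate_iv sums the per-category IV contributions into a single score for the feature.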

0 commit comments
