|
| 1 | +# -*- coding: utf-8 -*- |
| 2 | +""" |
| 3 | +@created on: 27/03/19, |
| 4 | +@author: Shreesha N, |
| 5 | +@version: v0.0.1 |
| 6 | +
|
| 7 | +Description: |
| 8 | +
|
| 9 | +Sphinx Documentation Status: |
| 10 | +
|
| 11 | +.. todo:: |
| 12 | +
|
| 13 | +""" |
| 14 | + |
| 15 | +import numpy as np |
| 16 | +import pandas as pd |
| 17 | +import traceback |
| 18 | + |
| 19 | +pd.set_option('display.max_rows', 1000) |
| 20 | +pd.set_option('display.max_columns', 1000) |
| 21 | + |
| 22 | + |
def encode_continuous_column(data_column, category_count=10):
    """
    Bucket a continuous column into labelled categories using ``pd.cut``.

    The buckets are labelled ``cat_0`` ... ``cat_<category_count - 1>``.

    :param data_column: structure containing continuous data
    :param category_count: number of buckets to create
    :return: the result of ``pd.cut``, i.e. the column with every value replaced by its bucket label
    """
    bucket_labels = ['cat_{}'.format(bucket_index) for bucket_index in range(category_count)]
    return pd.cut(data_column, category_count, labels=bucket_labels)
| 32 | + |
| 33 | + |
def calculate_woe(data, independent_var, dependent_var, is_continuous=None, category_count=10):
    """
    Calculates the weight of evidence (WoE) of an independent variable against a binary dependent variable.

    The dependent variable is expected to be coded 0 ("good") / 1 ("bad"). The input dataframe is
    not modified: a continuous variable is bucketed on a local copy of the column only.

    :param data: dataframe which contains both the independent and the dependent variable
    :param independent_var: name of the variable whose woe needs to be calculated
    :param dependent_var: name of the target variable (values 0 and 1)
    :param is_continuous: Default None; Boolean indicating whether independent_var is continuous.
                          Float columns are always treated as continuous, whatever this flag says
    :param category_count: Default 10; If the independent variable is continuous, this parameter
                           defines the number of categories to derive from the variable
    :return: dictionary of equally long lists under the keys 'category', 'goods_count', 'bads_count',
             'goods_percentage', 'bads_percentage', 'woe' and 'iv', one entry per observed category
             (in order of first appearance). 'woe' and 'iv' are NaN for a category that is missing
             one of the two classes, because the log-ratio is undefined there
    :raises ValueError: if the target variable does not contain both classes
    """
    target = data[dependent_var]

    # calculate total number of positive and negative samples in data
    total_bads = target.sum()
    total_goods = len(data) - total_bads
    if total_bads == 0 or total_goods == 0:
        raise ValueError('Target variable does not contain two classes. ')

    # Work on a local reference so the caller's dataframe is never overwritten.
    feature = data[independent_var]

    # `np.float` no longer exists in current NumPy releases; use the pandas dtype check instead.
    if is_continuous or pd.api.types.is_float_dtype(feature):
        feature = encode_continuous_column(feature, category_count=category_count)

    # Count rows per (category, class). Only the two relevant columns take part, so the result
    # does not depend on which other columns `data` happens to contain.
    frame = pd.DataFrame({'category': feature, 'label': target})
    counts = frame.groupby(['category', 'label'], observed=True).size().unstack(fill_value=0)
    # Guarantee both class columns exist even if a class label never shows up in the groups.
    counts = counts.reindex(columns=[0, 1], fill_value=0)

    # dictionary to hold values required for iv calculation
    values = {'category': [], 'goods_count': [], 'bads_count': [], 'goods_percentage': [], 'bads_percentage': [],
              'woe': [], 'iv': []}

    # iterate over all the observed categories; NaN is not a category and is skipped
    for category in feature.dropna().unique():
        goods_count = int(counts.at[category, 0])
        bads_count = int(counts.at[category, 1])
        goods_percentage = goods_count / total_goods
        bads_percentage = bads_count / total_bads

        if goods_count == 0 or bads_count == 0:
            # log(0) or division by zero: WoE is undefined for this category.
            woe = float('nan')
            iv = float('nan')
        else:
            woe = np.log(goods_percentage / bads_percentage)
            iv = woe * (goods_percentage - bads_percentage)

        values['category'].append(category)
        values['goods_count'].append(goods_count)
        values['bads_count'].append(bads_count)
        values['goods_percentage'].append(goods_percentage)
        values['bads_percentage'].append(bads_percentage)
        values['woe'].append(woe)
        values['iv'].append(iv)
    return values
| 86 | + |
| 87 | + |
def calculate_iv(data, independent_var, dependent_var, is_continuous=None, category_count=10):
    """
    Calculates the information value (IV) of an independent variable against a dependent variable.

    This function assumes the data passed is treated for null values and any other irregularities.
    It is best-effort: if anything goes wrong while computing the score, the traceback is printed
    and None is returned instead of an exception being raised.

    :param data: dataframe which contains both the independent and the dependent variable
    :param independent_var: variable whose IV needs to be calculated
    :param dependent_var: target variable
    :param is_continuous: Default None; Boolean indicating whether independent_var is continuous
    :param category_count: Default 10; If the independent variable is continuous, this parameter
                           defines the number of categories to derive from the variable
    :return: iv score of the independent variable, or None when the calculation failed
    """
    try:
        woe_table = pd.DataFrame(
            calculate_woe(data, independent_var, dependent_var, is_continuous, category_count)
        )
        # The total IV is the sum of the per-category contributions.
        return woe_table['iv'].sum()
    except Exception:
        traceback.print_exc()
        return None
| 105 | + |
| 106 | + |
if __name__ == '__main__':
    # Script entry point: print the IV score of every feature column of a CSV file.
    # NOTE: csv_filepath is an empty placeholder and must be set before running.
    csv_filepath = ''
    target_column = 'Actual Label'
    id_column = 'Customer_ID'

    # Missing values are replaced by 0 before scoring.
    data = pd.read_csv(csv_filepath).fillna(0)

    # Every column except the target and the identifier is scored.
    cols_to_calculate_iv = [
        name for name in data.columns
        if name != target_column and name != id_column
    ]

    iv_scores = {}
    for col in cols_to_calculate_iv:
        print(col)
        iv_scores[col] = calculate_iv(data, col, target_column)
    print(iv_scores)
0 commit comments