Skip to content

Commit cc2694b

Browse files
committed
Add files via upload
1 parent 3047588 commit cc2694b

File tree

1 file changed

+133
-0
lines changed

1 file changed

+133
-0
lines changed

feature_distribution_analysis.py

+133
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
"""
2+
Feature Distribution Analysis for Machine Learning
3+
===================================================
4+
This script demonstrates how to analyze and visualize feature distributions
5+
to understand whether the data is symmetric or skewed. It includes:
6+
1. Histogram and KDE plots
7+
2. Box plots for outliers
8+
3. Skewness calculation for numerical features
9+
4. Recommendations for imputation strategies based on distribution
10+
11+
"""
12+
13+
import pandas as pd
14+
import numpy as np
15+
import seaborn as sns
16+
import matplotlib.pyplot as plt
17+
from scipy.stats import skew
18+
19+
# ===========================
20+
# 1. Sample Dataset Creation
21+
# ===========================
22+
23+
# Small in-memory sample: three numerical features, ten rows each.
# "Age" and "Salary" each carry one missing entry (np.nan).
data = {
    "Age": [
        25, 32, 40, 47, 52,
        29, 31, 45, 38, np.nan,
    ],
    "Salary": [
        50_000, 60_000, 52_000, 75_000, 80_000,
        62_000, 62_000, 70_000, np.nan, 58_000,
    ],
    "House_Price": [
        150_000, 200_000, 210_000, 450_000, 400_000,
        250_000, 240_000, 500_000, 300_000, 310_000,
    ],
}

df = pd.DataFrame(data=data)
print("Original Dataset:\n", df)
31+
32+
33+
# ============================
34+
# 2. Visualizing Distributions
35+
# ============================
36+
37+
def plot_distribution_and_skew(df, skew_threshold=0.5):
    """
    Plot the distribution of every numerical column and report its skewness.

    For each numerical column this draws a histogram with a KDE overlay and
    a separate box plot, shows the figures, then prints the sample skewness
    together with a one-line classification (positively skewed, negatively
    skewed, or approximately symmetric).

    Parameters:
    df (pd.DataFrame): The DataFrame containing numerical features.
        Non-numerical columns are ignored.
    skew_threshold (float): Skewness magnitude above which a column is
        reported as skewed. Defaults to 0.5.

    Returns:
    None. Output is produced through matplotlib figures and print().
    """
    for column in df.select_dtypes(include=[np.number]).columns:
        # Missing values are excluded from the skewness calculation.
        observed = df[column].dropna()
        if observed.empty:
            # With no observations there is nothing to plot and the
            # skewness is undefined, so report it and move on.
            print(f"{column} has no non-missing values; skipping.\n")
            continue

        # Histogram + KDE Plot
        plt.figure(figsize=(12, 6))
        sns.histplot(df[column], kde=True, bins=10, color='blue', stat='density')
        plt.title(f"Distribution of {column}", fontsize=16)
        plt.xlabel(column, fontsize=12)
        plt.ylabel("Density", fontsize=12)
        plt.grid(axis='y', alpha=0.75)

        # Box Plot for Outliers
        plt.figure(figsize=(6, 4))
        sns.boxplot(x=df[column], color='orange')
        plt.title(f"Box Plot of {column}", fontsize=16)
        plt.xlabel(column, fontsize=12)
        plt.show()

        # Skewness Calculation
        column_skew = skew(observed)
        print(f"{column} Skewness: {column_skew:.2f}")
        if np.isnan(column_skew):
            # NaN fails both comparisons below, so without this branch an
            # undefined skewness would be reported as "symmetric".
            print(f"{column} skewness is undefined.\n")
        elif column_skew > skew_threshold:
            print(f"{column} is positively skewed.\n")
        elif column_skew < -skew_threshold:
            print(f"{column} is negatively skewed.\n")
        else:
            print(f"{column} is approximately symmetric.\n")
71+
72+
73+
# Call the function
74+
# Runs the plots and the skewness report on the sample DataFrame `df`.
plot_distribution_and_skew(df)
75+
76+
# ============================
77+
# 3. Handling Missing Values
78+
# ============================
79+
80+
print("\n--- Handling Missing Values ---")

# Strategy based on distribution
for column in df.columns:
    if column == "Age":
        # Age is approximately symmetric (use mean imputation)
        # Assign the result back instead of calling fillna(inplace=True) on
        # df[column]: the in-place call acts on a selected column and is not
        # guaranteed to write through to df (chained assignment).
        df[column] = df[column].fillna(df[column].mean())
        print(f"{column}: Mean imputation applied.")
    elif column == "Salary":
        # Salary is slightly skewed (use median imputation)
        df[column] = df[column].fillna(df[column].median())
        print(f"{column}: Median imputation applied.")

print("\nAfter Imputation:\n", df)
94+
95+
# ============================
96+
# 4. Recommendations Summary
97+
# ============================
98+
99+
print("\n--- Recommendations Summary ---")

# The set of numerical columns does not change inside the loop, so it is
# computed once up front.
numeric_columns = df.select_dtypes(include=[np.number]).columns

for column in df.columns:
    if column not in numeric_columns:
        print(f"{column}: Non-numerical or categorical data.")
        continue

    # Drop missing values first: skew() otherwise returns NaN for a column
    # that still contains NaN, and NaN fails both threshold comparisons.
    column_skew = skew(df[column].dropna())
    if np.isnan(column_skew):
        print(f"{column}: Skewness is undefined. No imputation recommendation.")
    elif column_skew > 0.5:
        print(f"{column}: Positively skewed. Consider using median imputation or transformations (e.g., log).")
    elif column_skew < -0.5:
        print(f"{column}: Negatively skewed. Consider using median imputation.")
    else:
        print(f"{column}: Symmetric. Mean imputation is suitable.")
111+
112+
# ============================
113+
# 5. Final Dataset
114+
# ============================
115+
116+
# Prints `df` as it stands after the imputation loop earlier in the script.
print("\nFinal Dataset After Analysis and Imputation:\n", df)
117+
118+
"""
119+
Output Insights:
120+
----------------
121+
1. Distributions:
122+
- Histograms and KDE plots provide visual insights into the shape of the data.
123+
- Box plots help identify outliers.
124+
2. Skewness:
125+
- Approximately symmetric (-0.5 ≤ Skewness ≤ 0.5): Use mean for imputation.
126+
- Positively skewed (Skewness > 0.5): Use median for imputation or log transformation.
127+
- Negatively skewed (Skewness < -0.5): Use median for imputation.
128+
3. Handling Missing Values:
129+
- Imputation strategy depends on distribution shape.
130+
4. Recommendations:
131+
- Adjust feature scaling or transformations for better model performance.
132+
133+
"""

0 commit comments

Comments
 (0)