|
"""Exploratory analysis of a CSV dataset with pandas.

The script runs top to bottom and:

1. loads ``population.csv`` into a DataFrame,
2. prints the first rows, the column overview and summary statistics,
3. reports missing values and fills numeric gaps with the column mean,
4. prints per-``Category`` means, the rows with ``Sales`` above 500 and the
   correlation matrix of the numeric columns,
5. writes the cleaned table to ``cleaned_data.csv``,
6. shows a histogram of ``Sales``.

The input file must contain a ``Category`` column and a numeric ``Sales``
column; both are referenced by name below.
"""
import pandas as pd

# Load dataset (replace 'population.csv' with the path to your file)
df = pd.read_csv('population.csv')

# 1. Inspect the Data
print("First 5 rows of the dataset:")
print(df.head())

print("\nBasic Information:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df.info()

print("\nSummary Statistics:")
print(df.describe())

# 2. Check for Missing Data
print("\nMissing Values per Column:")
print(df.isnull().sum())

# 3. Handle Missing Data (Example: Fill with mean or drop)
# Only numeric columns have a mean. numeric_only=True keeps text columns such
# as 'Category' out of the computation (pandas >= 2.0 raises TypeError
# otherwise); missing values in non-numeric columns are left untouched.
column_means = df.mean(numeric_only=True)
df_cleaned = df.fillna(column_means)
# Alternatively, drop rows with missing values:
#   df_cleaned = df.dropna()

# 4. Data Analysis - Grouping and Aggregation
# Example: Group by a column and calculate the mean of the numeric columns
grouped_data = df_cleaned.groupby('Category').mean(numeric_only=True)
print("\nMean values by Category:")
print(grouped_data)

# 5. Filter Data
# Example: Filter rows where the column 'Sales' is greater than 500
filtered_data = df_cleaned[df_cleaned['Sales'] > 500]
print("\nFiltered Data (Sales > 500):")
print(filtered_data)

# 6. Correlation Analysis
# Correlation is only defined for numeric data, so select those columns
# explicitly instead of relying on corr() to discard the rest.
numeric_columns = df_cleaned.select_dtypes(include=['number', 'bool'])
print("\nCorrelation between numerical columns:")
print(numeric_columns.corr())

# 7. Save the cleaned data to a new CSV file
df_cleaned.to_csv('cleaned_data.csv', index=False)

# Optional: Plotting (if you want to visualize the data)
# Imported here rather than at the top of the file so that the analysis and
# the CSV export above do not require matplotlib to be installed.
import matplotlib.pyplot as plt

df_cleaned['Sales'].hist(bins=20)
plt.title('Sales Distribution')
plt.xlabel('Sales')
plt.ylabel('Frequency')
plt.show()
0 commit comments