# AdeyanjuTeslim | Home | About Me | Portfolio | Resume | Email Me
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.datasets import fetch_california_housing, load_breast_cancer, load_wine
import warnings
warnings.filterwarnings('ignore')
def explain_iqr_mathematics():
    """
    Demonstrate the mathematical foundation of IQR.

    Draws 1000 samples from a normal distribution (mean 100, std 15) with a
    fixed seed, prints the quartiles, the IQR and the 1.5×IQR fences, and
    shows a 2x2 figure: box plot, histogram with quartile/fence lines,
    normal Q-Q plot and an index scatter with the outliers highlighted.

    Returns:
    --------
    tuple
        (Q1, Q2, Q3, IQR, lower_bound, upper_bound)
    """
    # Generate sample data (seeding the global NumPy RNG keeps the demo reproducible)
    np.random.seed(42)
    normal_data = np.random.normal(100, 15, 1000)

    # Calculate quartiles
    Q1 = np.percentile(normal_data, 25)
    Q2 = np.percentile(normal_data, 50)  # Median
    Q3 = np.percentile(normal_data, 75)
    IQR = Q3 - Q1

    # Calculate bounds
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    print("=== IQR MATHEMATICAL BREAKDOWN ===")
    print(f"Q1 (25th percentile): {Q1:.2f}")
    print(f"Q2 (50th percentile/Median): {Q2:.2f}")
    print(f"Q3 (75th percentile): {Q3:.2f}")
    print(f"IQR (Q3 - Q1): {IQR:.2f}")
    print(f"Lower Bound (Q1 - 1.5×IQR): {lower_bound:.2f}")
    print(f"Upper Bound (Q3 + 1.5×IQR): {upper_bound:.2f}")

    # Count outliers
    outliers = (normal_data < lower_bound) | (normal_data > upper_bound)
    print(f"Outliers detected: {outliers.sum()} ({(outliers.sum()/len(normal_data))*100:.2f}%)")

    # Visualize
    plt.figure(figsize=(12, 8))

    # Box plot
    plt.subplot(2, 2, 1)
    box_plot = plt.boxplot(normal_data, patch_artist=True)
    box_plot['boxes'][0].set_facecolor('lightblue')
    # Label the single box through the tick instead of boxplot's `labels=`
    # keyword, which newer Matplotlib releases deprecate.
    plt.xticks([1], ['Data'])
    plt.title('Box Plot Showing IQR Components')
    plt.ylabel('Value')

    # Histogram with quartiles
    plt.subplot(2, 2, 2)
    plt.hist(normal_data, bins=50, alpha=0.7, color='skyblue', edgecolor='black')
    plt.axvline(Q1, color='red', linestyle='--', label=f'Q1: {Q1:.1f}')
    plt.axvline(Q2, color='green', linestyle='--', label=f'Q2: {Q2:.1f}')
    plt.axvline(Q3, color='red', linestyle='--', label=f'Q3: {Q3:.1f}')
    plt.axvline(lower_bound, color='orange', linestyle=':', label=f'Lower Bound: {lower_bound:.1f}')
    plt.axvline(upper_bound, color='orange', linestyle=':', label=f'Upper Bound: {upper_bound:.1f}')
    plt.title('Distribution with IQR Boundaries')
    plt.xlabel('Value')
    plt.ylabel('Frequency')
    plt.legend(fontsize=8)

    # Q-Q plot
    plt.subplot(2, 2, 3)
    stats.probplot(normal_data, dist="norm", plot=plt)
    plt.title('Q-Q Plot (Normal Distribution Check)')

    # Outlier scatter: normal points and outliers are drawn as separate,
    # labelled artists so every legend entry matches what it describes.
    plt.subplot(2, 2, 4)
    positions = np.arange(len(normal_data))
    plt.scatter(positions[~outliers], normal_data[~outliers], alpha=0.6, c='blue', label='Normal')
    plt.scatter(positions[outliers], normal_data[outliers], alpha=0.6, c='red', label='Outliers')
    plt.axhline(lower_bound, color='orange', linestyle=':', label='Bounds')
    plt.axhline(upper_bound, color='orange', linestyle=':', label='_nolegend_')
    plt.title('Data Points with Outliers Highlighted')
    plt.xlabel('Index')
    plt.ylabel('Value')
    plt.legend()

    plt.tight_layout()
    plt.show()

    return Q1, Q2, Q3, IQR, lower_bound, upper_bound
# Demonstrate mathematical foundation: run the walkthrough once and keep the
# quartiles, the IQR and both fences as module-level names.
Q1, Q2, Q3, IQR, lower_bound, upper_bound = explain_iqr_mathematics()
def when_to_use_iqr():
    """
    Demonstrate scenarios where IQR is preferred over other methods.

    Prints each scenario with its description and example, then returns
    the underlying mapping.

    Returns:
    --------
    scenarios : dict
        Scenario name -> {'description': str, 'example': str}
    """
    scenarios = {
        'Skewed Data': {
            'description': 'When data is not normally distributed',
            'example': 'Income distributions, house prices, website traffic'
        },
        'Unknown Distribution': {
            # Double quotes so the apostrophe needs no escaping
            'description': "When you don't know the underlying distribution",
            'example': 'Exploratory data analysis, new datasets'
        },
        'Robust Detection': {
            'description': 'When you need outlier detection resistant to extreme values',
            'example': 'Financial data, sensor readings with noise'
        },
        'Small Sample Size': {
            'description': 'When sample size is small and Z-score is unreliable',
            'example': 'Clinical trials, A/B test results'
        },
        'Ordinal Data': {
            'description': 'When working with ranked or ordinal data',
            'example': 'Survey responses, ratings, rankings'
        }
    }

    print("=== WHEN TO USE IQR OUTLIER DETECTION ===")
    for scenario, details in scenarios.items():
        # Blank line before each scenario heading
        print(f"\n{scenario}:")
        print(f" Description: {details['description']}")
        print(f" Example: {details['example']}")

    return scenarios
# Display usage scenarios and keep the returned mapping as a module-level name
scenarios = when_to_use_iqr()
def iqr_limitations():
    """
    Demonstrate limitations of IQR method.

    Prints each limitation with its issue and suggested solution, then
    returns the underlying mapping.

    Returns:
    --------
    limitations : dict
        Limitation name -> {'issue': str, 'solution': str}
    """
    limitations = {
        'Fixed Threshold': {
            'issue': 'The 1.5 multiplier is arbitrary and may not suit all domains',
            'solution': 'Use domain-specific multipliers (1.0 for stricter, 2.0 for looser)'
        },
        'Symmetric Treatment': {
            'issue': 'Treats upper and lower outliers equally',
            'solution': 'Use asymmetric bounds for highly skewed data'
        },
        'Contextual Ignorance': {
            # Double quotes so the apostrophe needs no escaping
            'issue': "Doesn't consider business context or seasonality",
            'solution': 'Combine with domain knowledge and time-series analysis'
        },
        'Multiple Dimensions': {
            'issue': 'Applies to single dimensions only',
            'solution': 'Use multivariate methods for correlated features'
        }
    }

    print("=== IQR LIMITATIONS AND SOLUTIONS ===")
    for limitation, details in limitations.items():
        # Blank line before each limitation heading
        print(f"\n{limitation}:")
        print(f" Issue: {details['issue']}")
        print(f" Solution: {details['solution']}")

    return limitations
# Display limitations and keep the returned mapping as a module-level name
limitations = iqr_limitations()
def detect_outliers_iqr(data, multiplier=1.5, method='tukey'):
    """
    Detect outliers using IQR method

    Missing values are ignored when computing the quartiles and are never
    flagged as outliers.

    Parameters:
    -----------
    data : array-like
        Input data (pandas Series or numpy array)
    multiplier : float, default=1.5
        IQR multiplier for outlier bounds
    method : str, default='tukey'
        Method for quartile calculation:
        'tukey'     - pandas ``Series.quantile`` (linear interpolation)
        'inclusive' - ``numpy.percentile`` (linear interpolation)
        'exclusive' - Q1 is the nearest data point at or below the 25%
                      position, Q3 the nearest data point at or above the
                      75% position (a wider or equal IQR)

    Returns:
    --------
    outlier_mask : boolean pandas Series
        True for outliers, False for normal values. It has one entry per
        input element (same index as ``pd.Series(data)``), so it can be
        used to index the original data even when it contains NaN.
    bounds : dict
        Dictionary containing Q1, Q3, IQR, lower_bound, upper_bound, multiplier
    outlier_info : dict
        Additional information about outliers (counts, percentage of the
        non-missing values, most extreme value on each side or None)

    Raises:
    -------
    ValueError
        If ``method`` is not one of 'tukey', 'inclusive', 'exclusive'
    """
    series = pd.Series(data)
    # Handle missing values: quartiles are computed on the non-missing part only
    clean_data = series.dropna()

    # Calculate quartiles based on method
    if method == 'tukey':
        Q1 = clean_data.quantile(0.25)
        Q3 = clean_data.quantile(0.75)
    elif method == 'inclusive':
        Q1 = np.percentile(clean_data, 25)
        Q3 = np.percentile(clean_data, 75)
    elif method == 'exclusive':
        # pandas keeps the `interpolation` keyword that numpy.percentile deprecated
        Q1 = clean_data.quantile(0.25, interpolation='lower')
        Q3 = clean_data.quantile(0.75, interpolation='higher')
    else:
        raise ValueError(
            f"Unknown method {method!r}; expected 'tukey', 'inclusive' or 'exclusive'"
        )

    # Calculate IQR and bounds
    IQR = Q3 - Q1
    lower_bound = Q1 - multiplier * IQR
    upper_bound = Q3 + multiplier * IQR

    # Create outlier mask on the full series: comparisons against NaN are
    # False, so missing values stay unflagged and the mask keeps the input's index.
    below = series < lower_bound
    above = series > upper_bound
    outlier_mask = below | above

    # Prepare return information
    bounds = {
        'Q1': Q1,
        'Q3': Q3,
        'IQR': IQR,
        'lower_bound': lower_bound,
        'upper_bound': upper_bound,
        'multiplier': multiplier
    }

    n_valid = len(clean_data)
    n_outliers = outlier_mask.sum()
    outlier_info = {
        'n_outliers': n_outliers,
        # Guard against division by zero when there are no usable values
        'outlier_percentage': (n_outliers / n_valid) * 100 if n_valid else 0.0,
        'lower_outliers': below.sum(),
        'upper_outliers': above.sum(),
        'extreme_lower': series[below].min() if below.any() else None,
        'extreme_upper': series[above].max() if above.any() else None
    }

    return outlier_mask, bounds, outlier_info
# Load real dataset for demonstration: the scikit-learn California housing
# data becomes a DataFrame with one column per feature plus a 'target' column.
housing = fetch_california_housing()
df = pd.DataFrame(housing.data, columns=housing.feature_names)
df['target'] = housing.target

# Quick overview of what was loaded ("\n" inserts a blank line before each section)
print("Dataset loaded. Shape:", df.shape)
print("\nColumns:", df.columns.tolist())
print("\nFirst few rows:")
print(df.head())
def analyze_single_column_iqr(df, column_name, multiplier=1.5):
    """
    Comprehensive analysis of a single column using IQR method

    Prints basic statistics, the IQR bounds and an outlier summary for
    ``df[column_name]``, shows a 2x2 diagnostic figure, and prints the ten
    largest outlier rows when any exist.

    Parameters:
    -----------
    df : pandas DataFrame
        Input dataframe
    column_name : str
        Name of the numeric column to analyze
    multiplier : float, default=1.5
        IQR multiplier for outlier bounds

    Returns:
    --------
    outlier_mask : boolean pandas Series
        True for outlier rows, aligned with ``df.index``
    bounds : dict
        Q1, Q3, IQR, lower_bound, upper_bound, multiplier
    outlier_info : dict
        Outlier counts, percentage and extreme values
    outliers_df : pandas DataFrame
        Copy of the outlier rows with an extra 'outlier_type' column
        ('Lower' or 'Upper')
    """
    series = df[column_name]

    print(f"=== IQR ANALYSIS: {column_name} ===")

    # Basic statistics
    print(f"Column: {column_name}")
    print(f"Data type: {series.dtype}")
    print(f"Non-null values: {series.count()}")
    print(f"Mean: {series.mean():.4f}")
    print(f"Median: {series.median():.4f}")
    print(f"Std: {series.std():.4f}")

    # Detect outliers
    outlier_mask, bounds, outlier_info = detect_outliers_iqr(series, multiplier)
    # The mask may omit rows with missing values; align it with df so it can
    # index the frame directly (missing values are treated as non-outliers).
    if not outlier_mask.index.equals(df.index):
        outlier_mask = outlier_mask.reindex(df.index, fill_value=False)

    # Display results
    print("\n--- IQR Statistics ---")
    for key, value in bounds.items():
        print(f"{key}: {value:.4f}")

    print("\n--- Outlier Summary ---")
    for key, value in outlier_info.items():
        if value is not None:
            if isinstance(value, float):
                print(f"{key}: {value:.4f}")
            else:
                print(f"{key}: {value}")

    # Create outlier DataFrame; every flagged value lies outside exactly one
    # fence, so anything that is not below the lower fence is an upper outlier.
    outliers_df = df[outlier_mask].copy()
    outliers_df['outlier_type'] = np.where(
        outliers_df[column_name] < bounds['lower_bound'], 'Lower', 'Upper'
    )

    # Visualize results (missing values are dropped for the plots that cannot draw them)
    valid_values = series.dropna()
    _, axes = plt.subplots(2, 2, figsize=(15, 12))

    # Box plot
    axes[0, 0].boxplot(valid_values, patch_artist=True)
    axes[0, 0].set_title(f'Box Plot: {column_name}')
    axes[0, 0].set_ylabel('Value')

    # Histogram with bounds
    axes[0, 1].hist(valid_values, bins=50, alpha=0.7, color='skyblue', edgecolor='black')
    axes[0, 1].axvline(bounds['lower_bound'], color='red', linestyle='--', label=f'Lower: {bounds["lower_bound"]:.2f}')
    axes[0, 1].axvline(bounds['upper_bound'], color='red', linestyle='--', label=f'Upper: {bounds["upper_bound"]:.2f}')
    axes[0, 1].axvline(bounds['Q1'], color='green', linestyle=':', label=f'Q1: {bounds["Q1"]:.2f}')
    axes[0, 1].axvline(bounds['Q3'], color='green', linestyle=':', label=f'Q3: {bounds["Q3"]:.2f}')
    axes[0, 1].set_title(f'Distribution: {column_name}')
    axes[0, 1].set_xlabel('Value')
    axes[0, 1].set_ylabel('Frequency')
    axes[0, 1].legend()

    # Scatter plot with outliers: normal points and outliers are separate,
    # labelled artists so the legend entries match what they describe.
    positions = np.arange(len(df))
    values = series.to_numpy()
    is_outlier = outlier_mask.to_numpy()
    axes[1, 0].scatter(positions[~is_outlier], values[~is_outlier], c='blue', alpha=0.6, s=1, label='Normal')
    axes[1, 0].scatter(positions[is_outlier], values[is_outlier], c='red', alpha=0.6, s=1, label='Outliers')
    axes[1, 0].axhline(bounds['lower_bound'], color='red', linestyle='--', alpha=0.8, label='Bounds')
    axes[1, 0].axhline(bounds['upper_bound'], color='red', linestyle='--', alpha=0.8)
    axes[1, 0].set_title(f'Outliers Highlighted: {column_name}')
    axes[1, 0].set_xlabel('Index')
    axes[1, 0].set_ylabel('Value')
    axes[1, 0].legend()

    # Before/After comparison
    clean_data = series[~outlier_mask].dropna()
    axes[1, 1].hist(valid_values, bins=30, alpha=0.5, label='Original', color='lightcoral')
    axes[1, 1].hist(clean_data, bins=30, alpha=0.7, label='After Cleaning', color='lightgreen')
    axes[1, 1].set_title('Before vs After Outlier Removal')
    axes[1, 1].set_xlabel('Value')
    axes[1, 1].set_ylabel('Frequency')
    axes[1, 1].legend()

    plt.tight_layout()
    plt.show()

    # Display top outliers
    if not outliers_df.empty:
        print("\n--- Top 10 Outliers ---")
        top_outliers = outliers_df.nlargest(10, column_name)[[column_name, 'outlier_type']]
        print(top_outliers)

    return outlier_mask, bounds, outlier_info, outliers_df
# Analyze median house value: run the single-column IQR analysis on the
# 'target' column with the standard 1.5 multiplier and keep all four results.
outlier_mask, bounds, outlier_info, outliers_df = analyze_single_column_iqr(df, 'target', multiplier=1.5)
def multiplier_sensitivity_analysis(df, column_name, multipliers=(1.0, 1.5, 2.0, 2.5, 3.0)):
    """
    Analyze how different multipliers affect outlier detection

    Runs ``detect_outliers_iqr`` on ``df[column_name]`` once per multiplier,
    prints the results as a table and plots the outlier count and both
    bounds against the multiplier.

    Parameters:
    -----------
    df : pandas DataFrame
        Input dataframe
    column_name : str
        Name of the column to analyze
    multipliers : iterable of float, default=(1.0, 1.5, 2.0, 2.5, 3.0)
        IQR multipliers to compare (an immutable default avoids sharing one
        list object between calls)

    Returns:
    --------
    sensitivity_df : pandas DataFrame
        One row per multiplier with the columns multiplier, n_outliers,
        outlier_percentage, lower_outliers, upper_outliers, lower_bound,
        upper_bound
    """
    print(f"=== MULTIPLIER SENSITIVITY ANALYSIS: {column_name} ===")

    results = []
    for mult in multipliers:
        _, bounds, outlier_info = detect_outliers_iqr(df[column_name], multiplier=mult)
        results.append({
            'multiplier': mult,
            'n_outliers': outlier_info['n_outliers'],
            'outlier_percentage': outlier_info['outlier_percentage'],
            'lower_outliers': outlier_info['lower_outliers'],
            'upper_outliers': outlier_info['upper_outliers'],
            'lower_bound': bounds['lower_bound'],
            'upper_bound': bounds['upper_bound']
        })

    sensitivity_df = pd.DataFrame(results)
    print(sensitivity_df.round(3))

    # Visualize sensitivity
    _, axes = plt.subplots(1, 2, figsize=(15, 6))

    # Outlier count vs multiplier
    axes[0].plot(sensitivity_df['multiplier'], sensitivity_df['n_outliers'], 'o-', linewidth=2, markersize=8)
    axes[0].set_xlabel('IQR Multiplier')
    axes[0].set_ylabel('Number of Outliers')
    axes[0].set_title('Outlier Count vs Multiplier')
    axes[0].grid(True, alpha=0.3)

    # Bounds vs multiplier
    axes[1].plot(sensitivity_df['multiplier'], sensitivity_df['lower_bound'], 'o-', label='Lower Bound', linewidth=2)
    axes[1].plot(sensitivity_df['multiplier'], sensitivity_df['upper_bound'], 'o-', label='Upper Bound', linewidth=2)
    axes[1].set_xlabel('IQR Multiplier')
    axes[1].set_ylabel('Bound Value')
    axes[1].set_title('Outlier Bounds vs Multiplier')
    axes[1].legend()
    axes[1].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    return sensitivity_df
# Perform sensitivity analysis on the 'target' column using the default
# multipliers and keep the resulting table.
sensitivity_results = multiplier_sensitivity_analysis(df, 'target')
def detect_outliers_iqr_multiple(df, columns=None, multiplier=1.5, method='tukey',
                                 strategy='any', min_outlier_columns=1):
    """
    Detect outliers across multiple columns using IQR method

    Parameters:
    -----------
    df : pandas DataFrame
        Input dataframe
    columns : list, optional
        Columns to analyze. If None, all numeric columns
    multiplier : float, default=1.5
        IQR multiplier for outlier bounds
    method : str, default='tukey'
        Method for quartile calculation
    strategy : str, default='any'
        Strategy for multi-column outliers:
        'any'       - row is an outlier in at least one column
        'all'       - row is an outlier in every analyzed column
        'majority'  - row is an outlier in more than half of the columns
        'min_count' - row is an outlier in at least min_outlier_columns columns
    min_outlier_columns : int, default=1
        Minimum columns with outliers to flag row (for 'min_count' strategy)

    Returns:
    --------
    results : dict
        'column_results'  - per column: bounds, outlier_info, outlier_mask
        'column_summary'  - DataFrame with one summary row per column
        'outlier_details' - DataFrame with one row per flagged row, listing
                            the offending columns, their values and side
        'combined_mask'   - boolean Series over df.index, True for flagged rows
        'overall_stats'   - totals, percentage and the settings used
        'clean_data'      - copy of df without the flagged rows

    Raises:
    -------
    ValueError
        If ``strategy`` is not one of the supported names
    """
    valid_strategies = ('any', 'all', 'majority', 'min_count')
    if strategy not in valid_strategies:
        # Fail loudly: an unrecognized strategy would otherwise flag nothing
        raise ValueError(
            f"Unknown strategy {strategy!r}; expected one of {valid_strategies}"
        )

    if columns is None:
        columns = df.select_dtypes(include=[np.number]).columns.tolist()

    # Process each column
    column_results = {}
    outlier_masks = {}
    for col in columns:
        outlier_mask, bounds, outlier_info = detect_outliers_iqr(df[col], multiplier, method)
        # A per-column mask may omit rows with missing values; align it with
        # df so rows can be combined (missing values count as non-outliers).
        if not outlier_mask.index.equals(df.index):
            outlier_mask = outlier_mask.reindex(df.index, fill_value=False)
        column_results[col] = {
            'bounds': bounds,
            'outlier_info': outlier_info,
            'outlier_mask': outlier_mask
        }
        outlier_masks[col] = outlier_mask

    # Combine outlier masks based on strategy: every strategy is a threshold
    # on the number of columns in which a row is an outlier.
    mask_arrays = {col: outlier_masks[col].to_numpy(dtype=bool) for col in columns}
    outlier_counts = pd.DataFrame(mask_arrays, index=df.index).sum(axis=1)
    if strategy == 'any':
        combined_mask = outlier_counts > 0
    elif strategy == 'all':
        combined_mask = outlier_counts == len(columns)
    elif strategy == 'majority':
        combined_mask = outlier_counts > (len(columns) / 2)
    else:  # 'min_count'
        combined_mask = outlier_counts >= min_outlier_columns

    # Calculate overall statistics
    n_outlier_rows = combined_mask.sum()
    overall_stats = {
        'total_rows': len(df),
        'outlier_rows': n_outlier_rows,
        'outlier_percentage': (n_outlier_rows / len(df)) * 100,
        'strategy': strategy,
        'columns_analyzed': columns,
        'multiplier': multiplier
    }

    # Create outlier summary by column
    column_summary = []
    for col in columns:
        info = column_results[col]['outlier_info']
        bounds = column_results[col]['bounds']
        column_summary.append({
            'column': col,
            'n_outliers': info['n_outliers'],
            'outlier_percentage': info['outlier_percentage'],
            'lower_outliers': info['lower_outliers'],
            'upper_outliers': info['upper_outliers'],
            'Q1': bounds['Q1'],
            'Q3': bounds['Q3'],
            'IQR': bounds['IQR'],
            'lower_bound': bounds['lower_bound'],
            'upper_bound': bounds['upper_bound']
        })
    column_summary_df = pd.DataFrame(column_summary)

    # Create detailed outlier information. Rows are visited by position on
    # plain arrays, which avoids repeated label lookups and also works when
    # the index contains duplicate labels.
    value_arrays = {col: df[col].to_numpy() for col in columns}
    lower_bounds = {col: column_results[col]['bounds']['lower_bound'] for col in columns}
    outlier_details = []
    for pos in np.flatnonzero(combined_mask.to_numpy()):
        row_info = {'index': df.index[pos]}
        outlier_columns = []
        for col in columns:
            if mask_arrays[col][pos]:
                value = value_arrays[col][pos]
                outlier_columns.append(col)
                row_info[f'{col}_value'] = value
                row_info[f'{col}_type'] = 'Lower' if value < lower_bounds[col] else 'Upper'
        row_info['outlier_columns'] = outlier_columns
        row_info['n_outlier_columns'] = len(outlier_columns)
        outlier_details.append(row_info)
    outlier_details_df = pd.DataFrame(outlier_details)

    return {
        'column_results': column_results,
        'column_summary': column_summary_df,
        'outlier_details': outlier_details_df,
        'combined_mask': combined_mask,
        'overall_stats': overall_stats,
        'clean_data': df[~combined_mask].copy()
    }
# Apply multi-column analysis: flag a row when it is an outlier in ANY
# numeric column, using the standard 1.5 multiplier.
multi_results = detect_outliers_iqr_multiple(df, multiplier=1.5, strategy='any')

# Overall figures
print("=== MULTI-COLUMN IQR ANALYSIS ===")
print(f"Strategy: {multi_results['overall_stats']['strategy']}")
print(f"Total rows: {multi_results['overall_stats']['total_rows']}")
print(f"Outlier rows: {multi_results['overall_stats']['outlier_rows']}")
print(f"Outlier percentage: {multi_results['overall_stats']['outlier_percentage']:.2f}%")

# Per-column breakdown ("\n" inserts a blank line before each section)
print("\n=== COLUMN SUMMARY ===")
print(multi_results['column_summary'].round(3))

# Rows that are outliers in the largest number of columns
print("\n=== TOP 10 OUTLIER ROWS ===")
if not multi_results['outlier_details'].empty:
    top_outliers = multi_results['outlier_details'].nlargest(10, 'n_outlier_columns')
    print(top_outliers[['index', 'outlier_columns', 'n_outlier_columns']].head(10))
def compare_outlier_strategies(df, columns=None, multiplier=1.5):
    """
    Compare different strategies for multi-column outlier detection

    Runs ``detect_outliers_iqr_multiple`` with 'any', 'all', 'majority' and
    with 'min_count' at thresholds 1, 2 and 3, prints a comparison table and
    draws two bar charts (outlier rows and data retention per strategy).

    Parameters:
    -----------
    df : pandas DataFrame
        Input dataframe
    columns : list, optional
        Columns to analyze. If None, all numeric columns
    multiplier : float, default=1.5
        IQR multiplier for outlier bounds

    Returns:
    --------
    comparison_df : pandas DataFrame
        One row per strategy with the columns strategy, outlier_rows,
        outlier_percentage, remaining_rows, data_retention
    """
    if columns is None:
        columns = df.select_dtypes(include=[np.number]).columns.tolist()

    # Each run is (label, keyword arguments); 'min_count' expands to one run
    # per tested threshold.
    runs = [(name, {'strategy': name}) for name in ('any', 'all', 'majority')]
    runs.extend(
        (f"min_count_{threshold}", {'strategy': 'min_count', 'min_outlier_columns': threshold})
        for threshold in (1, 2, 3)
    )

    print("=== STRATEGY COMPARISON ===")
    strategy_results = {
        label: detect_outliers_iqr_multiple(df, columns, multiplier, **options)['overall_stats']
        for label, options in runs
    }

    # Create comparison DataFrame
    comparison_data = []
    for label, summary in strategy_results.items():
        kept_rows = summary['total_rows'] - summary['outlier_rows']
        comparison_data.append({
            'strategy': label,
            'outlier_rows': summary['outlier_rows'],
            'outlier_percentage': summary['outlier_percentage'],
            'remaining_rows': kept_rows,
            'data_retention': (kept_rows / summary['total_rows']) * 100
        })
    comparison_df = pd.DataFrame(comparison_data)
    print(comparison_df.round(2))

    # Visualize comparison: one bar chart per metric, sharing the same layout
    fig, axes = plt.subplots(1, 2, figsize=(15, 6))
    panels = (
        ('outlier_rows', 'lightcoral', 'Number of Outliers', 'Outliers Detected by Strategy'),
        ('data_retention', 'lightgreen', 'Data Retention (%)', 'Data Retention by Strategy'),
    )
    for axis, (metric, bar_color, y_label, heading) in zip(axes, panels):
        axis.bar(comparison_df['strategy'], comparison_df[metric], color=bar_color, alpha=0.7)
        axis.set_xlabel('Strategy')
        axis.set_ylabel(y_label)
        axis.set_title(heading)
        axis.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

    return comparison_df
# Compare strategies across all numeric columns of df with the default
# multiplier and keep the comparison table.
strategy_comparison = compare_outlier_strategies(df)
def adaptive_iqr_detection(df, column_configs=None, global_multiplier=1.5):
"""
Adaptive IQR detection with column-specific configurations
Parameters:
-----------
df : pandas DataFrame
Input dataframe
column_configs : dict, optional
Column-specific configurations
Example: {'column_name': {'multiplier': 2.0, 'bounds': 'asymmetric'}}
global_multiplier : float, default=1.5
Default multiplier for columns not in column_configs
"""
if column_configs is None:
column_configs = {}
numeric_columns = df.select_dtypes(include=[np.number]).columns
results = {}
for col in numeric_columns:
config = column_configs.get(col, {})
multiplier = config.get('multiplier', global_multiplier)
bounds_type = config.get('bounds', 'symmetric')
# Calculate basic IQR components
Q1 = df[col].quantile(0.25)
Q3 = df[col].quantile(0.75)
IQR = Q3 - Q1
# Apply