import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.datasets import fetch_california_housing, load_breast_cancer, load_wine
import warnings
warnings.filterwarnings('ignore')
def explain_iqr_mathematics():
    """
    Demonstrate the mathematical foundation of IQR

    Draws 1000 samples from a normal distribution (mean 100, std 15) with a
    fixed seed, computes the quartiles, the IQR and the 1.5*IQR bounds,
    prints a summary and shows a 2x2 diagnostic figure (box plot, histogram,
    Q-Q plot and an outlier scatter).

    Returns:
    --------
    tuple
        (Q1, Q2, Q3, IQR, lower_bound, upper_bound)
    """
    # Generate sample data (fixed seed keeps the printed numbers reproducible)
    np.random.seed(42)
    normal_data = np.random.normal(100, 15, 1000)

    # Calculate quartiles (Q2 is the median)
    Q1, Q2, Q3 = np.percentile(normal_data, [25, 50, 75])
    IQR = Q3 - Q1

    # Calculate bounds
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    print("=== IQR MATHEMATICAL BREAKDOWN ===")
    print(f"Q1 (25th percentile): {Q1:.2f}")
    print(f"Q2 (50th percentile/Median): {Q2:.2f}")
    print(f"Q3 (75th percentile): {Q3:.2f}")
    print(f"IQR (Q3 - Q1): {IQR:.2f}")
    print(f"Lower Bound (Q1 - 1.5×IQR): {lower_bound:.2f}")
    print(f"Upper Bound (Q3 + 1.5×IQR): {upper_bound:.2f}")

    # Count outliers
    outliers = (normal_data < lower_bound) | (normal_data > upper_bound)
    n_outliers = outliers.sum()
    print(f"Outliers detected: {n_outliers} ({(n_outliers/len(normal_data))*100:.2f}%)")

    # Visualize
    plt.figure(figsize=(12, 8))

    # Box plot
    plt.subplot(2, 2, 1)
    box_plot = plt.boxplot(normal_data, patch_artist=True)
    box_plot['boxes'][0].set_facecolor('lightblue')
    # Label the single box through the tick itself: the boxplot `labels`
    # keyword is deprecated in recent matplotlib, while xticks works everywhere.
    plt.xticks([1], ['Data'])
    plt.title('Box Plot Showing IQR Components')
    plt.ylabel('Value')

    # Histogram with quartiles
    plt.subplot(2, 2, 2)
    plt.hist(normal_data, bins=50, alpha=0.7, color='skyblue', edgecolor='black')
    plt.axvline(Q1, color='red', linestyle='--', label=f'Q1: {Q1:.1f}')
    plt.axvline(Q2, color='green', linestyle='--', label=f'Q2: {Q2:.1f}')
    plt.axvline(Q3, color='red', linestyle='--', label=f'Q3: {Q3:.1f}')
    plt.axvline(lower_bound, color='orange', linestyle=':', label=f'Lower Bound: {lower_bound:.1f}')
    plt.axvline(upper_bound, color='orange', linestyle=':', label=f'Upper Bound: {upper_bound:.1f}')
    plt.title('Distribution with IQR Boundaries')
    plt.xlabel('Value')
    plt.ylabel('Frequency')
    plt.legend(fontsize=8)

    # Q-Q plot
    plt.subplot(2, 2, 3)
    stats.probplot(normal_data, dist="norm", plot=plt)
    plt.title('Q-Q Plot (Normal Distribution Check)')

    # Outlier scatter.  Normal points and outliers are drawn as two separate,
    # explicitly labelled series: passing a bare label list to legend() pairs
    # labels with artists purely by order, which previously labelled the whole
    # scatter "Outliers" and the lower bound line "Normal".
    plt.subplot(2, 2, 4)
    positions = np.arange(len(normal_data))
    plt.scatter(positions[~outliers], normal_data[~outliers], alpha=0.6, c='blue', label='Normal')
    plt.scatter(positions[outliers], normal_data[outliers], alpha=0.6, c='red', label='Outliers')
    plt.axhline(lower_bound, color='orange', linestyle=':', label='Bounds')
    plt.axhline(upper_bound, color='orange', linestyle=':', label='_nolegend_')
    plt.title('Data Points with Outliers Highlighted')
    plt.xlabel('Index')
    plt.ylabel('Value')
    plt.legend()

    plt.tight_layout()
    plt.show()

    return Q1, Q2, Q3, IQR, lower_bound, upper_bound
# Demonstrate mathematical foundation.
# NOTE: this call sits at module level, so the summary is printed and the
# figure is shown as soon as the file is executed; the returned quartiles,
# IQR and bounds are kept as module-level names.
Q1, Q2, Q3, IQR, lower_bound, upper_bound = explain_iqr_mathematics()
def when_to_use_iqr():
    """
    Demonstrate scenarios where IQR is preferred over other methods

    Prints each scenario with its description and example, then returns the
    underlying mapping.

    Returns:
    --------
    scenarios : dict
        Maps scenario name -> {'description': str, 'example': str}
    """
    scenarios = {
        'Skewed Data': {
            'description': 'When data is not normally distributed',
            'example': 'Income distributions, house prices, website traffic'
        },
        'Unknown Distribution': {
            # Double quotes avoid escaping the apostrophe (the previous
            # backslash escape was doubled and terminated the string early).
            'description': "When you don't know the underlying distribution",
            'example': 'Exploratory data analysis, new datasets'
        },
        'Robust Detection': {
            'description': 'When you need outlier detection resistant to extreme values',
            'example': 'Financial data, sensor readings with noise'
        },
        'Small Sample Size': {
            'description': 'When sample size is small and Z-score is unreliable',
            'example': 'Clinical trials, A/B test results'
        },
        'Ordinal Data': {
            'description': 'When working with ranked or ordinal data',
            'example': 'Survey responses, ratings, rankings'
        }
    }

    print("=== WHEN TO USE IQR OUTLIER DETECTION ===")
    for scenario, details in scenarios.items():
        # A real newline separates the scenarios (was a literal backslash-n)
        print(f"\n{scenario}:")
        print(f" Description: {details['description']}")
        print(f" Example: {details['example']}")

    return scenarios
# Display usage scenarios (printed immediately when the file runs); the
# returned dict is kept as the module-level name `scenarios`.
scenarios = when_to_use_iqr()
def iqr_limitations():
    """
    Demonstrate limitations of IQR method

    Prints each limitation with its issue and suggested solution, then
    returns the underlying mapping.

    Returns:
    --------
    limitations : dict
        Maps limitation name -> {'issue': str, 'solution': str}
    """
    limitations = {
        'Fixed Threshold': {
            'issue': 'The 1.5 multiplier is arbitrary and may not suit all domains',
            'solution': 'Use domain-specific multipliers (1.0 for stricter, 2.0 for looser)'
        },
        'Symmetric Treatment': {
            'issue': 'Treats upper and lower outliers equally',
            'solution': 'Use asymmetric bounds for highly skewed data'
        },
        'Contextual Ignorance': {
            # Double quotes avoid escaping the apostrophe (the previous
            # backslash escape was doubled and terminated the string early).
            'issue': "Doesn't consider business context or seasonality",
            'solution': 'Combine with domain knowledge and time-series analysis'
        },
        'Multiple Dimensions': {
            'issue': 'Applies to single dimensions only',
            'solution': 'Use multivariate methods for correlated features'
        }
    }

    print("=== IQR LIMITATIONS AND SOLUTIONS ===")
    for limitation, details in limitations.items():
        # A real newline separates the entries (was a literal backslash-n)
        print(f"\n{limitation}:")
        print(f" Issue: {details['issue']}")
        print(f" Solution: {details['solution']}")

    return limitations
# Display limitations (printed immediately when the file runs); the returned
# dict is kept as the module-level name `limitations`.
limitations = iqr_limitations()
def detect_outliers_iqr(data, multiplier=1.5, method='tukey'):
    """
    Detect outliers using IQR method

    Parameters:
    -----------
    data : array-like
        Input data (pandas Series or numpy array). Missing values are ignored
        when computing the quartiles and are never flagged as outliers.
    multiplier : float, default=1.5
        IQR multiplier for outlier bounds
    method : str, default='tukey'
        Method for quartile calculation:
        - 'tukey' and 'inclusive' both use linear interpolation between
          neighbouring order statistics (they give identical results here)
        - 'exclusive' takes the lower neighbour for Q1 and the higher
          neighbour for Q3, which can only widen the IQR

    Returns:
    --------
    outlier_mask : pandas Series of bool
        True for outliers, False for normal values. Has the same length and
        index as the input, so it can be used directly to index the source.
    bounds : dict
        Dictionary containing Q1, Q3, IQR, lower_bound, upper_bound, multiplier
    outlier_info : dict
        Additional information about outliers (counts, percentage of the
        non-missing values, most extreme value on each side or None)

    Raises:
    -------
    ValueError
        If `method` is not one of 'tukey', 'inclusive', 'exclusive'
    """
    # (interpolation for Q1, interpolation for Q3) per supported method
    quartile_rules = {
        'tukey': ('linear', 'linear'),
        'inclusive': ('linear', 'linear'),
        'exclusive': ('lower', 'higher'),
    }
    if method not in quartile_rules:
        # Previously an unknown method fell through and failed later with an
        # unbound-variable error; fail early with a clear message instead.
        raise ValueError(
            f"Unknown method {method!r}; expected one of {sorted(quartile_rules)}"
        )
    q1_rule, q3_rule = quartile_rules[method]

    # Handle missing values: quartiles come from the non-missing values only
    series = pd.Series(data)
    clean_data = series.dropna()

    # Calculate quartiles based on method.  pandas' own `interpolation`
    # option is used instead of np.percentile(interpolation=...), whose
    # keyword is deprecated in NumPy.
    if clean_data.empty:
        Q1 = Q3 = np.nan
    else:
        Q1 = clean_data.quantile(0.25, interpolation=q1_rule)
        Q3 = clean_data.quantile(0.75, interpolation=q3_rule)

    # Calculate IQR and bounds
    IQR = Q3 - Q1
    lower_bound = Q1 - multiplier * IQR
    upper_bound = Q3 + multiplier * IQR

    # Create outlier mask on the full series so it stays aligned with the
    # input; comparisons with NaN are False, so missing values are not flagged.
    lower_mask = series < lower_bound
    upper_mask = series > upper_bound
    outlier_mask = lower_mask | upper_mask

    # Prepare return information
    bounds = {
        'Q1': Q1,
        'Q3': Q3,
        'IQR': IQR,
        'lower_bound': lower_bound,
        'upper_bound': upper_bound,
        'multiplier': multiplier
    }
    n_valid = len(clean_data)
    n_outliers = outlier_mask.sum()
    outlier_info = {
        'n_outliers': n_outliers,
        # Percentage of the non-missing values; 0.0 when there are none
        'outlier_percentage': (n_outliers / n_valid) * 100 if n_valid else 0.0,
        'lower_outliers': lower_mask.sum(),
        'upper_outliers': upper_mask.sum(),
        'extreme_lower': series[lower_mask].min() if lower_mask.any() else None,
        'extreme_upper': series[upper_mask].max() if upper_mask.any() else None
    }
    return outlier_mask, bounds, outlier_info
# Load real dataset for demonstration.
# The feature matrix becomes the DataFrame columns and the regression target
# is appended as the extra column 'target', which the analyses below use.
# NOTE(review): fetch_california_housing presumably downloads and caches the
# data on first use, so this may need network access — confirm.
housing = fetch_california_housing()
df = pd.DataFrame(housing.data, columns=housing.feature_names)
df['target'] = housing.target
print("Dataset loaded. Shape:", df.shape)
# Real newlines separate the sections (these were literal backslash-n before)
print("\nColumns:", df.columns.tolist())
print("\nFirst few rows:")
print(df.head())
def analyze_single_column_iqr(df, column_name, multiplier=1.5):
    """
    Comprehensive analysis of a single column using IQR method

    Prints basic statistics, the IQR bounds and an outlier summary for
    `df[column_name]`, shows a 2x2 figure (box plot, histogram with bounds,
    outlier scatter, before/after histogram) and prints the ten largest
    outliers.

    Parameters:
    -----------
    df : pandas DataFrame
        Input dataframe
    column_name : str
        Name of the numeric column to analyze
    multiplier : float, default=1.5
        IQR multiplier for outlier bounds

    Returns:
    --------
    outlier_mask : pandas Series of bool
        True for outlier rows, aligned with `df.index`
    bounds : dict
        Q1, Q3, IQR, lower_bound, upper_bound, multiplier
    outlier_info : dict
        Outlier counts, percentage and extremes
    outliers_df : pandas DataFrame
        Copy of the outlier rows with an added 'outlier_type' column
        ('Lower' or 'Upper')
    """
    column = df[column_name]

    print(f"=== IQR ANALYSIS: {column_name} ===")
    # Basic statistics
    print(f"Column: {column_name}")
    print(f"Data type: {column.dtype}")
    print(f"Non-null values: {column.count()}")
    print(f"Mean: {column.mean():.4f}")
    print(f"Median: {column.median():.4f}")
    print(f"Std: {column.std():.4f}")

    # Detect outliers
    outlier_mask, bounds, outlier_info = detect_outliers_iqr(column, multiplier)
    # Make sure the mask covers every row of df (rows the detector left out,
    # e.g. missing values, count as non-outliers) so it can index df safely.
    if not outlier_mask.index.equals(df.index):
        outlier_mask = outlier_mask.reindex(df.index, fill_value=False)
    outlier_mask = outlier_mask.astype(bool)

    # Display results (real newlines; these were literal backslash-n before)
    print("\n--- IQR Statistics ---")
    for key, value in bounds.items():
        print(f"{key}: {value:.4f}")
    print("\n--- Outlier Summary ---")
    for key, value in outlier_info.items():
        if value is None:
            continue
        if isinstance(value, float):
            print(f"{key}: {value:.4f}")
        else:
            print(f"{key}: {value}")

    # Create outlier DataFrame
    outliers_df = df[outlier_mask].copy()
    outliers_df['outlier_type'] = np.where(
        outliers_df[column_name] < bounds['lower_bound'], 'Lower', 'Upper'
    )

    # Missing values would break the box plot and histogram binning
    valid_values = column.dropna()

    # Visualize results
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))

    # Box plot
    axes[0, 0].boxplot(valid_values, patch_artist=True)
    axes[0, 0].set_title(f'Box Plot: {column_name}')
    axes[0, 0].set_ylabel('Value')

    # Histogram with bounds
    axes[0, 1].hist(valid_values, bins=50, alpha=0.7, color='skyblue', edgecolor='black')
    axes[0, 1].axvline(bounds['lower_bound'], color='red', linestyle='--', label=f'Lower: {bounds["lower_bound"]:.2f}')
    axes[0, 1].axvline(bounds['upper_bound'], color='red', linestyle='--', label=f'Upper: {bounds["upper_bound"]:.2f}')
    axes[0, 1].axvline(bounds['Q1'], color='green', linestyle=':', label=f'Q1: {bounds["Q1"]:.2f}')
    axes[0, 1].axvline(bounds['Q3'], color='green', linestyle=':', label=f'Q3: {bounds["Q3"]:.2f}')
    axes[0, 1].set_title(f'Distribution: {column_name}')
    axes[0, 1].set_xlabel('Value')
    axes[0, 1].set_ylabel('Frequency')
    axes[0, 1].legend()

    # Scatter plot with outliers.  Each series carries its own label: a bare
    # label list passed to legend() is matched to artists by order, which
    # previously labelled the whole scatter "Normal" and a bound "Outliers".
    positions = np.arange(len(df))
    values = column.to_numpy()
    is_outlier = outlier_mask.to_numpy()
    axes[1, 0].scatter(positions[~is_outlier], values[~is_outlier],
                       c='blue', alpha=0.6, s=1, label='Normal')
    axes[1, 0].scatter(positions[is_outlier], values[is_outlier],
                       c='red', alpha=0.6, s=1, label='Outliers')
    axes[1, 0].axhline(bounds['lower_bound'], color='red', linestyle='--', alpha=0.8, label='Bounds')
    axes[1, 0].axhline(bounds['upper_bound'], color='red', linestyle='--', alpha=0.8)
    axes[1, 0].set_title(f'Outliers Highlighted: {column_name}')
    axes[1, 0].set_xlabel('Index')
    axes[1, 0].set_ylabel('Value')
    axes[1, 0].legend()

    # Before/After comparison
    clean_data = column[~outlier_mask].dropna()
    axes[1, 1].hist(valid_values, bins=30, alpha=0.5, label='Original', color='lightcoral')
    axes[1, 1].hist(clean_data, bins=30, alpha=0.7, label='After Cleaning', color='lightgreen')
    axes[1, 1].set_title('Before vs After Outlier Removal')
    axes[1, 1].set_xlabel('Value')
    axes[1, 1].set_ylabel('Frequency')
    axes[1, 1].legend()

    plt.tight_layout()
    plt.show()

    # Display top outliers
    if not outliers_df.empty:
        print("\n--- Top 10 Outliers ---")
        top_outliers = outliers_df.nlargest(10, column_name)[[column_name, 'outlier_type']]
        print(top_outliers)

    return outlier_mask, bounds, outlier_info, outliers_df
# Analyze median house value (the 'target' column added to df above).
# Runs at module level: the report is printed and the figure shown when the
# file is executed; the four results are kept as module-level names.
outlier_mask, bounds, outlier_info, outliers_df = analyze_single_column_iqr(df, 'target', multiplier=1.5)
def multiplier_sensitivity_analysis(df, column_name, multipliers=(1.0, 1.5, 2.0, 2.5, 3.0)):
    """
    Analyze how different multipliers affect outlier detection

    Runs the IQR detector on `df[column_name]` once per multiplier, prints
    the resulting table and plots outlier count and bounds against the
    multiplier.

    Parameters:
    -----------
    df : pandas DataFrame
        Input dataframe
    column_name : str
        Name of the numeric column to analyze
    multipliers : iterable of float, default=(1.0, 1.5, 2.0, 2.5, 3.0)
        IQR multipliers to compare (an immutable default avoids sharing a
        mutable list between calls)

    Returns:
    --------
    sensitivity_df : pandas DataFrame
        One row per multiplier with the columns multiplier, n_outliers,
        outlier_percentage, lower_outliers, upper_outliers, lower_bound,
        upper_bound
    """
    print(f"=== MULTIPLIER SENSITIVITY ANALYSIS: {column_name} ===")

    column = df[column_name]  # looked up once, reused for every multiplier
    results = []
    for mult in multipliers:
        _, bounds, outlier_info = detect_outliers_iqr(column, multiplier=mult)
        results.append({
            'multiplier': mult,
            'n_outliers': outlier_info['n_outliers'],
            'outlier_percentage': outlier_info['outlier_percentage'],
            'lower_outliers': outlier_info['lower_outliers'],
            'upper_outliers': outlier_info['upper_outliers'],
            'lower_bound': bounds['lower_bound'],
            'upper_bound': bounds['upper_bound']
        })

    # Explicit columns keep the frame well-formed (and the plotting below
    # working) even when no multipliers were supplied.
    sensitivity_df = pd.DataFrame(results, columns=[
        'multiplier', 'n_outliers', 'outlier_percentage',
        'lower_outliers', 'upper_outliers', 'lower_bound', 'upper_bound'
    ])
    print(sensitivity_df.round(3))

    # Visualize sensitivity
    fig, axes = plt.subplots(1, 2, figsize=(15, 6))

    # Outlier count vs multiplier
    axes[0].plot(sensitivity_df['multiplier'], sensitivity_df['n_outliers'], 'o-', linewidth=2, markersize=8)
    axes[0].set_xlabel('IQR Multiplier')
    axes[0].set_ylabel('Number of Outliers')
    axes[0].set_title('Outlier Count vs Multiplier')
    axes[0].grid(True, alpha=0.3)

    # Bounds vs multiplier
    axes[1].plot(sensitivity_df['multiplier'], sensitivity_df['lower_bound'], 'o-', label='Lower Bound', linewidth=2)
    axes[1].plot(sensitivity_df['multiplier'], sensitivity_df['upper_bound'], 'o-', label='Upper Bound', linewidth=2)
    axes[1].set_xlabel('IQR Multiplier')
    axes[1].set_ylabel('Bound Value')
    axes[1].set_title('Outlier Bounds vs Multiplier')
    axes[1].legend()
    axes[1].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    return sensitivity_df
# Perform sensitivity analysis on the 'target' column with the default
# multipliers; runs at module level and keeps the resulting table.
sensitivity_results = multiplier_sensitivity_analysis(df, 'target')
def detect_outliers_iqr_multiple(df, columns=None, multiplier=1.5, method='tukey',
                                 strategy='any', min_outlier_columns=1):
    """
    Detect outliers across multiple columns using IQR method

    Parameters:
    -----------
    df : pandas DataFrame
        Input dataframe
    columns : list, optional
        Columns to analyze. If None, all numeric columns
    multiplier : float, default=1.5
        IQR multiplier for outlier bounds
    method : str, default='tukey'
        Method for quartile calculation (passed to detect_outliers_iqr)
    strategy : str, default='any'
        Strategy for multi-column outliers:
        - 'any': row is an outlier in at least one column
        - 'all': row is an outlier in every analyzed column
        - 'majority': row is an outlier in more than half of the columns
        - 'min_count': row is an outlier in >= min_outlier_columns columns
    min_outlier_columns : int, default=1
        Minimum columns with outliers to flag row (for 'min_count' strategy)

    Returns:
    --------
    results : dict
        'column_results'  : per-column bounds, outlier_info and mask
        'column_summary'  : DataFrame with one summary row per column
        'outlier_details' : DataFrame with one row per flagged row
        'combined_mask'   : boolean Series aligned with df.index
        'overall_stats'   : dict with row counts, percentage and settings
        'clean_data'      : copy of df without the flagged rows

    Raises:
    -------
    ValueError
        If `strategy` is not one of the supported names
    """
    valid_strategies = ('any', 'all', 'majority', 'min_count')
    if strategy not in valid_strategies:
        # An unknown strategy used to fall through and silently flag nothing
        raise ValueError(
            f"Unknown strategy {strategy!r}; expected one of {list(valid_strategies)}"
        )

    if columns is None:
        columns = df.select_dtypes(include=[np.number]).columns.tolist()
    column_names = list(columns)
    n_columns = len(column_names)

    # Initialize results storage
    column_results = {}
    outlier_masks = {}

    # Process each column
    for col in column_names:
        outlier_mask, bounds, outlier_info = detect_outliers_iqr(df[col], multiplier, method)
        # Align every mask with df.index so the masks can be combined
        # row-by-row; rows the detector left out count as non-outliers.
        if not outlier_mask.index.equals(df.index):
            outlier_mask = outlier_mask.reindex(df.index, fill_value=False)
        outlier_mask = outlier_mask.astype(bool)
        column_results[col] = {
            'bounds': bounds,
            'outlier_info': outlier_info,
            'outlier_mask': outlier_mask
        }
        outlier_masks[col] = outlier_mask

    # rows x columns boolean matrix; one vectorized count replaces the
    # per-strategy accumulation loops
    if n_columns:
        mask_matrix = np.column_stack([outlier_masks[col].to_numpy() for col in column_names])
    else:
        mask_matrix = np.zeros((len(df), 0), dtype=bool)
    outlier_counts = pd.Series(mask_matrix.sum(axis=1), index=df.index)

    # Combine outlier masks based on strategy
    if strategy == 'any':
        combined_mask = outlier_counts > 0
    elif strategy == 'all':
        # With no columns analyzed nothing is flagged (instead of every row)
        combined_mask = (outlier_counts == n_columns) & (n_columns > 0)
    elif strategy == 'majority':
        combined_mask = outlier_counts > (n_columns / 2)
    else:  # 'min_count'
        combined_mask = outlier_counts >= min_outlier_columns

    # Calculate overall statistics
    total_rows = len(df)
    outlier_rows = combined_mask.sum()
    overall_stats = {
        'total_rows': total_rows,
        'outlier_rows': outlier_rows,
        # 0.0 for an empty frame instead of dividing by zero
        'outlier_percentage': (outlier_rows / total_rows) * 100 if total_rows else 0.0,
        'strategy': strategy,
        'columns_analyzed': columns,
        'multiplier': multiplier
    }

    # Create outlier summary by column
    column_summary = []
    for col in column_names:
        info = column_results[col]['outlier_info']
        col_bounds = column_results[col]['bounds']
        column_summary.append({
            'column': col,
            'n_outliers': info['n_outliers'],
            'outlier_percentage': info['outlier_percentage'],
            'lower_outliers': info['lower_outliers'],
            'upper_outliers': info['upper_outliers'],
            'Q1': col_bounds['Q1'],
            'Q3': col_bounds['Q3'],
            'IQR': col_bounds['IQR'],
            'lower_bound': col_bounds['lower_bound'],
            'upper_bound': col_bounds['upper_bound']
        })
    column_summary_df = pd.DataFrame(column_summary)

    # Create detailed outlier information.  Rows are addressed by position
    # in plain numpy arrays: this avoids one pandas label lookup per cell
    # and does not depend on the index labels being unique.
    combined_flags = combined_mask.to_numpy()
    flagged_positions = np.flatnonzero(combined_flags)
    flagged_labels = df.index[combined_flags]
    column_values = {col: df[col].to_numpy() for col in column_names}
    lower_bounds = {col: column_results[col]['bounds']['lower_bound'] for col in column_names}

    outlier_details = []
    for pos, idx in zip(flagged_positions, flagged_labels):
        row_info = {'index': idx}
        outlier_columns = []
        for j, col in enumerate(column_names):
            if mask_matrix[pos, j]:
                value = column_values[col][pos]
                outlier_columns.append(col)
                row_info[f'{col}_value'] = value
                row_info[f'{col}_type'] = 'Lower' if value < lower_bounds[col] else 'Upper'
        row_info['outlier_columns'] = outlier_columns
        row_info['n_outlier_columns'] = len(outlier_columns)
        outlier_details.append(row_info)
    outlier_details_df = pd.DataFrame(outlier_details)

    return {
        'column_results': column_results,
        'column_summary': column_summary_df,
        'outlier_details': outlier_details_df,
        'combined_mask': combined_mask,
        'overall_stats': overall_stats,
        'clean_data': df[~combined_mask].copy()
    }
# Apply multi-column analysis: a row is flagged when it is an outlier in ANY
# numeric column (columns=None selects all numeric columns).
multi_results = detect_outliers_iqr_multiple(df, multiplier=1.5, strategy='any')
print("=== MULTI-COLUMN IQR ANALYSIS ===")
print(f"Strategy: {multi_results['overall_stats']['strategy']}")
print(f"Total rows: {multi_results['overall_stats']['total_rows']}")
print(f"Outlier rows: {multi_results['overall_stats']['outlier_rows']}")
print(f"Outlier percentage: {multi_results['overall_stats']['outlier_percentage']:.2f}%")
# Real newlines separate the sections (these were literal backslash-n before)
print("\n=== COLUMN SUMMARY ===")
print(multi_results['column_summary'].round(3))
print("\n=== TOP 10 OUTLIER ROWS ===")
# The details frame has no columns at all when nothing was flagged, so guard
# before selecting from it.
if not multi_results['outlier_details'].empty:
    # nlargest(10, ...) already returns at most ten rows
    top_outliers = multi_results['outlier_details'].nlargest(10, 'n_outlier_columns')
    print(top_outliers[['index', 'outlier_columns', 'n_outlier_columns']])
def compare_outlier_strategies(df, columns=None, multiplier=1.5):
    """
    Compare different strategies for multi-column outlier detection

    Runs detect_outliers_iqr_multiple with the 'any', 'all' and 'majority'
    strategies and with 'min_count' for thresholds 1, 2 and 3, prints a
    comparison table and draws two bar charts (outlier count and data
    retention per strategy).

    Parameters:
    -----------
    df : pandas DataFrame
        Input dataframe
    columns : list, optional
        Columns to analyze. If None, all numeric columns
    multiplier : float, default=1.5
        IQR multiplier for outlier bounds

    Returns:
    --------
    comparison_df : pandas DataFrame
        One row per strategy variant with the columns strategy,
        outlier_rows, outlier_percentage, remaining_rows, data_retention
    """
    if columns is None:
        columns = df.select_dtypes(include=[np.number]).columns.tolist()

    print("=== STRATEGY COMPARISON ===")

    # (result label, extra keyword arguments) in evaluation order
    runs = [(name, {'strategy': name}) for name in ('any', 'all', 'majority')]
    runs.extend(
        (f"min_count_{threshold}", {'strategy': 'min_count', 'min_outlier_columns': threshold})
        for threshold in (1, 2, 3)
    )

    strategy_results = {}
    for label, extra_kwargs in runs:
        outcome = detect_outliers_iqr_multiple(df, columns, multiplier, **extra_kwargs)
        strategy_results[label] = outcome['overall_stats']

    # Create comparison DataFrame
    comparison_rows = []
    for label, summary in strategy_results.items():
        total = summary['total_rows']
        flagged = summary['outlier_rows']
        kept = total - flagged
        comparison_rows.append({
            'strategy': label,
            'outlier_rows': flagged,
            'outlier_percentage': summary['outlier_percentage'],
            'remaining_rows': kept,
            'data_retention': (kept / total) * 100
        })
    comparison_df = pd.DataFrame(comparison_rows)
    print(comparison_df.round(2))

    # Visualize comparison: one bar chart per metric, left to right
    fig, axes = plt.subplots(1, 2, figsize=(15, 6))
    panels = (
        ('outlier_rows', 'lightcoral', 'Number of Outliers', 'Outliers Detected by Strategy'),
        ('data_retention', 'lightgreen', 'Data Retention (%)', 'Data Retention by Strategy'),
    )
    for ax, (metric, bar_color, y_label, panel_title) in zip(axes, panels):
        ax.bar(comparison_df['strategy'], comparison_df[metric], color=bar_color, alpha=0.7)
        ax.set_xlabel('Strategy')
        ax.set_ylabel(y_label)
        ax.set_title(panel_title)
        ax.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

    return comparison_df
# Compare strategies over all numeric columns of df with the default
# multiplier; runs at module level and keeps the comparison table.
strategy_comparison = compare_outlier_strategies(df)
def adaptive_iqr_detection(df, column_configs=None, global_multiplier=1.5):
"""
Adaptive IQR detection with column-specific configurations
Parameters:
-----------
df : pandas DataFrame
Input dataframe
column_configs : dict, optional
Column-specific configurations
Example: {'column_name': {'multiplier': 2.0, 'bounds': 'asymmetric'}}
global_multiplier : float, default=1.5
Default multiplier for columns not in column_configs
"""
if column_configs is None:
column_configs = {}
numeric_columns = df.select_dtypes(include=[np.number]).columns
results = {}
for col in numeric_columns:
config = column_configs.get(col, {})
multiplier = config.get('multiplier', global_multiplier)
bounds_type = config.get('bounds', 'symmetric')
# Calculate basic IQR components
Q1 = df[col].quantile(0.25)
Q3 = df[col].quantile(0.75)
IQR = Q3 - Q1
# Apply