Text
Text 1
IQR (Interquartile Range) Outlier Detection: Complete Data Science Guide
Table of Contents
- Mathematical Foundation
- Key Assumptions & When to Use
- Implementation: Single Column
- Implementation: Multiple Columns
- Advanced Techniques
- Comparison with Other Methods
- Best Practices & Decision Framework
- Real-World Applications
Mathematical Foundation
What is IQR?
The Interquartile Range (IQR) is a robust measure of statistical dispersion that represents the range between the 25th and 75th percentiles of a dataset.
Mathematical Definition:
- Q1 (First Quartile): 25th percentile
- Q3 (Third Quartile): 75th percentile
- IQR = Q3 - Q1
Outlier Detection Formula
Standard IQR Method:
- Lower Bound: Q1 - 1.5 × IQR
- Upper Bound: Q3 + 1.5 × IQR
- Outliers: Any value < Lower Bound OR > Upper Bound
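Why 1.5? The 1.5 multiplier comes from John Tukey’s work on exploratory data analysis. For normally distributed data the resulting bounds sit roughly 2.7 standard deviations from the mean, so they contain approximately 99.3% of the data and flag only about 0.7% of points. Because the bounds are built from quartiles rather than the mean and standard deviation, they remain usable when the actual distribution is not normal.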
Mathematical Properties
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.datasets import fetch_california_housing, load_breast_cancer, load_wine
import warnings
warnings.filterwarnings('ignore')
def explain_iqr_mathematics():
    """
    Demonstrate the mathematical foundation of IQR

    Draws 1000 samples from a normal distribution (mean 100, std 15) with a
    fixed seed, prints the quartiles, the IQR and the 1.5 x IQR bounds, and
    shows a 2x2 figure (box plot, histogram, Q-Q plot, outlier scatter).

    Returns:
    --------
    tuple
        (Q1, Q2, Q3, IQR, lower_bound, upper_bound)
    """
    # Generate sample data (fixed seed keeps the printed numbers reproducible)
    np.random.seed(42)
    normal_data = np.random.normal(100, 15, 1000)

    # Calculate quartiles
    Q1 = np.percentile(normal_data, 25)
    Q2 = np.percentile(normal_data, 50)  # Median
    Q3 = np.percentile(normal_data, 75)
    IQR = Q3 - Q1

    # Calculate bounds
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    print("=== IQR MATHEMATICAL BREAKDOWN ===")
    print(f"Q1 (25th percentile): {Q1:.2f}")
    print(f"Q2 (50th percentile/Median): {Q2:.2f}")
    print(f"Q3 (75th percentile): {Q3:.2f}")
    print(f"IQR (Q3 - Q1): {IQR:.2f}")
    print(f"Lower Bound (Q1 - 1.5×IQR): {lower_bound:.2f}")
    print(f"Upper Bound (Q3 + 1.5×IQR): {upper_bound:.2f}")

    # Count outliers
    outliers = (normal_data < lower_bound) | (normal_data > upper_bound)
    print(f"Outliers detected: {outliers.sum()} ({(outliers.sum()/len(normal_data))*100:.2f}%)")

    # Visualize
    plt.figure(figsize=(12, 8))

    # Box plot
    plt.subplot(2, 2, 1)
    box_plot = plt.boxplot(normal_data, patch_artist=True)
    # Set the tick label explicitly instead of via boxplot's `labels=` keyword,
    # which newer Matplotlib releases deprecate.
    plt.xticks([1], ['Data'])
    box_plot['boxes'][0].set_facecolor('lightblue')
    plt.title('Box Plot Showing IQR Components')
    plt.ylabel('Value')

    # Histogram with quartiles
    plt.subplot(2, 2, 2)
    plt.hist(normal_data, bins=50, alpha=0.7, color='skyblue', edgecolor='black')
    plt.axvline(Q1, color='red', linestyle='--', label=f'Q1: {Q1:.1f}')
    plt.axvline(Q2, color='green', linestyle='--', label=f'Q2: {Q2:.1f}')
    plt.axvline(Q3, color='red', linestyle='--', label=f'Q3: {Q3:.1f}')
    plt.axvline(lower_bound, color='orange', linestyle=':', label=f'Lower Bound: {lower_bound:.1f}')
    plt.axvline(upper_bound, color='orange', linestyle=':', label=f'Upper Bound: {upper_bound:.1f}')
    plt.title('Distribution with IQR Boundaries')
    plt.xlabel('Value')
    plt.ylabel('Frequency')
    plt.legend(fontsize=8)

    # Q-Q plot
    plt.subplot(2, 2, 3)
    stats.probplot(normal_data, dist="norm", plot=plt)
    plt.title('Q-Q Plot (Normal Distribution Check)')

    # Outlier scatter: draw normal points and outliers as separate artists so
    # each legend entry is attached to the points it actually describes.
    plt.subplot(2, 2, 4)
    positions = np.arange(len(normal_data))
    plt.scatter(positions[~outliers], normal_data[~outliers], alpha=0.6, c='blue', label='Normal')
    plt.scatter(positions[outliers], normal_data[outliers], alpha=0.6, c='red', label='Outliers')
    plt.axhline(lower_bound, color='orange', linestyle=':', label='Bounds')
    plt.axhline(upper_bound, color='orange', linestyle=':', label='_nolegend_')
    plt.title('Data Points with Outliers Highlighted')
    plt.xlabel('Index')
    plt.ylabel('Value')
    plt.legend()

    plt.tight_layout()
    plt.show()

    return Q1, Q2, Q3, IQR, lower_bound, upper_bound
# Demonstrate mathematical foundation: run the demo and keep the quartiles,
# IQR and bound values it returns as module-level names
Q1, Q2, Q3, IQR, lower_bound, upper_bound = explain_iqr_mathematics()
Key Assumptions & When to Use
✅ Assumptions of IQR Method
- Distribution Independence:
- IQR works with ANY distribution shape (normal, skewed, multimodal)
- No assumption about underlying distribution
- Ordinal Data:
- Data must be at least ordinal (can be ranked)
- Works with continuous, discrete, and ordinal variables
- Sample Size:
- More robust with larger samples (n > 30 recommended)
- Can work with smaller samples but less reliable
- Outlier Definition:
- Assumes outliers are values unusually far from the central 50% of data
- Based on positional (rank-based) statistics rather than on a probabilistic model
🎯 When to Use IQR
def when_to_use_iqr():
    """
    Demonstrate scenarios where IQR is preferred over other methods

    Prints each scenario with its description and example.

    Returns:
    --------
    scenarios : dict
        Maps scenario name to a dict with 'description' and 'example' keys
    """
    scenarios = {
        'Skewed Data': {
            'description': 'When data is not normally distributed',
            'example': 'Income distributions, house prices, website traffic',
        },
        'Unknown Distribution': {
            # Double quotes avoid having to escape the apostrophe
            'description': "When you don't know the underlying distribution",
            'example': 'Exploratory data analysis, new datasets',
        },
        'Robust Detection': {
            'description': 'When you need outlier detection resistant to extreme values',
            'example': 'Financial data, sensor readings with noise',
        },
        'Small Sample Size': {
            'description': 'When sample size is small and Z-score is unreliable',
            'example': 'Clinical trials, A/B test results',
        },
        'Ordinal Data': {
            'description': 'When working with ranked or ordinal data',
            'example': 'Survey responses, ratings, rankings',
        },
    }

    print("=== WHEN TO USE IQR OUTLIER DETECTION ===")
    for scenario, details in scenarios.items():
        # Blank line before each scenario heading
        print(f"\n{scenario}:")
        print(f" Description: {details['description']}")
        print(f" Example: {details['example']}")

    return scenarios
# Display usage scenarios (the call prints them) and keep the returned dict
scenarios = when_to_use_iqr()
⚠️ Limitations and Considerations
def iqr_limitations():
    """
    Demonstrate limitations of IQR method

    Prints each limitation with its issue and suggested solution.

    Returns:
    --------
    limitations : dict
        Maps limitation name to a dict with 'issue' and 'solution' keys
    """
    limitations = {
        'Fixed Threshold': {
            'issue': 'The 1.5 multiplier is arbitrary and may not suit all domains',
            'solution': 'Use domain-specific multipliers (1.0 for stricter, 2.0 for looser)',
        },
        'Symmetric Treatment': {
            'issue': 'Treats upper and lower outliers equally',
            'solution': 'Use asymmetric bounds for highly skewed data',
        },
        'Contextual Ignorance': {
            # Double quotes avoid having to escape the apostrophe
            'issue': "Doesn't consider business context or seasonality",
            'solution': 'Combine with domain knowledge and time-series analysis',
        },
        'Multiple Dimensions': {
            'issue': 'Applies to single dimensions only',
            'solution': 'Use multivariate methods for correlated features',
        },
    }

    print("=== IQR LIMITATIONS AND SOLUTIONS ===")
    for limitation, details in limitations.items():
        # Blank line before each limitation heading
        print(f"\n{limitation}:")
        print(f" Issue: {details['issue']}")
        print(f" Solution: {details['solution']}")

    return limitations
# Display limitations (the call prints them) and keep the returned dict
limitations = iqr_limitations()
Implementation: Single Column
Core IQR Function
def detect_outliers_iqr(data, multiplier=1.5, method='tukey'):
    """
    Detect outliers using IQR method

    Parameters:
    -----------
    data : array-like
        Input data (pandas Series or numpy array)
    multiplier : float, default=1.5
        IQR multiplier for outlier bounds
    method : str, default='tukey'
        Method for quartile calculation ('tukey', 'inclusive', 'exclusive').
        'tukey' and 'inclusive' both interpolate linearly between data points;
        'exclusive' takes the nearest data point below Q1 and above Q3, which
        widens the IQR.

    Returns:
    --------
    outlier_mask : boolean Series
        True for outliers, False for normal values. It has one entry per
        input element and keeps the input's index; missing values are never
        flagged.
    bounds : dict
        Dictionary containing Q1, Q3, IQR, lower_bound, upper_bound, multiplier
    outlier_info : dict
        Additional information about outliers (counts and percentage are
        relative to the non-missing values)

    Raises:
    -------
    ValueError
        If `method` is not one of the supported names
    """
    series = pd.Series(data)

    # Handle missing values: quartiles are computed on non-null values only
    clean_data = series.dropna()

    # Calculate quartiles based on method
    if method == 'tukey':
        Q1 = clean_data.quantile(0.25)
        Q3 = clean_data.quantile(0.75)
    elif method == 'inclusive':
        Q1 = np.percentile(clean_data, 25)
        Q3 = np.percentile(clean_data, 75)
    elif method == 'exclusive':
        # Series.quantile is used here because NumPy deprecated the
        # `interpolation=` keyword of np.percentile in favour of `method=`.
        Q1 = clean_data.quantile(0.25, interpolation='lower')
        Q3 = clean_data.quantile(0.75, interpolation='higher')
    else:
        raise ValueError(
            f"Unknown method {method!r}; expected 'tukey', 'inclusive' or 'exclusive'"
        )

    # Calculate IQR and bounds
    IQR = Q3 - Q1
    lower_bound = Q1 - multiplier * IQR
    upper_bound = Q3 + multiplier * IQR

    # Create outlier mask on the full input so it stays aligned with the
    # caller's data; comparisons against NaN are False, so missing values are
    # not flagged.
    outlier_mask = (series < lower_bound) | (series > upper_bound)

    below = clean_data < lower_bound
    above = clean_data > upper_bound
    n_outliers = outlier_mask.sum()
    n_valid = len(clean_data)

    # Prepare return information
    bounds = {
        'Q1': Q1,
        'Q3': Q3,
        'IQR': IQR,
        'lower_bound': lower_bound,
        'upper_bound': upper_bound,
        'multiplier': multiplier,
    }
    outlier_info = {
        'n_outliers': n_outliers,
        # Guard against division by zero when there are no valid values
        'outlier_percentage': (n_outliers / n_valid) * 100 if n_valid else 0.0,
        'lower_outliers': below.sum(),
        'upper_outliers': above.sum(),
        'extreme_lower': clean_data[below].min() if below.any() else None,
        'extreme_upper': clean_data[above].max() if above.any() else None,
    }

    return outlier_mask, bounds, outlier_info
# Load real dataset for demonstration: features go into a DataFrame and the
# regression target is appended as the 'target' column
housing = fetch_california_housing()
df = pd.DataFrame(housing.data, columns=housing.feature_names)
df['target'] = housing.target
print("Dataset loaded. Shape:", df.shape)
# "\n" is a real newline here, so each section starts after a blank line
print("\nColumns:", df.columns.tolist())
print("\nFirst few rows:")
print(df.head())
Single Column Analysis Example
def analyze_single_column_iqr(df, column_name, multiplier=1.5):
    """
    Comprehensive analysis of a single column using IQR method

    Prints summary statistics and the IQR bounds, shows a 2x2 figure (box
    plot, histogram with bounds, outlier scatter, before/after histogram) and
    prints the ten largest outlier values.

    Parameters:
    -----------
    df : pandas DataFrame
        Input dataframe
    column_name : str
        Name of the column to analyze
    multiplier : float, default=1.5
        IQR multiplier for outlier bounds

    Returns:
    --------
    outlier_mask : boolean Series
        True for outlier rows, aligned to df.index
    bounds : dict
        Q1, Q3, IQR, lower_bound, upper_bound, multiplier
    outlier_info : dict
        Outlier counts, percentage and extreme values
    outliers_df : pandas DataFrame
        The outlier rows of df plus an 'outlier_type' column ('Lower'/'Upper')
    """
    column = df[column_name]
    print(f"=== IQR ANALYSIS: {column_name} ===")

    # Basic statistics
    print(f"Column: {column_name}")
    print(f"Data type: {column.dtype}")
    print(f"Non-null values: {column.count()}")
    print(f"Mean: {column.mean():.4f}")
    print(f"Median: {column.median():.4f}")
    print(f"Std: {column.std():.4f}")

    # Detect outliers
    outlier_mask, bounds, outlier_info = detect_outliers_iqr(column, multiplier)
    # The mask may cover only the non-null rows; align it to df so boolean
    # indexing below is well defined (rows without a value are not outliers).
    if not outlier_mask.index.equals(df.index):
        outlier_mask = outlier_mask.reindex(df.index, fill_value=False)

    # Display results
    print("\n--- IQR Statistics ---")
    for key, value in bounds.items():
        print(f"{key}: {value:.4f}")

    print("\n--- Outlier Summary ---")
    for key, value in outlier_info.items():
        if value is not None:
            if isinstance(value, float):
                print(f"{key}: {value:.4f}")
            else:
                print(f"{key}: {value}")

    # Create outlier DataFrame
    outliers_df = df[outlier_mask].copy()
    outliers_df['outlier_type'] = np.where(
        outliers_df[column_name] < bounds['lower_bound'], 'Lower', 'Upper'
    )

    # Histograms and box plots cannot handle NaN, so plot non-null values only
    valid_values = column.dropna()

    # Visualize results
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))

    # Box plot
    axes[0, 0].boxplot(valid_values, patch_artist=True)
    axes[0, 0].set_title(f'Box Plot: {column_name}')
    axes[0, 0].set_ylabel('Value')

    # Histogram with bounds
    axes[0, 1].hist(valid_values, bins=50, alpha=0.7, color='skyblue', edgecolor='black')
    axes[0, 1].axvline(bounds['lower_bound'], color='red', linestyle='--', label=f'Lower: {bounds["lower_bound"]:.2f}')
    axes[0, 1].axvline(bounds['upper_bound'], color='red', linestyle='--', label=f'Upper: {bounds["upper_bound"]:.2f}')
    axes[0, 1].axvline(bounds['Q1'], color='green', linestyle=':', label=f'Q1: {bounds["Q1"]:.2f}')
    axes[0, 1].axvline(bounds['Q3'], color='green', linestyle=':', label=f'Q3: {bounds["Q3"]:.2f}')
    axes[0, 1].set_title(f'Distribution: {column_name}')
    axes[0, 1].set_xlabel('Value')
    axes[0, 1].set_ylabel('Frequency')
    axes[0, 1].legend()

    # Scatter plot with outliers: separate artists for normal points and
    # outliers so each legend entry matches the points it describes
    positions = np.arange(len(df))
    values = column.to_numpy()
    is_outlier = outlier_mask.to_numpy()
    axes[1, 0].scatter(positions[~is_outlier], values[~is_outlier],
                       c='blue', alpha=0.6, s=1, label='Normal')
    axes[1, 0].scatter(positions[is_outlier], values[is_outlier],
                       c='red', alpha=0.6, s=1, label='Outliers')
    axes[1, 0].axhline(bounds['lower_bound'], color='red', linestyle='--', alpha=0.8, label='Bounds')
    axes[1, 0].axhline(bounds['upper_bound'], color='red', linestyle='--', alpha=0.8)
    axes[1, 0].set_title(f'Outliers Highlighted: {column_name}')
    axes[1, 0].set_xlabel('Index')
    axes[1, 0].set_ylabel('Value')
    axes[1, 0].legend()

    # Before/After comparison
    clean_data = column[~outlier_mask].dropna()
    axes[1, 1].hist(valid_values, bins=30, alpha=0.5, label='Original', color='lightcoral')
    axes[1, 1].hist(clean_data, bins=30, alpha=0.7, label='After Cleaning', color='lightgreen')
    axes[1, 1].set_title('Before vs After Outlier Removal')
    axes[1, 1].set_xlabel('Value')
    axes[1, 1].set_ylabel('Frequency')
    axes[1, 1].legend()

    plt.tight_layout()
    plt.show()

    # Display top outliers (the ten largest values among the flagged rows)
    if not outliers_df.empty:
        print("\n--- Top 10 Outliers ---")
        top_outliers = outliers_df.nlargest(10, column_name)[[column_name, 'outlier_type']]
        print(top_outliers)

    return outlier_mask, bounds, outlier_info, outliers_df
# Analyze median house value (the 'target' column) with the standard 1.5
# multiplier and keep all four results returned by the analysis
outlier_mask, bounds, outlier_info, outliers_df = analyze_single_column_iqr(df, 'target', multiplier=1.5)
Multiplier Sensitivity Analysis
def multiplier_sensitivity_analysis(df, column_name, multipliers=(1.0, 1.5, 2.0, 2.5, 3.0)):
    """
    Analyze how different multipliers affect outlier detection

    Runs IQR detection on one column once per multiplier, prints the
    resulting table and plots outlier count and bounds against the multiplier.

    Parameters:
    -----------
    df : pandas DataFrame
        Input dataframe
    column_name : str
        Name of the column to analyze
    multipliers : iterable of float
        IQR multipliers to try (an immutable tuple is used as the default so
        the default can never be mutated between calls)

    Returns:
    --------
    sensitivity_df : pandas DataFrame
        One row per multiplier with outlier counts, percentage and bounds
    """
    print(f"=== MULTIPLIER SENSITIVITY ANALYSIS: {column_name} ===")

    results = []
    for mult in multipliers:
        _, bounds, outlier_info = detect_outliers_iqr(df[column_name], multiplier=mult)
        results.append({
            'multiplier': mult,
            'n_outliers': outlier_info['n_outliers'],
            'outlier_percentage': outlier_info['outlier_percentage'],
            'lower_outliers': outlier_info['lower_outliers'],
            'upper_outliers': outlier_info['upper_outliers'],
            'lower_bound': bounds['lower_bound'],
            'upper_bound': bounds['upper_bound'],
        })

    sensitivity_df = pd.DataFrame(results)
    print(sensitivity_df.round(3))

    # Visualize sensitivity
    fig, axes = plt.subplots(1, 2, figsize=(15, 6))

    # Outlier count vs multiplier
    axes[0].plot(sensitivity_df['multiplier'], sensitivity_df['n_outliers'], 'o-', linewidth=2, markersize=8)
    axes[0].set_xlabel('IQR Multiplier')
    axes[0].set_ylabel('Number of Outliers')
    axes[0].set_title('Outlier Count vs Multiplier')
    axes[0].grid(True, alpha=0.3)

    # Bounds vs multiplier
    axes[1].plot(sensitivity_df['multiplier'], sensitivity_df['lower_bound'], 'o-', label='Lower Bound', linewidth=2)
    axes[1].plot(sensitivity_df['multiplier'], sensitivity_df['upper_bound'], 'o-', label='Upper Bound', linewidth=2)
    axes[1].set_xlabel('IQR Multiplier')
    axes[1].set_ylabel('Bound Value')
    axes[1].set_title('Outlier Bounds vs Multiplier')
    axes[1].legend()
    axes[1].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    return sensitivity_df
# Perform sensitivity analysis on the 'target' column using the function's
# default set of multipliers
sensitivity_results = multiplier_sensitivity_analysis(df, 'target')
Implementation: Multiple Columns
Comprehensive Multi-Column Analysis
def detect_outliers_iqr_multiple(df, columns=None, multiplier=1.5, method='tukey',
                                 strategy='any', min_outlier_columns=1):
    """
    Detect outliers across multiple columns using IQR method

    Parameters:
    -----------
    df : pandas DataFrame
        Input dataframe
    columns : list, optional
        Columns to analyze. If None, all numeric columns
    multiplier : float, default=1.5
        IQR multiplier for outlier bounds
    method : str, default='tukey'
        Method for quartile calculation
    strategy : str, default='any'
        Strategy for multi-column outliers ('any', 'all', 'majority', 'min_count')
    min_outlier_columns : int, default=1
        Minimum columns with outliers to flag row (for 'min_count' strategy)

    Returns:
    --------
    results : dict
        Comprehensive results for each column and overall, with keys
        'column_results', 'column_summary', 'outlier_details',
        'combined_mask', 'overall_stats' and 'clean_data'

    Raises:
    -------
    ValueError
        If `strategy` is not one of the supported names
    """
    valid_strategies = ('any', 'all', 'majority', 'min_count')
    if strategy not in valid_strategies:
        # An unknown strategy used to fall through and silently flag nothing
        raise ValueError(
            f"Unknown strategy {strategy!r}; expected one of {valid_strategies}"
        )

    if columns is None:
        columns = df.select_dtypes(include=[np.number]).columns.tolist()

    # Initialize results storage
    column_results = {}
    outlier_masks = {}

    # Process each column
    for col in columns:
        outlier_mask, bounds, outlier_info = detect_outliers_iqr(df[col], multiplier, method)
        # The per-column mask may cover only the non-null rows; align it to
        # df so masks can be combined row by row (missing values count as
        # "not an outlier").
        if not outlier_mask.index.equals(df.index):
            outlier_mask = outlier_mask.reindex(df.index, fill_value=False)
        column_results[col] = {
            'bounds': bounds,
            'outlier_info': outlier_info,
            'outlier_mask': outlier_mask,
        }
        outlier_masks[col] = outlier_mask

    # Count, per row, in how many columns the row is an outlier. Plain numpy
    # arrays are used so the combination is positional and does not depend on
    # index labels.
    mask_arrays = {col: outlier_masks[col].to_numpy(dtype=bool) for col in columns}
    n_columns = len(columns)
    outlier_counts = np.zeros(len(df), dtype=int)
    for col in columns:
        outlier_counts += mask_arrays[col]

    # Combine outlier masks based on strategy
    if strategy == 'any':
        # Row is outlier if it's outlier in ANY column
        flagged = outlier_counts >= 1
    elif strategy == 'all':
        # Row is outlier if it's outlier in ALL columns; with no columns to
        # check, nothing is flagged
        flagged = (outlier_counts == n_columns) & (n_columns > 0)
    elif strategy == 'majority':
        # Row is outlier if it's outlier in majority of columns
        flagged = outlier_counts > (n_columns / 2)
    else:  # 'min_count'
        # Row is outlier if it's outlier in at least min_outlier_columns
        flagged = outlier_counts >= min_outlier_columns
    combined_mask = pd.Series(flagged, index=df.index)

    # Calculate overall statistics
    overall_stats = {
        'total_rows': len(df),
        'outlier_rows': combined_mask.sum(),
        'outlier_percentage': (combined_mask.sum() / len(df)) * 100,
        'strategy': strategy,
        'columns_analyzed': columns,
        'multiplier': multiplier,
    }

    # Create outlier summary by column
    column_summary = []
    for col in columns:
        info = column_results[col]['outlier_info']
        bounds = column_results[col]['bounds']
        column_summary.append({
            'column': col,
            'n_outliers': info['n_outliers'],
            'outlier_percentage': info['outlier_percentage'],
            'lower_outliers': info['lower_outliers'],
            'upper_outliers': info['upper_outliers'],
            'Q1': bounds['Q1'],
            'Q3': bounds['Q3'],
            'IQR': bounds['IQR'],
            'lower_bound': bounds['lower_bound'],
            'upper_bound': bounds['upper_bound'],
        })
    column_summary_df = pd.DataFrame(column_summary)

    # Create detailed outlier information, one record per flagged row.
    # Rows are visited by position so duplicate index labels are handled.
    column_values = {col: df[col].to_numpy() for col in columns}
    outlier_details = []
    for pos in np.flatnonzero(flagged):
        row_info = {'index': df.index[pos]}
        outlier_columns = []
        for col in columns:
            if mask_arrays[col][pos]:
                value = column_values[col][pos]
                outlier_columns.append(col)
                row_info[f'{col}_value'] = value
                row_info[f'{col}_type'] = (
                    'Lower' if value < column_results[col]['bounds']['lower_bound'] else 'Upper'
                )
        row_info['outlier_columns'] = outlier_columns
        row_info['n_outlier_columns'] = len(outlier_columns)
        outlier_details.append(row_info)
    outlier_details_df = pd.DataFrame(outlier_details)

    return {
        'column_results': column_results,
        'column_summary': column_summary_df,
        'outlier_details': outlier_details_df,
        'combined_mask': combined_mask,
        'overall_stats': overall_stats,
        'clean_data': df[~combined_mask].copy(),
    }
# Apply multi-column analysis: a row is flagged if it is an outlier in any
# numeric column
multi_results = detect_outliers_iqr_multiple(df, multiplier=1.5, strategy='any')
print("=== MULTI-COLUMN IQR ANALYSIS ===")
print(f"Strategy: {multi_results['overall_stats']['strategy']}")
print(f"Total rows: {multi_results['overall_stats']['total_rows']}")
print(f"Outlier rows: {multi_results['overall_stats']['outlier_rows']}")
print(f"Outlier percentage: {multi_results['overall_stats']['outlier_percentage']:.2f}%")
# "\n" is a real newline here, so each section starts after a blank line
print("\n=== COLUMN SUMMARY ===")
print(multi_results['column_summary'].round(3))
print("\n=== TOP 10 OUTLIER ROWS ===")
# Rows flagged in the most columns come first
if not multi_results['outlier_details'].empty:
    top_outliers = multi_results['outlier_details'].nlargest(10, 'n_outlier_columns')
    print(top_outliers[['index', 'outlier_columns', 'n_outlier_columns']].head(10))
Strategy Comparison
def compare_outlier_strategies(df, columns=None, multiplier=1.5):
    """
    Compare different strategies for multi-column outlier detection

    Runs the multi-column detection with 'any', 'all', 'majority' and
    'min_count' (for minimum counts 1, 2 and 3), prints a comparison table and
    plots outlier counts and data retention per strategy.

    Parameters:
    -----------
    df : pandas DataFrame
        Input dataframe
    columns : list, optional
        Columns to analyze. If None, all numeric columns
    multiplier : float, default=1.5
        IQR multiplier for outlier bounds

    Returns:
    --------
    comparison_df : pandas DataFrame
        One row per strategy with outlier rows, percentage, remaining rows
        and data retention (%)
    """
    if columns is None:
        columns = df.select_dtypes(include=[np.number]).columns.tolist()

    strategies = ['any', 'all', 'majority', 'min_count']
    strategy_results = {}

    print("=== STRATEGY COMPARISON ===")
    for strategy in strategies:
        if strategy == 'min_count':
            # Test different min_count values
            for min_count in [1, 2, 3]:
                key = f"{strategy}_{min_count}"
                results = detect_outliers_iqr_multiple(
                    df, columns, multiplier,
                    strategy=strategy, min_outlier_columns=min_count,
                )
                strategy_results[key] = results['overall_stats']
        else:
            results = detect_outliers_iqr_multiple(df, columns, multiplier, strategy=strategy)
            strategy_results[strategy] = results['overall_stats']

    # Create comparison DataFrame. The loop variable is named `summary`
    # rather than `stats` so it does not shadow the imported scipy.stats module.
    comparison_data = []
    for strategy_name, summary in strategy_results.items():
        remaining = summary['total_rows'] - summary['outlier_rows']
        comparison_data.append({
            'strategy': strategy_name,
            'outlier_rows': summary['outlier_rows'],
            'outlier_percentage': summary['outlier_percentage'],
            'remaining_rows': remaining,
            'data_retention': (remaining / summary['total_rows']) * 100,
        })
    comparison_df = pd.DataFrame(comparison_data)
    print(comparison_df.round(2))

    # Visualize comparison
    fig, axes = plt.subplots(1, 2, figsize=(15, 6))

    # Outlier count comparison
    axes[0].bar(comparison_df['strategy'], comparison_df['outlier_rows'], color='lightcoral', alpha=0.7)
    axes[0].set_xlabel('Strategy')
    axes[0].set_ylabel('Number of Outliers')
    axes[0].set_title('Outliers Detected by Strategy')
    axes[0].tick_params(axis='x', rotation=45)

    # Data retention comparison
    axes[1].bar(comparison_df['strategy'], comparison_df['data_retention'], color='lightgreen', alpha=0.7)
    axes[1].set_xlabel('Strategy')
    axes[1].set_ylabel('Data Retention (%)')
    axes[1].set_title('Data Retention by Strategy')
    axes[1].tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

    return comparison_df
# Compare strategies; `columns` is left as None, so all numeric columns of df
# are analyzed
strategy_comparison = compare_outlier_strategies(df)
Advanced Techniques
Adaptive IQR (Domain-Specific Multipliers)
def adaptive_iqr_detection(df, column_configs=None, global_multiplier=1.5):
"""
Adaptive IQR detection with column-specific configurations
Parameters:
-----------
df : pandas DataFrame
Input dataframe
column_configs : dict, optional
Column-specific configurations
Example: {'column_name': {'multiplier': 2.0, 'bounds': 'asymmetric'}}
global_multiplier : float, default=1.5
Default multiplier for columns not in column_configs
"""
if column_configs is None:
column_configs = {}
numeric_columns = df.select_dtypes(include=[np.number]).columns
results = {}
for col in numeric_columns:
config = column_configs.get(col, {})
multiplier = config.get('multiplier', global_multiplier)
bounds_type = config.get('bounds', 'symmetric')
# Calculate basic IQR components
Q1 = df[col].quantile(0.25)
Q3 = df[col].quantile(0.75)
IQR = Q3 - Q1
# Apply