A lightweight Python library for automated data leakage detection in ML datasets.
pip install leakcheck

For development:
pip install -e ".[dev]"

import pandas as pd
import leakcheck as lc
# Your train and test datasets
train = pd.DataFrame({
'user_id': [1, 2, 3, 4, 5],
'feature': [10, 20, 30, 40, 50],
'target': [0, 1, 0, 1, 0]
})
test = pd.DataFrame({
'user_id': [1, 6, 7], # user_id 1 overlaps!
'feature': [10, 60, 70],
'target': [0, 1, 1]
})
# Run leakage detection
report = lc.detect_leakage(train, test, target_col='target', id_cols=['user_id'])
print(report.summary())

- Duplicate Detection: Find exact row duplicates and ID overlaps between train/test sets
- Target Leakage Detection: Identify features with suspiciously high correlation to the target
- Temporal Leakage Detection: Detect future data contamination in time series splits
lc.detect_leakage(
train_df, # Training DataFrame
test_df, # Test DataFrame
target_col, # Name of target column
id_cols=None, # List of ID columns to check for overlap
date_col=None, # Date column for temporal checks
**options # Additional options for individual checks
)

from leakcheck.checks import check_duplicates, check_target_leakage, check_temporal
# Check for duplicate rows and ID overlap
issue = check_duplicates(train_df, test_df, id_cols=['user_id'])
# Check for target leakage
issues = check_target_leakage(df, target_col='target', threshold=0.8)
# Check for temporal leakage
issue = check_temporal(train_df, test_df, date_col='date')

License: MIT