# generic_data_profiling_routine.py
from typing import Dict, Any, Tuple
import pandas as pd
def check_null_ratio(series: pd.Series, max_ratio: float) -> str | None:
"""Check if null ratio exceeds threshold."""
ratio = series.isnull().mean()
if ratio > max_ratio:
return f"High null ratio: {ratio:.2f}"
return None
def check_value_range(series: pd.Series, expected_range: Tuple[float, float]) -> str | None:
"""Check if numeric values fall outside expected range."""
if not pd.api.types.is_numeric_dtype(series):
return None
min_val, max_val = series.min(), series.max()
if min_val < expected_range[0] or max_val > expected_range[1]:
return f"Value out of range: ({min_val}, {max_val})"
return None
def profile_column(series: pd.Series, rules: Dict[str, Any]) -> str | None:
    """Run the configured checks on one column and return the first alert.

    The null-ratio check always runs, with a threshold of 1.0 when
    ``"max_null_ratio"`` is absent from ``rules``. The range check runs only
    if the null-ratio check raised no alert and ``rules`` contains
    ``"value_range"``.

    Args:
        series: Column to profile.
        rules: Per-column settings; recognised keys are ``"max_null_ratio"``
            and ``"value_range"``.

    Returns:
        The first alert message produced, or ``None`` when every check passes.
    """
    null_alert = check_null_ratio(series, rules.get("max_null_ratio", 1.0))
    if null_alert:
        return null_alert

    if "value_range" not in rules:
        return None
    return check_value_range(series, rules["value_range"])
def profile_data(df: pd.DataFrame, thresholds: Dict[str, Any]) -> Dict[str, str]:
    """Profile every column of ``df`` and collect the alerts that were raised.

    Args:
        df: Frame whose columns are profiled one at a time.
        thresholds: Mapping from column name to that column's rule dict.
            Columns without an entry are profiled with an empty rule set.

    Returns:
        Mapping from column name to alert message, containing only the
        columns for which an alert was produced.
    """
    return {
        column: message
        for column in df.columns
        if (message := profile_column(df[column], thresholds.get(column, {})))
    }
def main() -> None:
    """Profile a small in-memory sample frame and print one line per alert."""
    # Sample input: each column has one missing entry and one value above
    # the upper bound configured for it below.
    sample = pd.DataFrame(
        {
            "age": [25, 30, None, 45, 120],
            "income": [50000, 60000, 70000, None, 250000],
        }
    )

    # Alert rules keyed by column name.
    rules_by_column = {
        "age": {"max_null_ratio": 0.1, "value_range": (18, 99)},
        "income": {"max_null_ratio": 0.2, "value_range": (10000, 200000)},
    }

    for column, message in profile_data(sample, rules_by_column).items():
        print(f"{column}: {message}")
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()