"""Source code for titanicprediction.data.preprocessing."""
import re
from dataclasses import dataclass
from typing import Any, Literal, Protocol
import pandas as pd
from loguru import logger
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (LabelEncoder, MinMaxScaler, RobustScaler,
StandardScaler)
from titanicprediction.entities.core import Dataset
[docs]
@dataclass
class AgeImputer:
strategy: Literal["mean", "median", "mode", "constant"] = "median"
fill_value: float | None = None
_imputer: Any = None
[docs]
def fit(self, dataset: Dataset) -> None:
if "Age" not in dataset.features.columns:
return
age_data = dataset.features["Age"].values.reshape(-1, 1)
if self.strategy == "constant" and self.fill_value is not None:
self._imputer = SimpleImputer(
strategy="constant", fill_value=self.fill_value
)
else:
self._imputer = SimpleImputer(strategy=self.strategy)
self._imputer.fit(age_data)
[docs]
def transform(self, dataset: Dataset) -> Dataset:
if self._imputer is None or "Age" not in dataset.features.columns:
return dataset
features = dataset.features.copy()
age_data = features["Age"].values.reshape(-1, 1)
imputed_age = self._imputer.transform(age_data)
features["Age"] = imputed_age.flatten()
return Dataset(
features=features,
target=dataset.target,
feature_names=dataset.feature_names,
target_name=dataset.target_name,
)
[docs]
def fit_transform(self, dataset: Dataset) -> Dataset:
self.fit(dataset)
return self.transform(dataset)
[docs]
def get_params(self) -> dict[str, Any]:
return {"strategy": self.strategy, "fill_value": self.fill_value}
[docs]
@dataclass
class CategoricalEncoder:
encoding_type: Literal["onehot", "label"] = "onehot"
columns: list[str] = None
handle_unknown: Literal["error", "ignore", "use_encoded_value"] = "error"
_encoders: dict[str, Any] = None
_feature_names: list[str] = None
def __post_init__(self):
self._encoders = {}
self._feature_names = []
if self.columns is None:
self.columns = []
[docs]
def fit(self, dataset: Dataset) -> None:
self._encoders = {}
self._feature_names = dataset.feature_names.copy()
for column in self.columns:
if column not in dataset.features.columns:
continue
if self.encoding_type == "onehot":
unique_values = dataset.features[column].dropna().unique()
self._encoders[column] = list(unique_values)
else:
encoder = LabelEncoder()
encoder.fit(dataset.features[column].dropna())
self._encoders[column] = encoder
[docs]
def transform(self, dataset: Dataset) -> Dataset:
features = dataset.features.copy()
for column in self.columns:
if column not in features.columns or column not in self._encoders:
continue
if self.encoding_type == "onehot":
unique_values = self._encoders[column]
for value in unique_values:
new_col_name = f"{column}_{value}"
features[new_col_name] = (
features[column] == value).astype(int)
features = features.drop(columns=[column])
self._update_feature_names(column, unique_values)
else:
encoder = self._encoders[column]
try:
encoded = encoder.transform(features[column])
features[column] = encoded
except ValueError:
if self.handle_unknown == "error":
raise
features[column] = -1
return Dataset(
features=features,
target=dataset.target,
feature_names=self._feature_names,
target_name=dataset.target_name,
)
[docs]
def fit_transform(self, dataset: Dataset) -> Dataset:
self.fit(dataset)
return self.transform(dataset)
[docs]
def get_params(self) -> dict[str, Any]:
return {
"encoding_type": self.encoding_type,
"columns": self.columns,
"handle_unknown": self.handle_unknown,
}
[docs]
def _update_feature_names(self, original_col: str, new_cols: list[str]) -> None:
if original_col in self._feature_names:
index = self._feature_names.index(original_col)
self._feature_names.remove(original_col)
new_names = [f"{original_col}_{col}" for col in new_cols]
for i, new_name in enumerate(new_names):
self._feature_names.insert(index + i, new_name)
[docs]
@dataclass
class FeatureScaler:
method: Literal["standard", "minmax", "robust"] = "standard"
columns: list[str] = None
with_mean: bool = True
with_std: bool = True
_scalers: dict[str, Any] = None
def __post_init__(self):
self._scalers = {}
if self.columns is None:
self.columns = []
[docs]
def fit(self, dataset: Dataset) -> None:
self._scalers = {}
for column in self.columns:
if column not in dataset.features.columns:
continue
data = dataset.features[column].values.reshape(-1, 1)
if self.method == "standard":
scaler = StandardScaler(
with_mean=self.with_mean, with_std=self.with_std
)
elif self.method == "minmax":
scaler = MinMaxScaler()
else:
scaler = RobustScaler()
scaler.fit(data)
self._scalers[column] = scaler
[docs]
def transform(self, dataset: Dataset) -> Dataset:
features = dataset.features.copy()
for column in self.columns:
if column not in features.columns or column not in self._scalers:
continue
scaler = self._scalers[column]
data = features[column].values.reshape(-1, 1)
scaled_data = scaler.transform(data)
features[column] = scaled_data.flatten()
return Dataset(
features=features,
target=dataset.target,
feature_names=dataset.feature_names,
target_name=dataset.target_name,
)
[docs]
def fit_transform(self, dataset: Dataset) -> Dataset:
self.fit(dataset)
return self.transform(dataset)
[docs]
def get_params(self) -> dict[str, Any]:
return {
"method": self.method,
"columns": self.columns,
"with_mean": self.with_mean,
"with_std": self.with_std,
}
[docs]
@dataclass
class TitleExtractor:
name_column: str = "Name"
title_column: str = "Title"
custom_mappings: dict[str, str] = None
_title_patterns: dict[str, str] = None
def __post_init__(self):
if self.custom_mappings is None:
self.custom_mappings = {}
self._title_patterns = {
"Mr": r"\bMr\.",
"Mrs": r"\bMrs\.",
"Miss": r"\bMiss\.",
"Master": r"\bMaster\.",
"Dr": r"\bDr\.",
"Rev": r"\bRev\.",
}
[docs]
def transform(self, dataset: Dataset) -> Dataset:
features = dataset.features.copy()
if self.name_column not in features.columns:
return dataset
def extract_title(name):
if pd.isna(name):
return "Unknown"
for title, pattern in self._title_patterns.items():
if re.search(pattern, name, re.IGNORECASE):
return title
return "Other"
features[self.title_column] = features[self.name_column].apply(
extract_title)
for original, mapped in self.custom_mappings.items():
features[self.title_column] = features[self.title_column].replace(
original, mapped
)
new_feature_names = [*dataset.feature_names, self.title_column]
return Dataset(
features=features,
target=dataset.target,
feature_names=new_feature_names,
target_name=dataset.target_name,
)
[docs]
def get_params(self) -> dict[str, Any]:
return {
"name_column": self.name_column,
"title_column": self.title_column,
"custom_mappings": self.custom_mappings,
}
class DataPreprocessor:
    """Ordered pipeline of named dataset transformers.

    Every step must provide ``transform(dataset)``. ``fit(dataset)`` and
    ``get_params()`` are used when a step defines them, so stateless steps
    do not need to implement them.
    """

    def __init__(self):
        self.preprocessing_steps: list[tuple[str, "IDataTransformer"]] = []
        self.fitted: bool = False

    def add_step(self, name: str, transformer: "IDataTransformer") -> None:
        """Append ``transformer`` to the end of the pipeline under ``name``."""
        self.preprocessing_steps.append((name, transformer))

    def _fit_steps(self, dataset: "Dataset") -> "Dataset":
        """Fit every step in order and return the fully transformed dataset.

        Each step is fitted on the output of the steps before it, so a step
        can rely on columns that an earlier step creates, fills or removes.
        """
        current_dataset = dataset
        for name, transformer in self.preprocessing_steps:
            logger.info(f"Fitting transformer: {name}")
            fit_step = getattr(transformer, "fit", None)
            if callable(fit_step):
                fit_step(current_dataset)
            logger.info(f"Applying transformer: {name}")
            current_dataset = transformer.transform(current_dataset)
        self.fitted = True
        return current_dataset

    def fit(self, dataset: "Dataset") -> None:
        """Fit all steps on ``dataset``."""
        self._fit_steps(dataset)

    def transform(self, dataset: "Dataset") -> "Dataset":
        """Run ``dataset`` through every fitted step.

        Raises:
            ValueError: If the preprocessor has not been fitted yet.
        """
        if not self.fitted:
            raise ValueError(
                "Preprocessor must be fitted before transformation")
        current_dataset = dataset
        for name, transformer in self.preprocessing_steps:
            logger.info(f"Applying transformer: {name}")
            current_dataset = transformer.transform(current_dataset)
        return current_dataset

    def fit_transform(self, dataset: "Dataset") -> "Dataset":
        """Fit all steps on ``dataset`` and return its transformed version."""
        # The fitting pass already produces the transformed dataset, so the
        # pipeline is run once rather than fitted and then re-applied.
        return self._fit_steps(dataset)

    def transform_features(self, features: pd.DataFrame) -> pd.DataFrame:
        """Transform a bare feature frame and return the resulting frame.

        The frame is wrapped in a target-less ``Dataset`` for the pipeline.

        Raises:
            ValueError: If the preprocessor has not been fitted yet.
        """
        dummy_dataset = Dataset(
            features=features,
            target=None,
            feature_names=list(features.columns),
            target_name="dummy",
        )
        return self.transform(dummy_dataset).features

    def get_params(self) -> dict[str, Any]:
        """Return ``{step name: step parameters}``.

        Steps without a ``get_params`` method are reported as ``{}``.
        """
        params: dict[str, Any] = {}
        for name, transformer in self.preprocessing_steps:
            getter = getattr(transformer, "get_params", None)
            params[name] = getter() if callable(getter) else {}
        return params
[docs]
@dataclass
class ColumnDropper:
columns: list[str] = None
def __post_init__(self):
if self.columns is None:
self.columns = []
[docs]
def transform(self, dataset: Dataset) -> Dataset:
features = dataset.features.copy()
columns_to_drop = [
col for col in self.columns if col in features.columns]
if columns_to_drop:
features = features.drop(columns=columns_to_drop)
new_feature_names = [
name for name in dataset.feature_names if name not in columns_to_drop
]
else:
new_feature_names = dataset.feature_names
return Dataset(
features=features,
target=dataset.target,
feature_names=new_feature_names,
target_name=dataset.target_name,
)
class PreprocessorFactory:
    """Factory for ready-made preprocessing pipelines."""

    @staticmethod
    def create_titanic_preprocessor() -> DataPreprocessor:
        """Build the Titanic pipeline.

        Steps, in order: title extraction, age imputation, column dropping,
        one-hot encoding of the categorical columns and standard scaling of
        the numeric columns.
        """
        pipeline = DataPreprocessor()

        # (step name, transformer) pairs in execution order.
        steps = (
            (
                "title_extractor",
                TitleExtractor(
                    name_column="Name",
                    title_column="Title",
                    custom_mappings={"Ms": "Mrs", "Mlle": "Miss", "Mme": "Mrs"},
                ),
            ),
            ("age_imputer", AgeImputer(strategy="median")),
            (
                "drop_passenger_id",
                ColumnDropper(
                    columns=["PassengerId", "Name", "Ticket", "Cabin"]
                ),
            ),
            (
                "categorical_encoder",
                CategoricalEncoder(
                    encoding_type="onehot",
                    columns=["Sex", "Embarked", "Title"],
                    handle_unknown="ignore",
                ),
            ),
            (
                "feature_scaler",
                FeatureScaler(
                    method="standard",
                    columns=["Age", "Fare", "SibSp", "Parch", "Pclass"],
                    with_mean=True,
                    with_std=True,
                ),
            ),
        )
        for step_name, step in steps:
            pipeline.add_step(step_name, step)
        return pipeline