from dataclasses import dataclass
from enum import Enum
from typing import Any, Literal, Union
import numpy as np
import pandas as pd
# --- Shared type aliases -------------------------------------------------
# Age value; None stands for a missing age.
AgeType = Union[float, None]
# Survival flag; None stands for an unset/unknown outcome.
SurvivalType = Union[bool, None]
# Plain float alias for probability values (range is not enforced by the alias).
ProbabilityType = float
# Plain float alias for percentage values (range is not enforced by the alias).
PercentageType = float
# The only two accepted spellings of a passenger's sex.
SexType = Literal["male", "female"]
# Passenger class, restricted to the integers 1, 2 and 3.
PclassType = Literal[1, 2, 3]
# Single-letter embarkation codes.
EmbarkedType = Literal["C", "Q", "S"]
# Closed set of title labels; "Other" is the catch-all member.
TitleType = Literal["Mr", "Mrs", "Miss", "Master", "Dr", "Rev", "Other"]
# [docs] -- documentation-link residue from page extraction; not code.
class Gender(Enum):
    """Integer encoding of the two gender labels.

    The member names match the string values allowed by ``SexType``.
    """

    # Numeric codes: male -> 0, female -> 1.
    male = 0
    female = 1
# [docs] -- documentation-link residue from page extraction; not code.
@dataclass
class Passenger:
    """One passenger record plus helpers for checking its completeness.

    Attributes:
        passenger_id: Integer identifier of the record.
        survived: Survival flag, or ``None`` when not set.
        pclass: Passenger class (1, 2 or 3).
        name: Passenger name; an empty value counts as missing.
        sex: ``"male"`` or ``"female"``; an empty value counts as missing.
        age: Age, or ``None`` when missing. Values outside 0..120 are
            treated as invalid.
        sibsp: Non-negative count (presumably siblings/spouses aboard --
            confirm against the data source).
        parch: Non-negative count (presumably parents/children aboard --
            confirm against the data source).
        fare: Non-negative fare. Annotated ``float``, but the helpers below
            tolerate ``None`` because records may arrive incomplete.
        embarked: Embarkation code, or ``None`` when missing.
        cabin: Cabin label, or ``None``.
        title: Title label, or ``None``.
        ticket: Ticket label, or ``None``.
    """

    passenger_id: int
    survived: SurvivalType
    pclass: PclassType
    name: str
    sex: SexType
    age: AgeType
    sibsp: int
    parch: int
    fare: float
    embarked: EmbarkedType | None
    cabin: str | None
    title: TitleType | None
    ticket: str | None

    def validate(self) -> bool:
        """Return ``True`` when the populated fields hold plausible values.

        A missing age is accepted here (only its range is checked when
        present); a missing fare is rejected.

        Returns:
            ``False`` if name or sex is empty, age is outside 0..120, fare is
            ``None`` or negative, sibsp/parch is negative, or pclass is not
            1, 2 or 3; ``True`` otherwise.
        """
        if not self.name or not self.sex:
            return False
        if self.age is not None and (self.age < 0 or self.age > 120):
            return False
        # Guard against None first: comparing None with 0 raises TypeError.
        if self.fare is None or self.fare < 0:
            return False
        if self.sibsp < 0 or self.parch < 0:
            return False
        return self.pclass in (1, 2, 3)

    def is_valid(self) -> bool:
        """Return ``True`` when every field required for modelling is set.

        The required fields are pclass, sex, age, sibsp, parch and fare;
        only presence (``is not None``) is checked, not value ranges.
        """
        required_fields = ("pclass", "sex", "age", "sibsp", "parch", "fare")
        return all(getattr(self, field) is not None for field in required_fields)

    def get_missing_fields(self) -> list[str]:
        """List the fields that are missing or hold out-of-range values.

        Returns:
            Labels in a fixed order: missing entries first (``"name"``,
            ``"sex"``, ``"age"``, ``"fare"``, ``"embarked"``), followed by
            range problems (``"age (invalid range)"``, ``"fare (negative)"``,
            ``"sibsp (negative)"``, ``"parch (negative)"``). The list is
            empty when nothing is wrong.
        """
        missing = []
        if not self.name:
            missing.append("name")
        if not self.sex:
            missing.append("sex")
        if self.age is None:
            missing.append("age")
        if self.fare is None:
            missing.append("fare")
        if self.embarked is None:
            missing.append("embarked")
        if self.age is not None and (self.age < 0 or self.age > 120):
            missing.append("age (invalid range)")
        # A missing fare was already reported above; only range-check a
        # fare that is actually present, otherwise ``None < 0`` would raise.
        if self.fare is not None and self.fare < 0:
            missing.append("fare (negative)")
        if self.sibsp < 0:
            missing.append("sibsp (negative)")
        if self.parch < 0:
            missing.append("parch (negative)")
        return missing
# [docs] -- documentation-link residue from page extraction; not code.
@dataclass
class Dataset:
features: pd.DataFrame
target: pd.Series | None
feature_names: list[str]
target_name: str | None
metadata: dict[str, Any] = None
[docs]
def get_shape(self) -> tuple[int, int]:
return self.features.shape
[docs]
def get_feature_types(self) -> dict[str, str]:
return self.features.dtypes.astype(str).to_dict()
[docs]
def split(self, ratio: float) -> tuple["Dataset", "Dataset"]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
self.features, self.target, test_size=1 - ratio, random_state=42
)
train_dataset = Dataset(
features=X_train,
target=y_train,
feature_names=self.feature_names,
target_name=self.target_name,
)
test_dataset = Dataset(
features=X_test,
target=y_test,
feature_names=self.feature_names,
target_name=self.target_name,
)
return train_dataset, test_dataset
[docs]
def describe(self) -> dict[str, Any]:
return {
"shape": self.get_shape(),
"feature_types": self.get_feature_types(),
"target_distribution": (
self.target.value_counts().to_dict() if self.target is not None else {}
),
"missing_values": self.features.isnull().sum().to_dict(),
}
# [docs] -- documentation-link residue from page extraction; not code.
@dataclass
class TrainedModel:
weights: np.ndarray
bias: float
feature_names: list[str]
training_metrics: dict[str, float]
validation_metrics: dict[str, float]
training_history: list[float]
model_config: dict[str, Any]
preprocessing_artifacts: dict[str, Any] | None = None
[docs]
def predict(self, features: np.ndarray) -> np.ndarray:
if features.shape[1] != len(self.feature_names):
raise ValueError(
f"Feature dimension mismatch: model expects {len(self.feature_names)} features, "
f"but got {features.shape[1]}. Feature names: {self.feature_names}"
)
linear_output = np.dot(features, self.weights) + self.bias
probabilities = 1 / (1 + np.exp(-np.clip(linear_output, -500, 500)))
return (probabilities >= 0.5).astype(int)
[docs]
def predict_proba(self, features: np.ndarray) -> np.ndarray:
if features.shape[1] != len(self.feature_names):
raise ValueError(
f"Expected {len(self.feature_names)} features, got {features.shape[1]}"
)
linear_output = np.dot(features, self.weights) + self.bias
probabilities = 1 / (1 + np.exp(-np.clip(linear_output, -500, 500)))
return np.column_stack([1 - probabilities, probabilities])
[docs]
def get_feature_importance(self) -> dict[str, float]:
if len(self.weights) != len(self.feature_names):
raise ValueError("Weights and feature_names length mismatch")
absolute_weights = np.abs(self.weights)
total_importance = np.sum(absolute_weights)
if total_importance > 0:
importance_dict = {
feature: float(weight / total_importance * 100)
for feature, weight in zip(
self.feature_names, absolute_weights, strict=False
)
}
else:
importance_dict = dict.fromkeys(self.feature_names, 0.0)
return importance_dict
# [docs] -- documentation-link residue from page extraction; not code.
@dataclass(frozen=True)
class FeatureImpactAnalysis:
    """Immutable record describing one feature's role in a prediction.

    Attributes:
        feature_name: Name of the feature being described.
        impact_score: Numeric impact score for the feature.
        weight: Weight associated with the feature.
        feature_value: Value the feature had for the explained sample.
        contribution: Numeric contribution of the feature (how it is derived
            from the other fields is not established in this class).
    """

    feature_name: str
    impact_score: float
    weight: float
    feature_value: float
    contribution: float
# [docs] -- documentation-link residue from page extraction; not code.
@dataclass(frozen=True)
class PredictionExplanation:
    """Immutable bundle explaining a single prediction.

    Attributes:
        prediction: The predicted boolean outcome.
        probability: Probability value attached to the prediction.
        feature_impacts: Per-feature impact records.
        decision_factors: Text entries describing the decision.
        confidence_level: Confidence label as a string (the set of allowed
            labels is not defined in this class).
    """

    prediction: bool
    probability: float
    feature_impacts: list[FeatureImpactAnalysis]
    decision_factors: list[str]
    confidence_level: str