Source code for titanicprediction.entities.core

from dataclasses import dataclass
from enum import Enum
from typing import Any, Literal, Union

import numpy as np
import pandas as pd

AgeType = Union[float, None]
SurvivalType = Union[bool, None]
ProbabilityType = float
PercentageType = float

SexType = Literal["male", "female"]
PclassType = Literal[1, 2, 3]
EmbarkedType = Literal["C", "Q", "S"]
TitleType = Literal["Mr", "Mrs", "Miss", "Master", "Dr", "Rev", "Other"]
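The Literal aliases above give static type checkers a closed set of allowed values. A minimal sketch of how calling code might narrow raw input to one of them; the normalize_sex helper is hypothetical and not part of this module:

from typing import cast

def normalize_sex(raw: str) -> SexType:
    # Narrow an arbitrary string to the SexType literal; reject anything else.
    value = raw.strip().lower()
    if value in ("male", "female"):
        return cast(SexType, value)
    raise ValueError(f"Unknown sex value: {raw!r}")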


class Gender(Enum):
    male = 0
    female = 1


@dataclass
class Passenger:
    passenger_id: int
    survived: SurvivalType
    pclass: PclassType
    name: str
    sex: SexType
    age: AgeType
    sibsp: int
    parch: int
    fare: float
    embarked: EmbarkedType | None
    cabin: str | None
    title: TitleType | None
    ticket: str | None

    def validate(self) -> bool:
        if not self.name or not self.sex:
            return False
        if self.age is not None and (self.age < 0 or self.age > 120):
            return False
        if self.fare < 0:
            return False
        if self.sibsp < 0 or self.parch < 0:
            return False
        return self.pclass in [1, 2, 3]

    def is_valid(self) -> bool:
        required_fields = ["pclass", "sex", "age", "sibsp", "parch", "fare"]
        for field in required_fields:
            value = getattr(self, field)
            if value is None:
                return False
        return self.age is not None and self.fare is not None

    def get_missing_fields(self) -> list[str]:
        missing = []
        if not self.name:
            missing.append("name")
        if not self.sex:
            missing.append("sex")
        if self.age is None:
            missing.append("age")
        if self.fare is None:
            missing.append("fare")
        if self.embarked is None:
            missing.append("embarked")
        if self.age is not None and (self.age < 0 or self.age > 120):
            missing.append("age (invalid range)")
        if self.fare < 0:
            missing.append("fare (negative)")
        if self.sibsp < 0:
            missing.append("sibsp (negative)")
        if self.parch < 0:
            missing.append("parch (negative)")
        return missing
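An illustrative usage sketch, not part of the module source, showing how a Passenger record passes the validation helpers; the field values are made up:

passenger = Passenger(
    passenger_id=1,
    survived=None,
    pclass=3,
    name="Braund, Mr. Owen Harris",
    sex="male",
    age=22.0,
    sibsp=1,
    parch=0,
    fare=7.25,
    embarked="S",
    cabin=None,
    title="Mr",
    ticket="A/5 21171",
)
assert passenger.validate()               # range and consistency checks pass
assert passenger.is_valid()               # no required field is None
assert passenger.get_missing_fields() == []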
@dataclass
class Dataset:
    features: pd.DataFrame
    target: pd.Series | None
    feature_names: list[str]
    target_name: str | None
    metadata: dict[str, Any] | None = None
    def get_shape(self) -> tuple[int, int]:
        return self.features.shape

    def get_feature_types(self) -> dict[str, str]:
        return self.features.dtypes.astype(str).to_dict()

    def split(self, ratio: float) -> tuple["Dataset", "Dataset"]:
        from sklearn.model_selection import train_test_split

        X_train, X_test, y_train, y_test = train_test_split(
            self.features, self.target, test_size=1 - ratio, random_state=42
        )
        train_dataset = Dataset(
            features=X_train,
            target=y_train,
            feature_names=self.feature_names,
            target_name=self.target_name,
        )
        test_dataset = Dataset(
            features=X_test,
            target=y_test,
            feature_names=self.feature_names,
            target_name=self.target_name,
        )
        return train_dataset, test_dataset

    def describe(self) -> dict[str, Any]:
        return {
            "shape": self.get_shape(),
            "feature_types": self.get_feature_types(),
            "target_distribution": (
                self.target.value_counts().to_dict() if self.target is not None else {}
            ),
            "missing_values": self.features.isnull().sum().to_dict(),
        }
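A minimal usage sketch for Dataset, illustrative only; split() assumes scikit-learn is installed, and the toy DataFrame below is made up:

df = pd.DataFrame({"pclass": [1, 3, 2, 3], "fare": [71.3, 7.25, 13.0, 8.05]})
target = pd.Series([1, 0, 1, 0], name="survived")

dataset = Dataset(
    features=df,
    target=target,
    feature_names=list(df.columns),
    target_name="survived",
)
print(dataset.describe())                      # shape, dtypes, target counts, missing values
train_ds, test_ds = dataset.split(ratio=0.75)  # 75% train / 25% test
print(train_ds.get_shape(), test_ds.get_shape())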
@dataclass
class TrainedModel:
    weights: np.ndarray
    bias: float
    feature_names: list[str]
    training_metrics: dict[str, float]
    validation_metrics: dict[str, float]
    training_history: list[float]
    model_config: dict[str, Any]
    preprocessing_artifacts: dict[str, Any] | None = None

    def predict(self, features: np.ndarray) -> np.ndarray:
        if features.shape[1] != len(self.feature_names):
            raise ValueError(
                f"Feature dimension mismatch: model expects {len(self.feature_names)} features, "
                f"but got {features.shape[1]}. Feature names: {self.feature_names}"
            )
        linear_output = np.dot(features, self.weights) + self.bias
        probabilities = 1 / (1 + np.exp(-np.clip(linear_output, -500, 500)))
        return (probabilities >= 0.5).astype(int)

    def predict_proba(self, features: np.ndarray) -> np.ndarray:
        if features.shape[1] != len(self.feature_names):
            raise ValueError(
                f"Expected {len(self.feature_names)} features, got {features.shape[1]}"
            )
        linear_output = np.dot(features, self.weights) + self.bias
        probabilities = 1 / (1 + np.exp(-np.clip(linear_output, -500, 500)))
        return np.column_stack([1 - probabilities, probabilities])

    def get_feature_importance(self) -> dict[str, float]:
        if len(self.weights) != len(self.feature_names):
            raise ValueError("Weights and feature_names length mismatch")
        absolute_weights = np.abs(self.weights)
        total_importance = np.sum(absolute_weights)
        if total_importance > 0:
            importance_dict = {
                feature: float(weight / total_importance * 100)
                for feature, weight in zip(
                    self.feature_names, absolute_weights, strict=False
                )
            }
        else:
            importance_dict = dict.fromkeys(self.feature_names, 0.0)
        return importance_dict
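Since predict() and predict_proba() implement plain logistic-regression scoring (a sigmoid over the dot product plus bias, thresholded at 0.5), a small hand-built model is enough to exercise them. The weights, feature names, and metrics below are made up for illustration:

model = TrainedModel(
    weights=np.array([1.5, -0.8]),
    bias=0.1,
    feature_names=["is_female", "pclass_scaled"],
    training_metrics={"accuracy": 0.81},
    validation_metrics={"accuracy": 0.79},
    training_history=[0.69, 0.55, 0.48],
    model_config={"learning_rate": 0.01},
)
X = np.array([[1.0, 0.0], [0.0, 1.0]])
print(model.predict(X))                  # class labels, 0 or 1
print(model.predict_proba(X))            # columns: P(class 0), P(class 1)
print(model.get_feature_importance())    # |weight| share per feature, in percent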
@dataclass(frozen=True)
class FeatureImpactAnalysis:
    feature_name: str
    impact_score: float
    weight: float
    feature_value: float
    contribution: float


@dataclass(frozen=True)
class PredictionExplanation:
    prediction: bool
    probability: float
    feature_impacts: list[FeatureImpactAnalysis]
    decision_factors: list[str]
    confidence_level: str
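These frozen dataclasses are plain value objects; a hypothetical explanation for a single prediction could be assembled like this (all scores and strings below are made up):

impact = FeatureImpactAnalysis(
    feature_name="is_female",
    impact_score=0.65,
    weight=1.5,
    feature_value=1.0,
    contribution=1.5,
)
explanation = PredictionExplanation(
    prediction=True,
    probability=0.83,
    feature_impacts=[impact],
    decision_factors=["is_female strongly increases survival probability"],
    confidence_level="high",
)
# Frozen dataclasses are immutable: assigning explanation.probability = 0.5
# would raise dataclasses.FrozenInstanceError.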