# Source code for src.utils.data_utils

#!/usr/bin/env python3
"""
Shared data loading and preprocessing utilities.

This version supports multiple election cycles (e.g. 2012, 2016, 2020) and
both the original 2016 timeseries file and FiveThirtyEight-style long polls
files (like 2020_president_polls.csv).
"""
# mypy: ignore-errors

from typing import Dict, List, Optional
import pandas as pd  # type: ignore[import-untyped]
import numpy as np

# ---------------------------------------------------------------------
# Global election configuration
# ---------------------------------------------------------------------

# Election cycle the loaders in this module operate on. Rebound at runtime by
# set_election_config(); the default is 2016.
CURRENT_ELECTION_YEAR: int = 2016
# Optional explicit path to a polls CSV. Rebound by set_election_config().
CURRENT_POLLS_FILE: Optional[str] = None  # if None, use sensible default per year



# [docs]
def set_election_config(year: int = 2016, polls_file: Optional[str] = None) -> None:
    """
    Select the election cycle (and optionally the polls file) used module-wide.

    Rebinds the module globals ``CURRENT_ELECTION_YEAR`` and
    ``CURRENT_POLLS_FILE``.

    Args:
        year: Election year (e.g. 2012, 2016, 2020). Coerced with ``int()``.
        polls_file: Optional path to a FiveThirtyEight-style polls CSV.
            When None, the polling loader picks a default:
              - the original 2016 timeseries file for year=2016
              - otherwise f"data/polls/{year}_president_polls.csv"
    """
    global CURRENT_ELECTION_YEAR, CURRENT_POLLS_FILE
    # The right-hand side is evaluated first, so a failing int() leaves both
    # globals untouched.
    CURRENT_ELECTION_YEAR, CURRENT_POLLS_FILE = int(year), polls_file




# [docs]
def get_election_date(year: int) -> str:
    """
    Return the election day (YYYY-MM-DD) for a given year.

    The date is computed as the Tuesday following the first Monday of
    November, which yields 2012-11-06, 2016-11-08 and 2020-11-03 for the
    cycles this module was written for, and a real election day (instead of
    a fixed "-11-01" placeholder) for any other year.

    Args:
        year: Election year.

    Returns:
        ISO formatted date string. If ``year`` cannot be interpreted as a
        calendar year, the legacy placeholder f"{year}-11-01" is returned
        rather than raising.
    """
    import datetime  # local import keeps this block self-contained

    try:
        november_first = datetime.date(int(year), 11, 1)
    except (TypeError, ValueError, OverflowError):
        # Not a usable calendar year: keep the historical non-raising fallback.
        return f"{year}-11-01"

    # date.weekday() is 0 for Monday, so this is the offset to the first
    # Monday on or after November 1st.
    days_until_monday = (7 - november_first.weekday()) % 7
    election_day = november_first + datetime.timedelta(days=days_until_monday + 1)
    return election_day.isoformat()




# [docs]
def get_current_election_date() -> str:
    """
    Return the election day for the currently configured election year.

    Reads the module global ``CURRENT_ELECTION_YEAR`` and delegates to
    ``get_election_date``.
    """
    configured_year = CURRENT_ELECTION_YEAR
    return get_election_date(configured_year)



# ---------------------------------------------------------------------
# Common state name → postal abbreviation mapping
# ---------------------------------------------------------------------

# Full state name -> two-letter postal code, including the District of
# Columbia. The polling loaders apply it with Series.map, so any `state` value
# that is not a key here ends up with a missing state_code.
_STATE_NAME_TO_ABBREV: Dict[str, str] = {
    "Alabama": "AL",
    "Alaska": "AK",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "District of Columbia": "DC",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",
}


# ---------------------------------------------------------------------
# 2016-specific polling loader (original timeseries)
# ---------------------------------------------------------------------


def _load_polling_data_2016() -> pd.DataFrame:
    """
    Read the FiveThirtyEight 2016 polls timeseries and derive two-party fields.

    Rows whose Clinton + Trump raw total is not strictly positive (including
    rows where either value is missing) are discarded.

    Returns:
        The original CSV columns plus:
            middate, dem, rep, total, margin, dem_proportion, state_code
        where ``state_code`` is missing for any ``state`` value that is not a
        key of ``_STATE_NAME_TO_ABBREV``.
    """
    frame = pd.read_csv("data/polls/fivethirtyeight_2016_polls_timeseries.csv")

    for date_col in ("startdate", "enddate"):
        frame[date_col] = pd.to_datetime(frame[date_col])
    # Midpoint of the field period.
    field_length = frame["enddate"] - frame["startdate"]
    frame["middate"] = frame["startdate"] + field_length / 2

    frame["dem"] = frame["rawpoll_clinton"]
    frame["rep"] = frame["rawpoll_trump"]
    frame["total"] = frame["dem"] + frame["rep"]

    # A NaN total compares False here, so incomplete rows are dropped as well.
    kept = frame.loc[frame["total"] > 0].copy()

    two_party_total = kept["total"]
    kept["margin"] = (kept["dem"] - kept["rep"]) / two_party_total
    kept["dem_proportion"] = kept["dem"] / two_party_total
    kept["state_code"] = kept["state"].map(_STATE_NAME_TO_ABBREV)

    return kept


# ---------------------------------------------------------------------
# Generic FiveThirtyEight-style polling loader (e.g. 2020)
# ---------------------------------------------------------------------


def _load_polling_data_fte_long(polls_file: str, cycle: int) -> pd.DataFrame:
    """
    Load and preprocess raw FiveThirtyEight-style presidential polling data.

    Designed for files like `2020_president_polls.csv` where each row is a
    (poll, candidate) combo and support is in a `pct` column. The long rows
    are pivoted to one row per
    (poll_id, state, pollster, sample_size, start_date, end_date) with the
    mean DEM and REP `pct` as the `dem` / `rep` columns.

    Args:
        polls_file: Path to CSV.
        cycle: Election cycle year (used to filter the `cycle` column, when
            that column exists).

    Returns:
        DataFrame with columns:
            middate, dem, rep, margin, dem_proportion, samplesize, pollster,
            state_code, startdate, enddate, total
        Only polls with a strictly positive dem + rep total are kept. If no
        DEM/REP rows survive the filters, or only one of the two parties is
        present, an empty frame with these columns is returned instead of
        failing with a KeyError.

    Raises:
        ValueError: If `candidate_party` or `pct` is missing from the file.
    """
    output_cols = [
        "middate",
        "dem",
        "rep",
        "margin",
        "dem_proportion",
        "samplesize",
        "pollster",
        "state_code",
        "startdate",
        "enddate",
        "total",
    ]

    polls = pd.read_csv(polls_file)

    # Filter by cycle if present
    if "cycle" in polls.columns:
        polls = polls[polls["cycle"] == cycle]

    # Keep only presidential general election polls if columns exist
    if "office_type" in polls.columns:
        polls = polls[polls["office_type"].str.contains("President", na=False)]

    if "stage" in polls.columns:
        polls = polls[polls["stage"] == "general"]

    # Require a state
    polls = polls[polls["state"].notna()].copy()

    # We expect candidate_party & pct
    if "candidate_party" not in polls.columns or "pct" not in polls.columns:
        raise ValueError(
            "Expected columns `candidate_party` and `pct` in polls file "
            f"{polls_file}, but they were not found."
        )

    polls = polls[polls["candidate_party"].isin(["DEM", "REP"])].copy()

    # Nothing left to pivot (e.g. wrong cycle): return an empty frame with
    # the documented columns rather than failing further down.
    if polls.empty:
        return pd.DataFrame(columns=output_cols)

    # Parse dates
    polls["start_date"] = pd.to_datetime(polls["start_date"])
    polls["end_date"] = pd.to_datetime(polls["end_date"])

    index_cols = [
        "poll_id",
        "state",
        "pollster",
        "sample_size",
        "start_date",
        "end_date",
    ]

    # NOTE(review): pivot_table appears to drop rows whose index columns
    # contain NaN (e.g. a missing sample_size) -- confirm this is intended.
    table = polls.pivot_table(
        index=index_cols,
        columns="candidate_party",
        values="pct",
        aggfunc="mean",
    )
    # Guarantee both party columns exist. A party absent from the data becomes
    # an all-NaN column, so its polls are removed by the `total > 0` filter.
    table = table.reindex(columns=["DEM", "REP"])

    # Rename columns to align with 2016 loader
    wide = table.reset_index().rename(
        columns={
            "DEM": "dem",
            "REP": "rep",
            "sample_size": "samplesize",
            "start_date": "startdate",
            "end_date": "enddate",
        }
    )

    wide["total"] = wide["dem"] + wide["rep"]
    # A NaN total (one party missing for that poll) compares False and is dropped.
    wide = wide[wide["total"] > 0].copy()

    wide["margin"] = (wide["dem"] - wide["rep"]) / wide["total"]
    wide["dem_proportion"] = wide["dem"] / wide["total"]
    wide["middate"] = wide["startdate"] + (wide["enddate"] - wide["startdate"]) / 2

    wide["state_code"] = wide["state"].map(_STATE_NAME_TO_ABBREV)

    return wide[output_cols]


# ---------------------------------------------------------------------
# Public polling loaders used by models
# ---------------------------------------------------------------------



# [docs]
def load_polling_data() -> pd.DataFrame:
    """
    Load polling data for the currently configured election.

    Dispatch rules, based on the module globals CURRENT_ELECTION_YEAR and
    CURRENT_POLLS_FILE:
      - An explicit CURRENT_POLLS_FILE is always parsed with
        `_load_polling_data_fte_long`.
      - With no explicit file and year 2016, the original
        `_load_polling_data_2016()` is used (backwards compatibility).
      - With no explicit file and any other year, the path
        `data/polls/{year}_president_polls.csv` is parsed with
        `_load_polling_data_fte_long`.
    """
    cycle_year = CURRENT_ELECTION_YEAR
    configured_file = CURRENT_POLLS_FILE

    if configured_file is not None:
        return _load_polling_data_fte_long(polls_file=configured_file, cycle=cycle_year)

    if cycle_year == 2016:
        return _load_polling_data_2016()

    default_file = f"data/polls/{cycle_year}_president_polls.csv"
    return _load_polling_data_fte_long(polls_file=default_file, cycle=cycle_year)



# ---------------------------------------------------------------------
# Election results loaders
# ---------------------------------------------------------------------


def _load_election_results_year(year: int) -> Dict[str, float]:
    """
    Compute each state's two-party Democratic margin for one election year.

    Reads the tab-separated MIT Election Lab state-level results file, sums
    `candidatevotes` per (state_po, party_simplified) for the requested year
    and returns (dem - rep) / (dem + rep) per state.

    Args:
        year: Value matched against the file's `year` column.

    Returns:
        dict mapping state code to the Democratic margin. A state that has
        votes for only one of the two parties maps to NaN; a year with no
        rows yields an empty dict.
    """
    table = pd.read_csv(
        "data/election_results/mit_president_state_1976_2020.csv", sep="\t"
    )
    in_year = table.loc[table["year"] == year]

    votes = (
        in_year.groupby(["state_po", "party_simplified"])["candidatevotes"]
        .sum()
        .reset_index()
    )

    def _votes_by_state(party_label: str) -> pd.Series:
        # Total votes for one party, indexed by state code.
        party_rows = votes.loc[votes["party_simplified"] == party_label]
        return party_rows.set_index("state_po")["candidatevotes"]

    dem_votes = _votes_by_state("DEMOCRAT")
    rep_votes = _votes_by_state("REPUBLICAN")

    # Series arithmetic aligns on state code, so unmatched states become NaN.
    margins = (dem_votes - rep_votes) / (dem_votes + rep_votes)
    return margins.to_dict()


def _load_election_results_2016() -> Dict[str, float]:
    """
    Return the 2016 results; kept as a backwards-compatible entry point.

    Delegates to ``_load_election_results_year`` with the year fixed to 2016.
    """
    margins_2016 = _load_election_results_year(2016)
    return margins_2016



# [docs]
def load_election_results() -> Dict[str, float]:
    """
    Load actual results for the currently configured election year.

    Public wrapper used by models: reads the module global
    ``CURRENT_ELECTION_YEAR`` and delegates to
    ``_load_election_results_year``.
    """
    configured_year = CURRENT_ELECTION_YEAR
    return _load_election_results_year(configured_year)



# ---------------------------------------------------------------------
# Fundamentals prior (unchanged)
# ---------------------------------------------------------------------



# [docs]
def load_fundamentals() -> Dict[str, Dict[str, float]]:
    """
    Build the fundamentals prior from past presidential results.

    For every state code in the MIT Election Lab file, the two-party
    Democratic margin is computed for 2012 and 2008 and blended as
    0.7 * margin_2012 + 0.3 * margin_2008.

    NOTE: The years are fixed, so this remains the 2016-oriented prior of the
    original project. A 2012- or 2020-specific prior would need other year
    pairs (e.g. (2008, 2004) for 2012, or (2016, 2012) for 2020).

    Returns:
        dict mapping state code to a dict with keys:
            margin, margin_2012, margin_2008
        A state with a 2012 margin but no 2008 margin gets
        margin == margin_2012 and margin_2008 == 0.0. A state without a 2012
        margin is left out.
    """
    table = pd.read_csv(
        "data/election_results/mit_president_state_1976_2020.csv", sep="\t"
    )

    def _two_party_margin(state_rows, election_year):
        # Democratic two-party margin, or None unless both parties have votes.
        in_year = state_rows[state_rows["year"] == election_year]
        votes = in_year.groupby("party_simplified")["candidatevotes"].sum()
        if "DEMOCRAT" not in votes.index or "REPUBLICAN" not in votes.index:
            return None
        dem_votes = votes["DEMOCRAT"]
        rep_votes = votes["REPUBLICAN"]
        return (dem_votes - rep_votes) / (dem_votes + rep_votes)

    prior: Dict[str, Dict[str, float]] = {}

    for code in table["state_po"].unique():
        state_rows = table[table["state_po"] == code]
        recent = _two_party_margin(state_rows, 2012)
        older = _two_party_margin(state_rows, 2008)

        if recent is None:
            # No usable 2012 result: the state gets no prior at all.
            continue

        if older is None:
            prior[code] = {
                "margin": recent,
                "margin_2012": recent,
                "margin_2008": 0.0,
            }
        else:
            prior[code] = {
                "margin": 0.7 * recent + 0.3 * older,
                "margin_2012": recent,
                "margin_2008": older,
            }

    return prior



# ---------------------------------------------------------------------
# Misc helpers
# ---------------------------------------------------------------------



# [docs]
def get_state_list(polls: pd.DataFrame, actual_results: Dict[str, float]) -> List[str]:
    """
    List the polled states that also have an actual election result.

    Args:
        polls: DataFrame of polling data with a `state_code` column.
        actual_results: dict of actual election results keyed by state code.

    Returns:
        list of distinct, non-missing state codes from `polls` that are keys
        of `actual_results`, in order of first appearance in `polls`.
    """
    eligible = []
    for code in polls["state_code"].unique():
        if pd.isna(code):
            # Rows whose state could not be mapped to a postal code.
            continue
        if code in actual_results:
            eligible.append(code)
    return eligible




# [docs]
def compute_metrics(predictions_df: pd.DataFrame) -> pd.DataFrame:
    """
    Compute evaluation metrics from predictions, one row per forecast date.

    Only rows with a non-missing `actual_margin` are scored; a forecast date
    with no such rows is skipped. A state counts as an actual win when
    `actual_margin` > 0.

    Args:
        predictions_df: DataFrame with columns:
            forecast_date, win_probability, predicted_margin, actual_margin

    Returns:
        DataFrame with columns:
            forecast_date, n_states, brier_score, log_loss, mae_margin
        The columns are present even when no forecast date could be scored,
        so callers can index them on an empty result.
    """
    metric_columns = [
        "forecast_date",
        "n_states",
        "brier_score",
        "log_loss",
        "mae_margin",
    ]
    # Added inside both logarithms so a probability of exactly 0 or 1 does
    # not produce log(0).
    eps = 1e-10

    rows = []
    # One pass over the frame; sort=False keeps dates in first-appearance order.
    for fdate, group in predictions_df.groupby("forecast_date", sort=False):
        scored = group[group["actual_margin"].notna()]
        if scored.empty:
            continue

        win_prob = scored["win_probability"]
        actual_win = (scored["actual_margin"] > 0).astype(int)

        brier = np.mean((win_prob - actual_win) ** 2)
        log_loss = -np.mean(
            actual_win * np.log(win_prob + eps)
            + (1 - actual_win) * np.log(1 - win_prob + eps)
        )
        mae = np.mean(np.abs(scored["predicted_margin"] - scored["actual_margin"]))

        rows.append(
            {
                "forecast_date": pd.to_datetime(fdate).date(),
                "n_states": int(len(scored)),
                "brier_score": float(brier),
                "log_loss": float(log_loss),
                "mae_margin": float(mae),
            }
        )

    return pd.DataFrame(rows, columns=metric_columns)