Source code for geostep.designer

from typing import List

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans  # type: ignore
from sklearn.preprocessing import StandardScaler  # type: ignore

from .base import BaseDesigner
from .constants import CONTROL, TREATMENT
from .exceptions import DesignError
from .validation import (
    validate_columns_exist,
    validate_dataframe,
    validate_positive_number,
)



[docs]
class StaircaseDesigner(BaseDesigner):

[docs]
    def __init__(
        self,
        num_sequences: int,
        clusters_per_sequence: int,
        control_periods: int,
        intervention_periods: int,
    ):
        """Store the parameters that define a staircase design.

        Parameters
        ----------
        num_sequences : int
            Number of sequences; sequence ``s`` is observed starting at
            period ``s``.
        clusters_per_sequence : int
            Number of clusters generated for every sequence.
        control_periods : int
            Number of periods a cluster is observed under control before it
            switches.
        intervention_periods : int
            Number of periods a cluster is observed under treatment after it
            switches.

        Notes
        -----
        The values are stored as given; no checking happens here. They are
        checked by ``validate_inputs``.
        """
        super().__init__()
        # Layout of the design: how many sequences and how wide each one is.
        self.num_sequences, self.clusters_per_sequence = (
            num_sequences,
            clusters_per_sequence,
        )
        # Length of the control and treatment phases observed per cluster.
        self.control_periods, self.intervention_periods = (
            control_periods,
            intervention_periods,
        )



[docs]
    def validate_inputs(self, *args, **kwargs) -> None:
        """Check the four staircase parameters stored on the instance.

        Each of ``num_sequences``, ``clusters_per_sequence``,
        ``control_periods`` and ``intervention_periods`` is passed, together
        with its name, to ``validate_positive_number``. Positional and keyword
        arguments are accepted but not used.

        Raises
        ------
        ValidationError
            If any design parameter is invalid.
        """
        # Checked in declaration order so the first offending parameter is
        # the one reported.
        for param_name in (
            "num_sequences",
            "clusters_per_sequence",
            "control_periods",
            "intervention_periods",
        ):
            validate_positive_number(getattr(self, param_name), param_name)



[docs]
    def prepare_data(self, *args, **kwargs) -> None:
        """Do nothing: the staircase design has no data to prepare.

        The design is built purely from the parameters given to the
        constructor, so this hook accepts (and ignores) any arguments.
        """
        return None


    def _design_impl(self, *args, **kwargs) -> pd.DataFrame:
        """Build the staircase design matrix in long format.

        Sequence ``s`` (1-based) is observed from period ``s`` for
        ``control_periods + intervention_periods`` consecutive periods. Its
        clusters are under control for the first ``control_periods`` of those
        periods and under treatment afterwards. Cluster ids are numbered
        consecutively from 1 across sequences.

        Returns
        -------
        pd.DataFrame
            One row per (cluster, period) with columns ``sequence``,
            ``cluster_id``, ``period`` and ``assignment``. ``assignment``
            holds the ``CONTROL`` or ``TREATMENT`` constant.
        """
        observed_span = self.control_periods + self.intervention_periods
        rows = []

        for seq in range(1, self.num_sequences + 1):
            # Everything below depends only on the sequence, not on the
            # cluster or the period.
            first_cluster = (seq - 1) * self.clusters_per_sequence + 1
            last_cluster = first_cluster + self.clusters_per_sequence - 1
            switch_at = seq + self.control_periods

            for cluster in range(first_cluster, last_cluster + 1):
                for period in range(seq, seq + observed_span):
                    if period >= switch_at:
                        arm = TREATMENT
                    else:
                        arm = CONTROL
                    rows.append(
                        {
                            "sequence": seq,
                            "cluster_id": cluster,
                            "period": period,
                            "assignment": arm,
                        }
                    )

        return pd.DataFrame(rows)


[docs]
    def design(self) -> pd.DataFrame:
        """Create the design matrix for a staircase cluster randomized trial.

        This is a thin public wrapper: it calls the parent class's ``design``
        with no arguments and returns its result unchanged.
        NOTE(review): the parent is presumably what invokes
        ``validate_inputs``, ``prepare_data`` and ``_design_impl`` -- confirm
        against ``BaseDesigner``.

        Returns
        -------
        pd.DataFrame
            A design matrix with columns for sequence, cluster, period, and
            assignment. 'assignment' is 'Control' or 'Treatment'.

        Raises
        ------
        ValidationError
            If input validation fails.
        """
        design_matrix = super().design()
        return design_matrix





[docs]
class SimpleRandomizationDesigner(BaseDesigner):

[docs]
    def __init__(self, num_groups: int = 2, seed: int = 42):
        """Configure a simple (unstratified) randomization.

        Parameters
        ----------
        num_groups : int, default 2
            Number of experimental groups. ``validate_inputs`` rejects any
            value other than 2.
        seed : int, default 42
            Seed for the random number generator used when assigning units.
        """
        super().__init__()
        self.num_groups, self.seed = num_groups, seed
        # Working state; filled in by validate_inputs() / prepare_data().
        self._df = self._geo_col = self._unique_geos = None



[docs]
    def validate_inputs(self, df: pd.DataFrame = None, geo_col: str = None, **kwargs) -> None:
        """Record and check the inputs for simple randomization.

        Arguments that are not ``None`` are stored on the instance (``df``
        after passing through ``validate_dataframe``). The column check runs
        only once both a DataFrame and a column name are available; nothing
        is raised merely because one of them is still missing.

        Parameters
        ----------
        df : pd.DataFrame
            DataFrame containing the geographic units to randomize.
        geo_col : str
            The name of the column containing the unique geographic identifiers.

        Raises
        ------
        ValidationError
            If input validation fails.
        DesignError
            If num_groups is not 2.
        """
        if df is not None:
            # Keep whatever the validator hands back as the working frame.
            self._df = validate_dataframe(df, "df")
        if geo_col is not None:
            self._geo_col = geo_col

        inputs_complete = not (self._df is None or self._geo_col is None)
        if inputs_complete:
            validate_columns_exist(self._df, [self._geo_col])

        if self.num_groups != 2:
            message = (
                "SimpleRandomizationDesigner currently only supports 2 groups "
                "(Treatment/Control)"
            )
            raise DesignError(message)



[docs]
    def prepare_data(self, df: pd.DataFrame = None, geo_col: str = None, **kwargs) -> None:
        """Collect the distinct geographic units to randomize.

        Arguments that are not ``None`` replace the stored DataFrame / column
        name. When both are available, the unique values of the geo column
        are cached on the instance; otherwise the cache is left as it was.

        Parameters
        ----------
        df : pd.DataFrame
            DataFrame containing the geographic units to randomize.
        geo_col : str
            The name of the column containing the unique geographic identifiers.
        """
        if df is not None:
            self._df = df
        if geo_col is not None:
            self._geo_col = geo_col

        # Nothing to compute until both pieces are known.
        if self._df is None or self._geo_col is None:
            return

        geo_values = self._df[self._geo_col]
        self._unique_geos = geo_values.unique()


    def _design_impl(self, df: pd.DataFrame = None, geo_col: str = None, **kwargs) -> pd.DataFrame:
        """Perform the simple randomization assignment.

        Every unique geographic unit cached by ``prepare_data`` receives an
        independent draw from ``{0, ..., num_groups - 1}``; a draw of 1 maps
        to ``TREATMENT`` and anything else to ``CONTROL``. Because the draws
        are independent, group sizes are not forced to be equal. The ``df``
        and ``geo_col`` arguments are accepted for interface compatibility
        but the stored copies are what is used.

        Returns
        -------
        pd.DataFrame
            A copy of the stored DataFrame with a new 'assignment' column.

        Raises
        ------
        DesignError
            If ``prepare_data`` has not populated the list of unique units.
        """
        if self._unique_geos is None:
            raise DesignError(
                "Data preparation failed. Ensure prepare_data() was called with valid "
                "df and geo_col parameters."
            )

        # A private generator yields the same draws as seeding the global
        # NumPy RNG, without disturbing random state used elsewhere.
        rng = np.random.RandomState(self.seed)
        draws = rng.randint(0, self.num_groups, size=len(self._unique_geos))

        assignment_map = {
            geo: TREATMENT if draw == 1 else CONTROL
            for geo, draw in zip(self._unique_geos, draws)
        }

        # Work on a copy so the caller's DataFrame is not modified.
        df_assigned = self._df.copy()
        df_assigned["assignment"] = df_assigned[self._geo_col].map(assignment_map)
        return df_assigned


[docs]
    def design(self, df: pd.DataFrame, geo_col: str) -> pd.DataFrame:
        """Assign geographic units to experimental groups by simple randomization.

        This is a thin public wrapper: both arguments are forwarded by keyword
        to the parent class's ``design`` and its result is returned unchanged.
        NOTE(review): the parent is presumably what invokes
        ``validate_inputs``, ``prepare_data`` and ``_design_impl`` -- confirm
        against ``BaseDesigner``.

        Parameters
        ----------
        df : pd.DataFrame
            DataFrame containing the geographic units to randomize.
        geo_col : str
            The name of the column containing the unique geographic identifiers.

        Returns
        -------
        pd.DataFrame
            A DataFrame with a new 'assignment' column.

        Raises
        ------
        ValidationError
            If input validation fails.
        """
        assigned = super().design(df=df, geo_col=geo_col)
        return assigned





[docs]
class StratifiedRandomizationDesigner(BaseDesigner):

[docs]
    def __init__(self, num_groups: int = 2, n_strata: int = 4, seed: int = 42):
        """Configure a stratified randomization.

        Parameters
        ----------
        num_groups : int, default 2
            Number of experimental groups. ``validate_inputs`` rejects any
            value other than 2.
        n_strata : int, default 4
            Number of strata (KMeans clusters) to form from the
            stratification variables.
        seed : int, default 42
            Seed used both for KMeans and for the random assignment.
        """
        super().__init__()
        self.num_groups, self.n_strata, self.seed = num_groups, n_strata, seed
        # Inputs remembered by validate_inputs() / prepare_data().
        self._df = self._geo_col = self._strat_vars = None
        # Derived tables built by prepare_data().
        self._geo_df = self._stratum_map = None



[docs]
    def validate_inputs(
        self, df: pd.DataFrame = None, geo_col: str = None, strat_vars: List[str] = None, **kwargs
    ) -> None:
        """Record and check the inputs for stratified randomization.

        Arguments that are not ``None`` are stored on the instance (``df``
        after passing through ``validate_dataframe``). The column check runs
        only once a DataFrame, a geo column and stratification variables are
        all available. ``num_groups`` and ``n_strata`` are checked on every
        call.

        Parameters
        ----------
        df : pd.DataFrame
            DataFrame containing geographic units and stratification variables.
        geo_col : str
            The name of the column with unique geographic identifiers.
        strat_vars : list of str
            A list of column names to use for stratification.

        Raises
        ------
        ValidationError
            If input validation fails. ``n_strata`` is checked through
            ``validate_positive_number``.
        DesignError
            If num_groups is not 2.
        """
        if df is not None:
            # Keep whatever the validator hands back as the working frame.
            self._df = validate_dataframe(df, "df")
        if geo_col is not None:
            self._geo_col = geo_col
        if strat_vars is not None:
            self._strat_vars = strat_vars

        stored_inputs = (self._df, self._geo_col, self._strat_vars)
        if all(item is not None for item in stored_inputs):
            validate_columns_exist(self._df, [self._geo_col] + self._strat_vars)

        if self.num_groups != 2:
            message = (
                "StratifiedRandomizationDesigner currently only supports 2 groups "
                "(Treatment/Control)"
            )
            raise DesignError(message)
        validate_positive_number(self.n_strata, "n_strata")



[docs]
    def prepare_data(
        self, df: pd.DataFrame = None, geo_col: str = None, strat_vars: List[str] = None, **kwargs
    ) -> None:
        """Build one row per geographic unit and assign each unit to a stratum.

        Arguments that are not ``None`` replace the stored inputs. If any of
        the three inputs is still missing afterwards, the derived tables are
        reset to ``None`` and the method returns. Otherwise the stratification
        variables are standardized and clustered with KMeans into
        ``n_strata`` groups; the resulting labels are stored both as a
        ``stratum`` column on the per-unit table and as a geo -> stratum
        mapping.

        Parameters
        ----------
        df : pd.DataFrame
            DataFrame containing geographic units and stratification variables.
        geo_col : str
            The name of the column with unique geographic identifiers.
        strat_vars : list of str
            A list of column names to use for stratification.
        """
        if df is not None:
            self._df = df
        if geo_col is not None:
            self._geo_col = geo_col
        if strat_vars is not None:
            self._strat_vars = strat_vars

        if self._df is None or self._geo_col is None or self._strat_vars is None:
            # Not enough information yet: clear any previously derived state.
            self._geo_df = None
            self._stratum_map = None
            return

        # One row per geo; for repeated geos the first row's values are kept.
        wanted_columns = [self._geo_col] + self._strat_vars
        geo_table = (
            self._df[wanted_columns]
            .drop_duplicates(subset=[self._geo_col])
            .reset_index(drop=True)
        )
        self._geo_df = geo_table

        # Standardize first so no single variable dominates the clustering.
        features = StandardScaler().fit_transform(geo_table[self._strat_vars])
        clusterer = KMeans(n_clusters=self.n_strata, random_state=self.seed, n_init=10)
        geo_table["stratum"] = clusterer.fit_predict(features)

        # Lookup used later to label every row of the original DataFrame.
        self._stratum_map = dict(zip(geo_table[self._geo_col], geo_table["stratum"]))


    def _design_impl(
        self, df: pd.DataFrame = None, geo_col: str = None, strat_vars: List[str] = None, **kwargs
    ) -> pd.DataFrame:
        """Perform the stratified randomization assignment.

        Within each stratum, ``len(stratum) // num_groups`` units are drawn
        without replacement for ``TREATMENT``; the remaining units get
        ``CONTROL``. Consequently a stratum with an odd number of units puts
        the extra unit in control, and a single-unit stratum is entirely
        control. The ``df``, ``geo_col`` and ``strat_vars`` arguments are
        accepted for interface compatibility but the state stored by
        ``prepare_data`` is what is used.

        Returns
        -------
        pd.DataFrame
            A copy of the stored DataFrame with 'stratum' and 'assignment'
            columns added.

        Raises
        ------
        DesignError
            If ``prepare_data`` has not produced the per-unit table or the
            stratum mapping.
        """
        # Ensure data was properly prepared
        if self._geo_df is None:
            raise DesignError(
                "Data preparation failed. Ensure prepare_data() was called with valid "
                "df, geo_col, and strat_vars parameters."
            )

        if self._stratum_map is None:
            raise DesignError("Stratum mapping was not created during data preparation.")

        # A private generator yields the same draws as seeding the global
        # NumPy RNG, without disturbing random state used elsewhere.
        rng = np.random.RandomState(self.seed)
        assignments = {}

        # Strata are visited in label order so results are reproducible.
        for stratum_id in range(self.n_strata):
            in_stratum = self._geo_df["stratum"] == stratum_id
            stratum_geos = self._geo_df.loc[in_stratum, self._geo_col].tolist()
            if not stratum_geos:
                # An empty stratum has nothing to assign and needs no draw.
                continue

            n_treatment = len(stratum_geos) // self.num_groups
            # Draw positions rather than the ids themselves: the selection is
            # the same, but the ids are never coerced into a NumPy array, so
            # membership below is an exact O(1) set lookup on the original
            # objects.
            picked = rng.choice(len(stratum_geos), n_treatment, replace=False)
            treated = {stratum_geos[i] for i in picked}

            for geo in stratum_geos:
                assignments[geo] = TREATMENT if geo in treated else CONTROL

        # Apply assignments and strata to a copy of the original dataframe
        df_assigned = self._df.copy()
        df_assigned["assignment"] = df_assigned[self._geo_col].map(assignments)
        df_assigned["stratum"] = df_assigned[self._geo_col].map(self._stratum_map)

        return df_assigned


[docs]
    def design(
        self, df: pd.DataFrame, geo_col: str, strat_vars: List[str]
    ) -> pd.DataFrame:
        """Assign geographic units to groups using stratified randomization.

        This is a thin public wrapper: all three arguments are forwarded by
        keyword to the parent class's ``design`` and its result is returned
        unchanged.
        NOTE(review): the parent is presumably what invokes
        ``validate_inputs``, ``prepare_data`` and ``_design_impl`` -- confirm
        against ``BaseDesigner``.

        Parameters
        ----------
        df : pd.DataFrame
            DataFrame containing geographic units and stratification variables.
        geo_col : str
            The name of the column with unique geographic identifiers.
        strat_vars : list of str
            A list of column names to use for stratification.

        Returns
        -------
        pd.DataFrame
            The input DataFrame with 'stratum' and 'assignment' columns added.

        Raises
        ------
        ValidationError
            If input validation fails.
        """
        assigned = super().design(df=df, geo_col=geo_col, strat_vars=strat_vars)
        return assigned