Skip to content

Transformers

scikit-learn transformers for the data.

from latent_calendar.datasets import load_online_transactions

df = load_online_transactions()

transformers = create_raw_to_vocab_transformer(id_col="Customer ID", timestamp_col="InvoiceDate")

df_wide = transformers.fit_transform(df)

CalandarTimestampFeatures

Bases: BaseEstimator, TransformerMixin

Creates day-of-week and proportion-into-day derived columns.

Source code in latent_calendar/transformers.py
class CalandarTimestampFeatures(BaseEstimator, TransformerMixin):
    """Day of week and prop into day columns creation.

    Derives a ``day_of_week`` column (0 = Monday, per pandas ``dayofweek``)
    and a fractional ``hour`` column from ``timestamp_col``.

    Args:
        timestamp_col: Name of the datetime column to derive features from.

    """

    def __init__(
        self,
        timestamp_col: str,
    ) -> None:
        self.timestamp_col = timestamp_col

    def fit(self, X: pd.DataFrame, y=None):
        """No-op fit; the transformer is stateless."""
        return self

    def transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        """Create 2 new columns.

        Args:
            X: DataFrame containing the datetime ``timestamp_col``.
            y: Ignored; present for the sklearn transformer API.

        Returns:
            Copy of ``X`` with ``day_of_week`` and ``hour`` appended.

        Raises:
            RuntimeError: If ``timestamp_col`` is not a datetime column.

        """
        if not hasattr(X[self.timestamp_col], "dt"):
            raise RuntimeError(
                f"Column {self.timestamp_col!r} is not a datetime column. Use df[{self.timestamp_col!r}] = pd.to_datetime(df[{self.timestamp_col!r}]) first."
            )

        # Work on a copy so the caller's DataFrame is not mutated.
        X = X.copy()

        X["prop_into_day_start"] = prop_into_day(X[self.timestamp_col].dt)
        X["day_of_week"] = X[self.timestamp_col].dt.dayofweek

        # Fractional hour of the day, e.g. 13.5 for 13:30.
        X["hour"] = X["prop_into_day_start"] * 24

        tmp_columns = ["prop_into_day_start"]
        self.created_columns = ["day_of_week", "hour"]

        X = X.drop(columns=tmp_columns)
        self.columns = list(X.columns)

        return X

    def get_feature_names_out(self, input_features=None):
        # BUG FIX: the original returned `self.columns.extend(self.created_columns)`,
        # but list.extend() returns None — so this method always returned None and
        # mutated self.columns in place (duplicating entries on repeated calls).
        # self.columns already contains the created columns after transform().
        return self.columns

transform(X, y=None)

Create 2 new columns.

Source code in latent_calendar/transformers.py
def transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
    """Append day-of-week and fractional-hour columns derived from the timestamp."""
    if not hasattr(X[self.timestamp_col], "dt"):
        raise RuntimeError(
            f"Column {self.timestamp_col!r} is not a datetime column. Use df[{self.timestamp_col!r}] = pd.to_datetime(df[{self.timestamp_col!r}]) first."
        )

    # Never mutate the caller's frame.
    result = X.copy()

    result["prop_into_day_start"] = prop_into_day(result[self.timestamp_col].dt)
    result["day_of_week"] = result[self.timestamp_col].dt.dayofweek
    # Fractional hour of the day, e.g. 13.5 for 13:30.
    result["hour"] = result["prop_into_day_start"] * 24

    self.created_columns = ["day_of_week", "hour"]

    # The proportion column is only an intermediate value.
    result = result.drop(columns=["prop_into_day_start"])
    self.columns = list(result.columns)

    return result

HourDiscretizer

Bases: BaseEstimator, TransformerMixin

Discretize the hour column.

Parameters:

Name Type Description Default
col str

The name of the column to discretize.

'hour'
minutes int

The number of minutes to discretize by.

60
Source code in latent_calendar/transformers.py
class HourDiscretizer(BaseEstimator, TransformerMixin):
    """Discretize the hour column.

    Args:
        col: The name of the column to discretize.
        minutes: The number of minutes to discretize by.

    """

    def __init__(self, col: str = "hour", minutes: int = 60) -> None:
        self.col = col
        self.minutes = minutes

    def fit(self, X: pd.DataFrame, y=None):
        """No-op fit; the transformer is stateless."""
        return self

    def transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        """Floor ``self.col`` to the configured time resolution.

        Args:
            X: DataFrame containing ``self.col``.
            y: Ignored; present for the sklearn transformer API.

        Returns:
            Copy of ``X`` with ``self.col`` floored to multiples of
            ``minutes / 60`` hours.

        """
        # BUG FIX: operate on a copy instead of mutating the caller's
        # DataFrame in place (consistent with CalandarTimestampFeatures).
        X = X.copy()

        # e.g. minutes=30 -> divisor=0.5, so 13.75 floors to 13.5.
        divisor = 1 if self.minutes == 60 else self.minutes / 60
        X[self.col] = (X[self.col] // divisor) * divisor

        # Whole-hour resolutions yield integral values; store them as int.
        if self.minutes % 60 == 0:
            X[self.col] = X[self.col].astype(int)

        self.columns = list(X.columns)

        return X

    def get_feature_names_out(self, input_features=None):
        return self.columns

LongToWide

Bases: BaseEstimator, TransformerMixin

Unstack the assumed last index as vocab column.

Parameters:

Name Type Description Default
col str

The name of the column to unstack.

'num_events'
as_int bool

Whether to cast the values to int.

True
minutes int

The number of minutes to discretize by.

60
multiindex bool

Whether the columns are a multiindex.

True
Source code in latent_calendar/transformers.py
class LongToWide(BaseEstimator, TransformerMixin):
    """Unstack the assumed last index as vocab column.

    Args:
        col: The name of the column to unstack.
        as_int: Whether to cast the values to int.
        minutes: The number of minutes to discretize by.
        multiindex: Whether the columns are a multiindex.

    """

    def __init__(
        self,
        col: str = "num_events",
        as_int: bool = True,
        minutes: int = 60,
        multiindex: bool = True,
    ) -> None:
        self.col = col
        self.as_int = as_int
        self.minutes = minutes
        self.multiindex = multiindex

    def fit(self, X: pd.DataFrame, y=None):
        """No-op fit; the transformer is stateless."""
        return self

    @property
    def columns(self) -> list[str]:
        """The full calendar vocabulary, guaranteeing every time slot a column."""
        return create_full_vocab(
            days_in_week=DAYS_IN_WEEK,
            minutes=self.minutes,
            as_multiindex=self.multiindex,
        )

    def transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        """Unstack the assumed last index as vocab column."""
        # Unstack either the (day_of_week, hour) pair or the single vocab level.
        if self.multiindex:
            levels = [-2, -1]
        else:
            levels = -1

        wide = X.loc[:, self.col].unstack(level=levels)

        # Reindex against the complete vocabulary so unobserved slots become 0.
        wide = wide.reindex(self.columns, axis=1).fillna(value=0)

        if self.as_int:
            wide = wide.astype(int)

        return wide

    def get_feature_names_out(self, input_features=None):
        return self.columns

transform(X, y=None)

Unstack the assumed last index as vocab column.

Source code in latent_calendar/transformers.py
def transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
    """Unstack the assumed last index as vocab column."""
    levels = [-2, -1] if self.multiindex else -1
    wide = X.loc[:, self.col].unstack(level=levels)

    # Missing calendar slots become explicit zero-filled columns.
    wide = wide.reindex(self.columns, axis=1).fillna(value=0)

    if self.as_int:
        wide = wide.astype(int)

    return wide

RawToVocab

Bases: BaseEstimator, TransformerMixin

Transforms timestamp-level data into id-level data with vocab columns.

Parameters:

Name Type Description Default
id_col str

The name of the id column.

required
timestamp_col str

The name of the timestamp column.

required
minutes int

The number of minutes to discretize by.

60
additional_groups list[str] | None

Additional columns to group by.

None
cols list[str] | None

Additional columns to sum.

None
as_multiindex bool

Whether to return columns as a multiindex.

True
Source code in latent_calendar/transformers.py
class RawToVocab(BaseEstimator, TransformerMixin):
    """Transforms timestamp-level data into id-level data with vocab columns.

    Args:
        id_col: The name of the id column.
        timestamp_col: The name of the timestamp column.
        minutes: The number of minutes to discretize by.
        additional_groups: Additional columns to group by.
        cols: Additional columns to sum.
        as_multiindex: Whether to return columns as a multiindex.

    """

    def __init__(
        self,
        id_col: str,
        timestamp_col: str,
        minutes: int = 60,
        additional_groups: list[str] | None = None,
        cols: list[str] | None = None,
        as_multiindex: bool = True,
    ) -> None:
        self.id_col = id_col
        self.timestamp_col = timestamp_col
        self.minutes = minutes
        self.additional_groups = additional_groups
        self.cols = cols
        self.as_multiindex = as_multiindex

    def fit(self, X: pd.DataFrame, y=None):
        """Assemble the three sub-transformers; nothing is learned from ``X``."""
        # Step 1: timestamp -> calendar feature columns at the row level.
        self.features = create_timestamp_feature_pipeline(
            self.timestamp_col,
            minutes=self.minutes,
            create_vocab=not self.as_multiindex,
        )

        groups = [self.id_col]
        if self.additional_groups is not None:
            if not isinstance(self.additional_groups, list):
                raise ValueError(
                    f"additional_groups should be list not {type(self.additional_groups)}"
                )

            groups.extend(self.additional_groups)

        # Group on either the (day_of_week, hour) pair or the joined vocab key.
        if self.as_multiindex:
            groups.extend(["day_of_week", "hour"])
        else:
            groups.append("vocab")

        # Step 2: aggregate event counts (and any extra columns) per group.
        self.aggregation = VocabAggregation(groups=groups, cols=self.cols)
        # Step 3: unstack so each calendar slot becomes its own column.
        self.widden = LongToWide(
            col="num_events", minutes=self.minutes, multiindex=self.as_multiindex
        )
        # None of the sub-transformers needs to be fit.
        return self

    def transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        """Run feature creation, aggregation, then unstacking."""
        with_features = self.features.transform(X)
        aggregated = self.aggregation.transform(with_features)
        return self.widden.transform(aggregated)

VocabAggregation

Bases: BaseEstimator, TransformerMixin

NOTE: The index of the grouping stays.

Parameters:

Name Type Description Default
groups list[str]

The columns to group by.

required
cols list[str] | None

Additional columns to sum.

None
Source code in latent_calendar/transformers.py
class VocabAggregation(BaseEstimator, TransformerMixin):
    """NOTE: The index of the grouping stays.

    Args:
        groups: The columns to group by.
        cols: Additional columns to sum.

    """

    def __init__(self, groups: list[str], cols: list[str] | None = None) -> None:
        self.groups = groups
        self.cols = cols

    def fit(self, X: pd.DataFrame, y=None):
        """No-op fit; the transformer is stateless."""
        return self

    def transform(self, X: pd.DataFrame, y=None):
        """Count events per group and sum any requested extra columns."""
        # Each extra column is summed under its own name via named aggregation.
        extra = {} if self.cols is None else {c: (c, "sum") for c in self.cols}

        df_agg = (
            X.assign(num_events=1)  # one row == one event
            .groupby(self.groups)
            .agg(num_events=("num_events", "sum"), **extra)
        )
        self.columns = list(df_agg.columns)

        return df_agg

    def get_feature_names_out(self, input_features=None):
        return self.columns

VocabTransformer

Bases: BaseEstimator, TransformerMixin

Create a vocab column from the day of week and hour columns.

Source code in latent_calendar/transformers.py
class VocabTransformer(BaseEstimator, TransformerMixin):
    """Create a vocab column from the day of week and hour columns.

    The vocab value is the two columns zero-padded to two digits and joined
    with a space, e.g. day 2 at hour 9 -> "02 09".

    Args:
        day_of_week_col: The name of the day-of-week column.
        hour_col: The name of the hour column.

    """

    def __init__(
        self, day_of_week_col: str = "day_of_week", hour_col: str = "hour"
    ) -> None:
        self.day_of_week_col = day_of_week_col
        self.hour_col = hour_col

    def fit(self, X: pd.DataFrame, y=None):
        """No-op fit; the transformer is stateless."""
        return self

    def transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        """Append the ``vocab`` column.

        Returns:
            Copy of ``X`` with a ``vocab`` column appended.

        """
        # BUG FIX: operate on a copy instead of mutating the caller's
        # DataFrame in place (consistent with CalandarTimestampFeatures).
        X = X.copy()

        X["vocab"] = (
            X[self.day_of_week_col]
            .astype(str)
            .str.zfill(2)
            .str.cat(X[self.hour_col].astype(str).str.zfill(2), sep=" ")
        )

        self.columns = list(X.columns)

        return X

    def get_feature_names_out(self, input_features=None):
        return self.columns

create_raw_to_vocab_transformer(id_col, timestamp_col, minutes=60, additional_groups=None, as_multiindex=True)

Wrapper to create the transformer from the configuration options.

Parameters:

Name Type Description Default
id_col str

The name of the id column.

required
timestamp_col str

The name of the timestamp column.

required
minutes int

The number of minutes to discretize by.

60
additional_groups list[str] | None

Additional columns to group by.

None
as_multiindex bool

Whether to return columns as a multiindex.

True

Returns:

Type Description
RawToVocab

A transformer that transforms timestamp level data into id level data with vocab columns.

Source code in latent_calendar/transformers.py
def create_raw_to_vocab_transformer(
    id_col: str,
    timestamp_col: str,
    minutes: int = 60,
    additional_groups: list[str] | None = None,
    as_multiindex: bool = True,
) -> RawToVocab:
    """Wrapper to create the transformer from the configuration options.

    Args:
        id_col: The name of the id column.
        timestamp_col: The name of the timestamp column.
        minutes: The number of minutes to discretize by.
        additional_groups: Additional columns to group by.
        as_multiindex: Whether to return columns as a multiindex.

    Returns:
        A transformer that transforms timestamp level data into id level data with vocab columns.

    """
    if not as_multiindex:
        # as_multiindex=False keeps the legacy single-level "DD HH" vocab
        # columns and is slated for removal.
        msg = (
            "columns are returned as a MultiIndex by default and this will "
            "be the only behavior in the future; as_multiindex=False is deprecated"
        )
        warnings.warn(msg, DeprecationWarning, stacklevel=2)

    # BUG FIX: as_multiindex was previously accepted (and warned about) but
    # never forwarded, so RawToVocab always used its default of True.
    return RawToVocab(
        id_col=id_col,
        timestamp_col=timestamp_col,
        minutes=minutes,
        additional_groups=additional_groups,
        as_multiindex=as_multiindex,
    )

create_timestamp_feature_pipeline(timestamp_col, discretize=True, minutes=60, create_vocab=True)

Create a pipeline that creates features from the timestamp column.

Parameters:

Name Type Description Default
timestamp_col str

The name of the timestamp column.

required
discretize bool

Whether to discretize the hour column.

True
minutes int

The number of minutes to discretize by. Ignored if discretize is False.

60
create_vocab bool

Whether to create the vocab column.

True

Returns:

Type Description
Pipeline

A pipeline that creates features from the timestamp column.

Example

Create features for the online transactions dataset.

from latent_calendar.datasets import load_online_transactions

df = load_online_transactions()

transformers = create_timestamp_feature_pipeline(timestamp_col="InvoiceDate")

df_features = transformers.fit_transform(df)
Source code in latent_calendar/transformers.py
def create_timestamp_feature_pipeline(
    timestamp_col: str,
    discretize: bool = True,
    minutes: int = 60,
    create_vocab: bool = True,
) -> Pipeline:
    """Build a pipeline deriving calendar features from the timestamp column.

    Args:
        timestamp_col: The name of the timestamp column.
        discretize: Whether to discretize the hour column.
        minutes: The number of minutes to discretize by. Ignored if discretize is False.
        create_vocab: Whether to create the vocab column.

    Returns:
        A pipeline that creates features from the timestamp column.

    Raises:
        ValueError: If ``create_vocab`` is requested without ``discretize``.

    Example:
        Create features for the online transactions dataset.

        ```python
        from latent_calendar.datasets import load_online_transactions

        df = load_online_transactions()

        transformers = create_timestamp_feature_pipeline(timestamp_col="InvoiceDate")

        df_features = transformers.fit_transform(df)
        ```

    """
    # The vocab column is built from the (possibly discretized) hour values.
    if create_vocab and not discretize:
        raise ValueError("Cannot create vocab without discretizing.")

    vocab_col = "hour"

    steps = [
        ("timestamp_features", CalandarTimestampFeatures(timestamp_col=timestamp_col)),
    ]

    if discretize:
        steps.append(("binning", HourDiscretizer(col=vocab_col, minutes=minutes)))

    if create_vocab:
        steps.append(("vocab_creation", VocabTransformer(hour_col=vocab_col)))

    # set_output keeps DataFrames (with column names) flowing between steps.
    return Pipeline(steps).set_output(transform="pandas")

prop_into_day(dt)

Returns the proportion into the day from datetime like object.

0.0 is midnight and 1.0 is midnight again.

Parameters:

Name Type Description Default
dt datetime | DatetimeProperties

datetime like object

required

Returns:

Type Description
float | Series

numeric value(s) between 0.0 and 1.0

Source code in latent_calendar/transformers.py
def prop_into_day(dt: datetime | DatetimeProperties) -> float | pd.Series:
    """Returns the proportion into the day from datetime like object.

    0.0 is midnight and 1.0 is midnight again.

    Args:
        dt: datetime like object

    Returns:
        numeric value(s) between 0.0 and 1.0

    """
    # Scale each component by its own units-per-day so the terms sum to a
    # single fraction of the day.
    return (
        dt.hour / HOURS_IN_DAY
        + dt.minute / MINUTES_IN_DAY
        + dt.second / SECONDS_IN_DAY
        + dt.microsecond / MICROSECONDS_IN_DAY
    )

Comments