Skip to content

Generate

Generate some fake data for various purposes.

sample_from_lda(components_prior, components_time_slots_prior, n_samples, random_state=None)

Sample from LDA model.

Parameters:

Name Type Description Default
components_prior ndarray | TensorVariable

prior probability of each component (n_components, )

required
components_time_slots_prior ndarray | TensorVariable

prior for time slots (n_components, n_time_slots)

required
n_samples ndarray

number of samples for each user (n_user, )

required
random_state int | None

random state for sampling

None

Returns:

Type Description
tuple[DataFrame, DataFrame]

probability DataFrame (n_user, n_components) and event count DataFrame (n_user, n_time_slots), with each row of the event counts summing to that user's entry in n_samples

Source code in latent_calendar/generate.py
def sample_from_lda(
    components_prior: np.ndarray | TensorVariable,
    components_time_slots_prior: np.ndarray | TensorVariable,
    n_samples: np.ndarray,
    random_state: int | None = None,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Draw per-user component probabilities and time slot event counts from an LDA model.

    Args:
        components_prior: prior probability of each component (n_components, )
        components_time_slots_prior: prior for time slots (n_components, n_time_slots)
        n_samples: number of samples for each user (n_user, )
        random_state: seed for the generator shared by every draw

    Returns:
        A pair of DataFrames with one row per entry of `n_samples`: the sampled
        component probabilities (n_user, n_components) and the event counts
        (n_user, n_time_slots). Each event count row is intended to sum to the
        matching entry of `n_samples`.

    """
    # A single generator is threaded through every draw so that the whole
    # sequence of users is reproducible from one seed.
    generator = np.random.default_rng(random_state)

    component_dist = pm.Dirichlet.dist(components_prior)
    time_slot_dist = pm.Dirichlet.dist(components_time_slots_prior)

    component_rows = []
    event_count_rows = []

    for user_n in n_samples:
        # Only the time slot variable of the per-user model is needed here.
        _, time_slot_events = define_single_user_samples(
            component_dist, time_slot_dist, n_samples=int(user_n)
        )

        # Collapse the individual events into counts per time slot and draw
        # them jointly with the component probabilities.
        targets = [component_dist, time_slot_events.sum(axis=0)]
        component_draw, event_count_draw = pm.draw(
            targets, draws=1, random_seed=generator
        )

        component_rows.append(component_draw)
        event_count_rows.append(event_count_draw)

    return pd.DataFrame(component_rows), pd.DataFrame(event_count_rows)

wide_format_dataframe(n_rows, rate=1.0, random_state=None)

Generate some data from Poisson distribution.

Parameters:

Name Type Description Default
n_rows int

number of rows to generate

required
rate float

rate parameter for Poisson distribution

1.0
random_state int | None

random state for reproducibility

None

Returns:

Type Description
DataFrame

DataFrame with columns from FULL_VOCAB and n_rows rows

Source code in latent_calendar/generate.py
def wide_format_dataframe(
    n_rows: int,
    rate: float = 1.0,
    random_state: int | None = None,
) -> pd.DataFrame:
    """Generate some data from Poisson distribution.

    Args:
        n_rows: number of rows to generate
        rate: rate parameter for Poisson distribution
        random_state: random state for reproducibility. When given, a private
            generator is seeded with it, so NumPy's global random state is
            left untouched. When None, values are drawn from NumPy's global
            random state.

    Returns:
        DataFrame with columns from FULL_VOCAB and n_rows rows

    """
    # Seeding a dedicated legacy RandomState yields the same stream as
    # `np.random.seed(random_state)` followed by `np.random.poisson`, but
    # without reseeding the process-wide generator as a side effect.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    data = rng.poisson(lam=rate, size=(n_rows, len(FULL_VOCAB)))

    return pd.DataFrame(data, columns=FULL_VOCAB)

Comments