# Visualize the distribution of "col" within one random sample of 10 rows
df_sample = df.sample(n=10)
# Histogram bin edges: 59, 61, ..., 91 (width-2 bins)
bin_edges = np.arange(59, 93, 2)
df_sample["col"].hist(bins=bin_edges)
plt.show()
# Sampling with replacement: a row can be drawn more than once, so every
# draw is independent of the draws before it
df["col"].sample(n=5, replace=True)
# Sampling without replacement: a drawn row cannot be drawn again, so every
# draw depends on the draws before it
df["col"].sample(n=5, replace=False)
# Simple random sampling (fixed seed, so the same rows come back on every run)
simple_sample = df.sample(n=5, random_state=42)
# Systematic sampling: shuffle the rows, then keep every `interval`-th row
sample_size = 5
pop_size = len(df)
# Step between kept rows. Integer division spreads the picks over the whole
# frame; max(..., 1) keeps the slice step non-zero when the frame has fewer
# rows than sample_size.
interval = max(pop_size // sample_size, 1)
# Shuffle first so any ordering in the original rows cannot bias the picks
shuffled_df = df.sample(frac=1)
# The first reset discards the shuffled labels; the second stores the new
# 0..n-1 position in an "index" column
shuffled_df = shuffled_df.reset_index(drop=True).reset_index()
# head() trims the extra row(s) selected when pop_size is not an exact
# multiple of sample_size
systematic_sample = shuffled_df.iloc[::interval].head(sample_size)
# Stratified sampling: draw separately inside every "cat_col" group
strata = df.groupby("cat_col")
# Proportional allocation: 10% of the rows of each group
prop_stratified_sample = strata.sample(frac=0.1, random_state=42)
# Equal allocation: 15 rows from each group
equal_stratified_sample = strata.sample(n=15, random_state=42)
# Weighted sampling: rows whose "cat_col" equals "Val" get weight 2, all
# other rows get weight 1
condition = df["cat_col"] == "Val"
df["weight"] = np.where(condition, 2, 1)
# Draw 10% of the rows, using the "weight" column as sampling weights
weighted_sample = df.sample(weights="weight", frac=0.1)
# Cluster sampling: pick a few whole categories at random, then sample rows
# only inside the chosen categories
import random

category_list = list(df['cat_col'].unique())
# Never ask for more clusters than exist; random.sample raises ValueError
# when k is larger than the population
random_categories = random.sample(category_list, k=min(3, len(category_list)))
subset_rows = df['cat_col'].isin(random_categories)
# .copy() makes subset_df an independent frame, so the column assignment
# below does not write into a slice of df (pandas SettingWithCopyWarning)
subset_df = df[subset_rows].copy()
# Drop the categories that were not chosen.
# NOTE(review): the .cat accessor needs "cat_col" to be categorical — confirm
subset_df['cat_col'] = subset_df['cat_col'].cat.remove_unused_categories()
# 5 rows from each chosen category
sample_cluster = subset_df.groupby("cat_col").sample(n=5, random_state=42)
# Randomness check: scatter "col2" against "col1"; the points should look
# like white noise (no visible pattern) if the sampling is random
# NOTE(review): sample_df is not defined in the lines shown here — confirm
sample_df.plot.scatter(x="col1", y="col2")
plt.show()