from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
scaler = StandardScaler()
std_df = scaler.fit_transform(df)
pca = PCA()
print(pca.fit_transform(std_df))
# See which principal components explain the most variance
print(pca.explained_variance_ratio_)
# Loadings that define each principal component, e.g. PC 1 = 0.71 * Feature 1 + 0.71 * Feature 2
print(pca.components_)
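# Optional sketch: label the loadings with the original feature names so they are easier to read.
# This assumes df is a pandas DataFrame whose columns are the features passed to PCA above.
import pandas as pd
loadings = pd.DataFrame(pca.components_,
                        columns=df.columns,
                        index=[f'PC {i+1}' for i in range(pca.n_components_)])
print(loadings)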
### Alternative approach: Pipeline
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
pipe = Pipeline([('scaler', StandardScaler()),
                 ('reducer', PCA(n_components=3)),  # n_components=0.9 keeps enough components to capture 90% of the variance
                 ('classifier', RandomForestClassifier())])
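# The fitted attributes below only exist after the pipeline has been fit.
# A minimal sketch, assuming a feature matrix X and target y (not defined in this snippet):
pipe.fit(X, y)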
# principal components
pipe['reducer'].components_
# No of principal components
pipe['reducer'].n_components_
# See PCA explained variance
pipe['reducer'].explained_variance_ratio_
# Visualize elbow plot for PCA tuning
import matplotlib.pyplot as plt
plt.plot(pipe['reducer'].explained_variance_ratio_)
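# A cumulative version often makes the elbow easier to read; a small sketch using NumPy:
import numpy as np
plt.plot(np.cumsum(pipe['reducer'].explained_variance_ratio_))
plt.xlabel('Number of components')
plt.ylabel('Cumulative explained variance')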
# Visualize PCA plot
import seaborn as sns
pc = pipe[:-1].transform(df)  # pipe[:-1] applies the fitted scaler and reducer, skipping the classifier
df['PC 1'] = pc[:, 0]
df['PC 2'] = pc[:, 1]
sns.scatterplot(data=df, x='PC 1', y='PC 2', hue='cat_col', alpha=0.4)
# Rebuild an approximation of the original data from the principal components
pc = pipe[:-1].transform(X)
X_rebuilt = pipe['scaler'].inverse_transform(pipe['reducer'].inverse_transform(pc))
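# Sketch: quantify the information lost by the projection as mean squared reconstruction error.
# Assumes X is a numeric array/DataFrame compatible with the fitted pipeline.
import numpy as np
reconstruction_error = np.mean((np.asarray(X) - X_rebuilt) ** 2)
print(reconstruction_error)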