import nltk
from nltk.corpus import stopwords
# nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
documents = ['cats say meow', 'dogs say woof', 'dogs chase cats']
# Import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# Create a TfidfVectorizer: tfidf
# keep terms that appear in at least 20% and at most 80% of the documents, and keep at most the top 50 terms
tfidf = TfidfVectorizer(max_df=0.8, min_df=0.2, max_features=50, stop_words=stop_words, ngram_range=(1, 1))  # an ngram is a contiguous sequence of words
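# Aside (a minimal sketch, not part of the original pipeline; bigram_tfidf is an illustrative name):
# with ngram_range=(1, 2) the vectorizer would also extract two-word phrases
# such as 'dogs say' alongside the single words.
bigram_tfidf = TfidfVectorizer(ngram_range=(1, 2))
bigram_tfidf.fit(documents)
print(bigram_tfidf.get_feature_names_out())  # unigrams plus bigrams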
# Apply fit_transform to document: csr_mat
csr_mat = tfidf.fit_transform(documents)
# Print the dense array: one row per document, one column per vocabulary term, values are tf-idf weights
print(csr_mat.toarray())
# Get the words: words
words = tfidf.get_feature_names_out()  # use tfidf.get_feature_names() on scikit-learn < 1.0
# Print words
print(words)
# Create a dataframe from this sparse matrix representation
import pandas as pd
df = pd.DataFrame(data=csr_mat.toarray(), columns=words)
# Build a sparse dataframe directly from the sparse matrix
sparse_df = pd.DataFrame.sparse.from_spmatrix(csr_mat, columns=words)
# Convert the sparse dataframe back to a dense dataframe
dense_df = sparse_df.sparse.to_dense()
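# Aside (a minimal sketch): the sparse accessor shows how much of the frame is
# actually stored; on larger corpora this memory difference is what matters.
print(sparse_df.sparse.density)                  # fraction of non-zero cells
print(sparse_df.memory_usage(deep=True).sum(),   # bytes used by the sparse frame
      dense_df.memory_usage(deep=True).sum())    # bytes used by the dense frame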
# see the words and weights in the model
tfidf.vocabulary_   # words as keys, column index of each word in the tf-idf matrix as value
csr_mat[2].indices  # column indices (from the vocabulary) of the words present in the 3rd document
csr_mat[2].data     # tf-idf weights of those words in the 3rd document
# Build a dictionary for one row of data where the word is the key and its tf-idf weight is the value
def return_weights(vocab, vector, index):
    # vocab maps column index -> word, so invert tfidf.vocabulary_ before calling
    zipped = dict(zip(vector[index].indices, vector[index].data))
    return {vocab[i]: zipped[i] for i in vector[index].indices}

# Invert vocabulary_ (word -> column index) into column index -> word
inv_vocab = {v: k for k, v in tfidf.vocabulary_.items()}
print(return_weights(inv_vocab, csr_mat, 2))
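# Aside (a minimal sketch): the same per-row dictionary can be used to keep only
# the highest-weighted words of a document, e.g. the top 2 for the 3rd document.
row_weights = return_weights(inv_vocab, csr_mat, 2)
top_words = sorted(row_weights, key=row_weights.get, reverse=True)[:2]
print(top_words)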
# Clustering on tf-idf
from scipy.cluster.vq import kmeans, vq
num_clusters = 2  # adjust as needed
cluster_centers, distortion = kmeans(csr_mat.toarray(), num_clusters)
df['cluster_labels'], _ = vq(csr_mat.toarray(), cluster_centers)
# Display top terms for each cluster (drop the label column so it doesn't show up as a term)
for i in range(num_clusters):
    cluster_df = df[df['cluster_labels'] == i]
    top_terms = cluster_df.drop(columns='cluster_labels').mean().sort_values(ascending=False).head().index
    print(top_terms)
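# Aside (an alternative sketch using scikit-learn): KMeans accepts the sparse
# matrix directly, so the toarray() step above can be skipped on large corpora.
from sklearn.cluster import KMeans
km = KMeans(n_clusters=num_clusters, n_init=10, random_state=0)
labels = km.fit_predict(csr_mat)  # works on the CSR matrix without densifying
print(labels)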