import pandas as pd
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
sklearn.__version__
'1.1.1'
Encode the corpus of documents as TF-IDF feature vectors.
vectorizer = TfidfVectorizer()
data = ["this is a dog",
"this is a cat",
"this cat chases this dog"]
#"the dog chased the cat"]
# Get the TF-IDF-weighted document-term matrix: a numerical
# representation of the corpus based on the TF-IDF value of each
# term in each document.
X = vectorizer.fit_transform(data)
vocabulary_ : mapping of terms to their column indices in the feature matrix
vectorizer.vocabulary_
{'this': 4, 'is': 3, 'dog': 2, 'cat': 0, 'chases': 1}
idf_ : inverse document frequency vector
vectorizer.idf_
array([1.28768207, 1.69314718, 1.28768207, 1.28768207, 1. ])
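These values follow from the smoothed IDF formula that scikit-learn uses by default (smooth_idf=True): idf(t) = ln((1 + n) / (1 + df(t))) + 1, where n is the number of documents and df(t) is the number of documents containing term t. A quick check against our three-document corpus:
import numpy as np

n = 3                           # number of documents in the corpus
df = np.array([2, 1, 2, 2, 3])  # document frequencies of cat, chases, dog, is, this
idf = np.log((1 + n) / (1 + df)) + 1
print(idf)  # [1.28768207 1.69314718 1.28768207 1.28768207 1.        ]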
stop_words_ : terms that were ignored because they appeared in too many documents (max_df), in too few documents (min_df), or were cut off by max_features
vectorizer.stop_words_
set()
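stop_words_ is empty here because min_df, max_df, and max_features were left at their defaults. As a sketch of when it fills in: 'this' occurs in all three documents, so capping max_df below 1.0 should push it into stop_words_:
# 'this' occurs in 3/3 documents; with max_df=0.7 (at most 70% of the
# documents) it is dropped from the vocabulary and recorded in stop_words_.
capped = TfidfVectorizer(max_df=0.7)
capped.fit(data)
print(capped.stop_words_)  # expected: {'this'}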
The output is a sparse matrix.
# X is a SciPy sparse matrix in CSR (compressed sparse row) format
print(type(X))
<class 'scipy.sparse.csr.csr_matrix'>
The matrix encodes 5 non-trivial unique words, so we wind up with 5 columns. 'a' is trivial: the default token_pattern only matches tokens of two or more word characters, so 'a' is discarded during tokenization and never reaches stop_words_.
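To confirm, relax the token pattern so single-character tokens are kept; 'a' then shows up in the vocabulary rather than anywhere in stop_words_:
# The default token_pattern r"(?u)\b\w\w+\b" requires two or more word
# characters; this variant also accepts single-character tokens like 'a'.
loose = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")
loose.fit(data)
print(loose.vocabulary_)  # now includes 'a'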
X_full = X.toarray()
X_df = pd.DataFrame(X_full)
X_df
# looks like 0: cat, 1: chases, 2: dog, 3: is, 4: this
|   | 0 | 1 | 2 | 3 | 4 |
|---|---|---|---|---|---|
| 0 | 0.000000 | 0.000000 | 0.619805 | 0.619805 | 0.481334 |
| 1 | 0.619805 | 0.000000 | 0.000000 | 0.619805 | 0.481334 |
| 2 | 0.403525 | 0.530587 | 0.403525 | 0.000000 | 0.626747 |
The sum of each column is the aggregate TF-IDF weight for that word across the corpus.
tf_idfs = X_df.sum()
tf_idfs
0    1.023331
1    0.530587
2    1.023331
3    1.239611
4    1.589415
dtype: float64
Get the words in column order.
names = vectorizer.get_feature_names_out()
names
array(['cat', 'chases', 'dog', 'is', 'this'], dtype=object)
Make a list of tuples: (name, tf_idf).
feature_tf_idfs = zip(names, tf_idfs)
print(list(feature_tf_idfs))
[('cat', 1.0233307350085785), ('chases', 0.5305873490316616), ('dog', 1.0233307350085785), ('is', 1.2396107598812145), ('this', 1.5894152087714162)]
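One common use of these aggregate weights is ranking terms across the corpus; a minimal sketch:
# Sort (term, summed TF-IDF) pairs from most to least weighted.
ranked = sorted(zip(names, tf_idfs), key=lambda pair: pair[1], reverse=True)
print(ranked[:3])  # [('this', 1.589...), ('is', 1.239...), ('cat', 1.023...)]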
Now repeat the process with seven non-trivial words across four documents.
vectorizer = TfidfVectorizer()
data = ["this is a dog",
"this is a cat",
"this cat chases this dog",
"the dog chased the cat"]
# Get the TF-IDF-weighted document-term matrix: a numerical
# representation of the corpus based on the TF-IDF value of each
# term in each document.
X = vectorizer.fit_transform(data)
X_full = X.toarray()
X_df = pd.DataFrame(X_full)
X_df
|   | 0 | 1 | 2 | 3 | 4 | 5 | 6 |
|---|---|---|---|---|---|---|---|
| 0 | 0.000000 | 0.000000 | 0.000000 | 0.532570 | 0.657829 | 0.000000 | 0.532570 |
| 1 | 0.532570 | 0.000000 | 0.000000 | 0.000000 | 0.657829 | 0.000000 | 0.532570 |
| 2 | 0.343918 | 0.000000 | 0.538815 | 0.343918 | 0.000000 | 0.000000 | 0.687837 |
| 3 | 0.264696 | 0.414698 | 0.000000 | 0.264696 | 0.000000 | 0.829396 | 0.000000 |
Note that we have four rows and seven non-trivial words.
tf_idfs = X_df.sum()
print(tf_idfs)
names = vectorizer.get_feature_names_out()
print(names)
feature_tf_idfs = zip(names, tf_idfs)
list(feature_tf_idfs)
0    1.141184
1    0.414698
2    0.538815
3    1.141184
4    1.315659
5    0.829396
6    1.752976
dtype: float64
['cat' 'chased' 'chases' 'dog' 'is' 'the' 'this']
[('cat', 1.1411841517242665), ('chased', 0.4146978997095072), ('chases', 0.5388145234551656), ('dog', 1.1411841517242665), ('is', 1.3156586265997054), ('the', 0.8293957994190144), ('this', 1.7529757083310384)]
Now say you want to fit and transform some training data for a classifier.
The classifier will expect the test data, X_test, to have the same number of features (columns) as the training data, X_train.
So, given this training data:
training_data = ["this is a dog",
"this is a cat",
"this cat chases this dog"]
And given some test data: the test data may contain different words, and it need not have the same number of documents as the training data, but its document-term matrix must end up with the same number of features (columns) as the training matrix.
test_data = ["the dog chased the cat",
"the elephant chased the mouse",
"this osprey still ate the fish then another"]
Let's look at the document-term matrices that result from fitting separate vectorizers on the training and test data.
vectorizer1 = TfidfVectorizer()
vectorizer2 = TfidfVectorizer()
X_doc_term_train = vectorizer1.fit_transform(training_data)
print(X_doc_term_train.shape)
print(pd.DataFrame(X_doc_term_train.toarray()))
print(vectorizer1.inverse_transform(X_doc_term_train))
(3, 5)
          0         1         2         3         4
0  0.000000  0.000000  0.619805  0.619805  0.481334
1  0.619805  0.000000  0.000000  0.619805  0.481334
2  0.403525  0.530587  0.403525  0.000000  0.626747
[array(['dog', 'is', 'this'], dtype='<U6'), array(['cat', 'is', 'this'], dtype='<U6'), array(['chases', 'cat', 'dog', 'this'], dtype='<U6')]
X_doc_term_test = vectorizer2.fit_transform(test_data)
print(X_doc_term_test.shape)
(3, 13)
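The mismatch is visible in the two fitted vocabularies:
print(vectorizer1.get_feature_names_out())  # 5 terms learned from the training corpus
print(vectorizer2.get_feature_names_out())  # 13 terms learned from the test corpus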
Now, let's see how that would work with a classifier. Let's train one on X_doc_term_train.
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()
y_train = [0, 1, 0]  # arbitrary class labels for this illustration
fitted_clf = classifier.fit(X_doc_term_train, y_train)
Now we want to make a prediction based on test_data. However, if we call classifier.predict(X_doc_term_test) with the matrix produced by vectorizer2, the call will raise a ValueError, because X_doc_term_train has 5 features and X_doc_term_test has 13.
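A minimal sketch of the failure (the exact message wording depends on the scikit-learn version):
try:
    classifier.predict(X_doc_term_test)  # still the 13-feature matrix from vectorizer2
except ValueError as e:
    print(e)  # e.g. "X has 13 features, but MultinomialNB is expecting 5 features as input."
The fix is to transform the test data with the vectorizer that was fitted on the training data, vectorizer1: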
X_doc_term_test = vectorizer1.transform(test_data)
print(X_doc_term_test.shape)
X_test_2 = vectorizer1.inverse_transform(X_doc_term_test)
print(pd.DataFrame(X_test_2))
y_predicted = classifier.predict(X_doc_term_test)
(3, 5)
      0     1
0   dog   cat
1  None  None
2  this  None
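In practice this bookkeeping is usually delegated to a scikit-learn Pipeline, which fits the vectorizer on the raw training text and reuses the same fitted vocabulary at prediction time; a minimal sketch:
from sklearn.pipeline import make_pipeline

# The pipeline owns both steps: fit() fits the vectorizer and the
# classifier on the raw text; predict() reuses the fitted vocabulary.
model = make_pipeline(TfidfVectorizer(), MultinomialNB())
model.fit(training_data, y_train)
print(model.predict(test_data))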