Keyword Extraction with Python

In [1]:
#Importing the pandas library and loading the dataset
import pandas as pd
df = pd.read_csv('papers.csv')
In [2]:
#Displaying the data stored in df
df
Out[2]:
id year title event_type pdf_name abstract paper_text
0 1 1987 Self-Organization of Associative Database and ... NaN 1-self-organization-of-associative-database-an... Abstract Missing 767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...
1 10 1987 A Mean Field Theory of Layer IV of Visual Cort... NaN 10-a-mean-field-theory-of-layer-iv-of-visual-c... Abstract Missing 683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...
2 100 1988 Storing Covariance by the Associative Long-Ter... NaN 100-storing-covariance-by-the-associative-long... Abstract Missing 394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...
3 1000 1994 Bayesian Query Construction for Neural Network... NaN 1000-bayesian-query-construction-for-neural-ne... Abstract Missing Bayesian Query Construction for Neural\nNetwor...
4 1001 1994 Neural Network Ensembles, Cross Validation, an... NaN 1001-neural-network-ensembles-cross-validation... Abstract Missing Neural Network Ensembles, Cross\nValidation, a...
... ... ... ... ... ... ... ...
7236 994 1994 Single Transistor Learning Synapses NaN 994-single-transistor-learning-synapses.pdf Abstract Missing Single Transistor Learning Synapses\n\nPaul Ha...
7237 996 1994 Bias, Variance and the Combination of Least Sq... NaN 996-bias-variance-and-the-combination-of-least... Abstract Missing Bias, Variance and the Combination of\nLeast S...
7238 997 1994 A Real Time Clustering CMOS Neural Engine NaN 997-a-real-time-clustering-cmos-neural-engine.pdf Abstract Missing A Real Time Clustering CMOS\nNeural Engine\nT....
7239 998 1994 Learning direction in global motion: two class... NaN 998-learning-direction-in-global-motion-two-cl... Abstract Missing Learning direction in global motion: two\nclas...
7240 999 1994 Correlation and Interpolation Networks for Rea... NaN 999-correlation-and-interpolation-networks-for... Abstract Missing Correlation and Interpolation Networks for\nRe...

7241 rows × 7 columns

In [3]:
#Importing the re module and the required NLTK resources
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/thomas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/thomas/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
In [4]:
#Building the stopword list: NLTK English stopwords plus custom domain-specific words
stop_words = set(stopwords.words('english'))
new_words = ["fig","figure","image","sample","using","show", "result", "large", "one", "two", "three","four", "five", "seven","eight","nine","also"]
stop_words = list(stop_words.union(new_words))
In [5]:
print(stop_words)
['up', 'didn', "weren't", 'those', 'it', "shouldn't", 'figure', 'ourselves', 'aren', 'there', "couldn't", 'very', 'doesn', 'then', 'whom', 'did', 'here', 'mustn', 'both', "you've", 'they', 'yourself', 'after', 'shan', 'against', "wouldn't", 'she', "don't", 'weren', "you'll", 'themselves', "doesn't", 'mightn', 'you', 'doing', 'can', "shan't", "isn't", 'has', 'or', 'image', 'theirs', 'had', 't', 'above', 'its', 'further', "you'd", "should've", 'ours', 'hadn', 'because', 'own', 'him', 'in', 'such', 'which', 'that', 'am', 'below', 'myself', 'the', 'at', 'he', "she's", 'couldn', 'being', 'result', 'where', 'no', 'will', "that'll", 've', "didn't", 'what', 'them', 'i', 'isn', 'herself', 'into', 'her', 'hers', 'not', 'now', 'does', 'wasn', 'our', "you're", 'hasn', 'these', 'down', 'three', 'why', 'd', 're', 'by', 'how', 'ain', "wasn't", 'again', 'once', "hadn't", 'one', 'is', 'itself', 'having', 'show', 'nine', "needn't", 'this', 'wouldn', 'should', 'll', 'too', 'out', 'over', 'same', 'so', 'while', 'do', 'before', 'only', 'ma', 'through', 'been', 'off', 'his', 'about', "haven't", 'if', 'for', 'using', 'needn', "hasn't", 'any', 'under', 'haven', 'as', 'four', 'some', 'm', 'than', 'their', 'me', 'between', 'sample', 'seven', 'was', 'an', 'just', 'each', 'be', 'o', "mightn't", "won't", 'shouldn', 'with', 'himself', 'were', 'don', 'a', 'from', 'of', 'until', 's', 'few', 'to', 'are', 'other', "mustn't", 'more', 'fig', "aren't", 'on', 'two', 'have', 'five', 'also', 'we', 'yours', 'my', 'but', 'nor', 'y', 'when', 'all', 'and', 'large', "it's", 'who', 'eight', 'won', 'during', 'yourselves', 'your', 'most']
In [6]:
#Defining a function to preprocess the text.
def preprocessor(text):
    # converting text into lowercase
    text=text.lower()
    #removing the tags present in the text
    text=re.sub("</?.*?>"," <> ",text)
    # removing the special characters and digits in the text
    text=re.sub(r"(\d|\W)+"," ",text)
    #Converting to list of words from string
    text = text.split()
    # removing the stopwords present in the text
    text = [word for word in text if word not in stop_words]
    # removing words with fewer than three letters
    text = [word for word in text if len(word) >= 3]
    #Lemmatizing each word using WordNetLemmatizer
    lmtzr = WordNetLemmatizer()
    text = [lmtzr.lemmatize(word) for word in text]
    return ' '.join(text)
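Before applying the function to the whole corpus, it can help to sanity-check it on a short made-up sentence (this check is an optional addition, not part of the original run; the exact lemmas depend on the installed WordNet data).
In [ ]:
#Optional sanity check on a made-up sentence: lowercasing, digit/punctuation removal,
#stopword filtering and lemmatization should all be visible in the output
sample_text = "Figure 1: the two proposed algorithms converge quickly on large networks."
print(preprocessor(sample_text))
#Expected output, roughly: 'proposed algorithm converge quickly network'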
In [7]:
#Applying the preprocessor function to the text
documents = df['paper_text'].apply(lambda x:preprocessor(x))
documents
Out[7]:
0       self organization associative database applica...
1       mean field theory layer visual cortex applicat...
2       storing covariance associative long term poten...
3       bayesian query construction neural network mod...
4       neural network ensemble cross validation activ...
                              ...                        
7236    single transistor learning synapsis paul hasle...
7237    bias variance combination least square estimat...
7238    real time clustering cmos neural engine serran...
7239    learning direction global motion class psychop...
7240    correlation interpolation network real time ex...
Name: paper_text, Length: 7241, dtype: object
In [8]:
#Importing CountVectorizer from scikit-learn
from sklearn.feature_extraction.text import CountVectorizer
#Building a vocabulary of unigrams, bigrams and trigrams, ignoring terms that appear in more than 95% of documents
countvector=CountVectorizer(max_df=0.95,max_features=10000,ngram_range=(1,3))
In [9]:
#Fitting the vectorizer and computing the word-count matrix
wcvector=countvector.fit_transform(documents)
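A quick optional check (an addition, using only what was fitted above): the result is a sparse document-term matrix with one row per paper and, because of max_features, at most 10000 columns.
In [ ]:
#Optional: inspect the shape of the count matrix and a few learned n-grams
print(wcvector.shape)                      #(number of papers, vocabulary size)
print(list(countvector.vocabulary_)[:5])   #sample of the learned terms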
In [10]:
#Importing TfidfTransformer from scikit-learn
#Computing the inverse document frequency (IDF) weights
from sklearn.feature_extraction.text import TfidfTransformer
transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
transformer.fit(wcvector)
Out[10]:
TfidfTransformer()
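For reference, with smooth_idf=True scikit-learn computes idf(t) = ln((1 + n) / (1 + df(t))) + 1, where n is the number of documents and df(t) is the number of documents containing term t, so rarer terms receive larger weights. The fitted weights can be inspected directly (an optional check, not in the original notebook):
In [ ]:
#Optional: look at the learned IDF weights; low values = common terms, high values = rare terms
idf = pd.Series(transformer.idf_, index=countvector.get_feature_names_out())
print(idf.sort_values().head())   #most common terms
print(idf.sort_values().tail())   #rarest terms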
In [11]:
#Defining a function to get the feature names and tf-idf scores of the top n items
def extractVector(featurenames, sorteditems, top=10):
    #using only top items 
    sorteditems = sorteditems[:top]
    scorevals = []
    featurevals = []
    for idx, score in sorteditems:
        #feature name and corresponding score
        featurevals.append(featurenames[idx])
        scorevals.append(round(score, 3))

    #creating a dictionary mapping each feature to its score
    results = {}
    for idx in range(len(featurevals)):
        results[featurevals[idx]] = scorevals[idx]
    return results
In [12]:
#Defining a function that returns (column index, score) tuples sorted by score in descending order
def sortedcoo(matrix):
    tuples = zip(matrix.col, matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)
# getting the feature names from the vectorizer
feature_names=countvector.get_feature_names_out()
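For context (an added note): tocoo() converts a sparse tf-idf row into coordinate (COO) format, whose parallel .col and .data arrays hold the column indices and scores that sortedcoo zips together and sorts. A small peek at one document illustrates this:
In [ ]:
#Optional: inspect the COO representation of a single tf-idf vector
example = transformer.transform(countvector.transform([documents[0]])).tocoo()
print(example.col[:5])    #column indices into feature_names
print(example.data[:5])   #the corresponding tf-idf scores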
In [13]:
#Defining a function to get keywords.
def getKeywords(id, doc):
    #generating the tf-idf vector for the given document
    tfidfvector=transformer.transform(countvector.transform([doc[id]]))
    #Sorting the terms by tf-idf score in descending order
    sorteditems=sortedcoo(tfidfvector.tocoo())
    #Extracting the top 10 keywords and their scores
    keywords=extractVector(feature_names,sorteditems,10)
    return keywords
In [14]:
#Defining a function to print the title, abstract and extracted keywords
def printResults(id,keyword, df):
    # now print the results
    print("\n=====Title=====")
    print(df['title'][id])
    print("\n=====Abstract=====")
    print(df['abstract'][id])
    print("\n===Keywords===")
    for k in keyword:
        print(k,keyword[k])
In [15]:
#Extracting and printing the keywords for one sample paper (id 941)
id=941
keywords=getKeywords(id, documents)
printResults(id,keywords, df)
=====Title=====
Algorithms for Non-negative Matrix Factorization

=====Abstract=====
Non-negative matrix factorization (NMF) has previously been shown to 
be a useful decomposition for multivariate data. Two different multi- 
plicative algorithms for NMF are analyzed. They differ only slightly in 
the multiplicative factor used in the update rules. One algorithm can be 
shown to minimize the conventional least squares error while the other 
minimizes the generalized Kullback-Leibler divergence. The monotonic 
convergence of both algorithms can be proven using an auxiliary func- 
tion analogous to that used for proving convergence of the Expectation- 
Maximization algorithm. The algorithms can also be interpreted as diag- 
onally rescaled gradient descent, where the rescaling factor is optimally 
chosen to ensure convergence. 

===Keywords===
update rule 0.344
update 0.285
auxiliary 0.212
non negative matrix 0.21
negative matrix 0.209
rule 0.192
nmf 0.183
multiplicative 0.175
matrix factorization 0.163
matrix 0.163
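As a possible next step (an extension, not part of the original notebook), the same pipeline can be applied to every paper and the keywords stored back in the dataframe; looping over all 7241 documents takes a while, since each one is transformed individually.
In [ ]:
#Optional extension: extract the top keywords for every paper (slow: one transform per document)
df['keywords'] = [', '.join(getKeywords(i, documents)) for i in range(len(documents))]
df[['title', 'keywords']].head()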