# Loading the dataset
# Importing the pandas library
import pandas as pd
df = pd.read_csv('papers.csv')
# The data is stored in the variable df
df
 | id | year | title | event_type | pdf_name | abstract | paper_text |
---|---|---|---|---|---|---|---|
0 | 1 | 1987 | Self-Organization of Associative Database and ... | NaN | 1-self-organization-of-associative-database-an... | Abstract Missing | 767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA... |
1 | 10 | 1987 | A Mean Field Theory of Layer IV of Visual Cort... | NaN | 10-a-mean-field-theory-of-layer-iv-of-visual-c... | Abstract Missing | 683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU... |
2 | 100 | 1988 | Storing Covariance by the Associative Long-Ter... | NaN | 100-storing-covariance-by-the-associative-long... | Abstract Missing | 394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n... |
3 | 1000 | 1994 | Bayesian Query Construction for Neural Network... | NaN | 1000-bayesian-query-construction-for-neural-ne... | Abstract Missing | Bayesian Query Construction for Neural\nNetwor... |
4 | 1001 | 1994 | Neural Network Ensembles, Cross Validation, an... | NaN | 1001-neural-network-ensembles-cross-validation... | Abstract Missing | Neural Network Ensembles, Cross\nValidation, a... |
... | ... | ... | ... | ... | ... | ... | ... |
7236 | 994 | 1994 | Single Transistor Learning Synapses | NaN | 994-single-transistor-learning-synapses.pdf | Abstract Missing | Single Transistor Learning Synapses\n\nPaul Ha... |
7237 | 996 | 1994 | Bias, Variance and the Combination of Least Sq... | NaN | 996-bias-variance-and-the-combination-of-least... | Abstract Missing | Bias, Variance and the Combination of\nLeast S... |
7238 | 997 | 1994 | A Real Time Clustering CMOS Neural Engine | NaN | 997-a-real-time-clustering-cmos-neural-engine.pdf | Abstract Missing | A Real Time Clustering CMOS\nNeural Engine\nT.... |
7239 | 998 | 1994 | Learning direction in global motion: two class... | NaN | 998-learning-direction-in-global-motion-two-cl... | Abstract Missing | Learning direction in global motion: two\nclas... |
7240 | 999 | 1994 | Correlation and Interpolation Networks for Rea... | NaN | 999-correlation-and-interpolation-networks-for... | Abstract Missing | Correlation and Interpolation Networks for\nRe... |
7241 rows × 7 columns
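# A quick sanity check (optional sketch): the preview above shows the literal
# placeholder 'Abstract Missing' instead of a real abstract for many papers.
missing_abstracts = (df['abstract'] == 'Abstract Missing').sum()
print(f"{missing_abstracts} of {len(df)} papers have no abstract")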
# Importing the re module and the required NLTK resources
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
[nltk_data] Downloading package stopwords to /Users/thomas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/thomas/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
# Building the stopword list: NLTK English stopwords plus custom additions
stop_words = set(stopwords.words('english'))
new_words = ["fig","figure","image","sample","using","show", "result", "large", "one", "two", "three","four", "five", "seven","eight","nine","also"]
stop_words = list(stop_words.union(new_words))
print(stop_words)
['up', 'didn', "weren't", 'those', 'it', "shouldn't", 'figure', 'ourselves', 'aren', 'there', "couldn't", 'very', 'doesn', 'then', 'whom', 'did', 'here', 'mustn', 'both', "you've", 'they', 'yourself', 'after', 'shan', 'against', "wouldn't", 'she', "don't", 'weren', "you'll", 'themselves', "doesn't", 'mightn', 'you', 'doing', 'can', "shan't", "isn't", 'has', 'or', 'image', 'theirs', 'had', 't', 'above', 'its', 'further', "you'd", "should've", 'ours', 'hadn', 'because', 'own', 'him', 'in', 'such', 'which', 'that', 'am', 'below', 'myself', 'the', 'at', 'he', "she's", 'couldn', 'being', 'result', 'where', 'no', 'will', "that'll", 've', "didn't", 'what', 'them', 'i', 'isn', 'herself', 'into', 'her', 'hers', 'not', 'now', 'does', 'wasn', 'our', "you're", 'hasn', 'these', 'down', 'three', 'why', 'd', 're', 'by', 'how', 'ain', "wasn't", 'again', 'once', "hadn't", 'one', 'is', 'itself', 'having', 'show', 'nine', "needn't", 'this', 'wouldn', 'should', 'll', 'too', 'out', 'over', 'same', 'so', 'while', 'do', 'before', 'only', 'ma', 'through', 'been', 'off', 'his', 'about', "haven't", 'if', 'for', 'using', 'needn', "hasn't", 'any', 'under', 'haven', 'as', 'four', 'some', 'm', 'than', 'their', 'me', 'between', 'sample', 'seven', 'was', 'an', 'just', 'each', 'be', 'o', "mightn't", "won't", 'shouldn', 'with', 'himself', 'were', 'don', 'a', 'from', 'of', 'until', 's', 'few', 'to', 'are', 'other', "mustn't", 'more', 'fig', "aren't", 'on', 'two', 'have', 'five', 'also', 'we', 'yours', 'my', 'but', 'nor', 'y', 'when', 'all', 'and', 'large', "it's", 'who', 'eight', 'won', 'during', 'yourselves', 'your', 'most']
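# A small illustration (sketch) of how the combined stopword list filters a made-up sentence.
sample = "figure one shows the results using two large samples"
print([w for w in sample.split() if w not in stop_words])
# expected: ['shows', 'results', 'samples']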
# Defining a function to preprocess the text
def preprocessor(text):
    # converting the text to lowercase
    text = text.lower()
    # removing HTML-style tags present in the text
    text = re.sub("</?.*?>", " <> ", text)
    # removing the special characters and digits in the text
    text = re.sub("(\\d|\\W)+", " ", text)
    # splitting the string into a list of words
    text = text.split()
    # removing the stopwords present in the text
    text = [word for word in text if word not in stop_words]
    # removing words shorter than three letters
    text = [word for word in text if len(word) >= 3]
    # lemmatizing each word using WordNetLemmatizer
    lmtzr = WordNetLemmatizer()
    text = [lmtzr.lemmatize(word) for word in text]
    return ' '.join(text)
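# A quick check of the preprocessor on a short made-up string (illustrative only).
print(preprocessor("The 3 Figures show <b>results</b> of two large networks!"))
# expected output: 'figure result network'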
# Applying the preprocessor function to every paper's text
documents = df['paper_text'].apply(preprocessor)
documents
0       self organization associative database applica...
1       mean field theory layer visual cortex applicat...
2       storing covariance associative long term poten...
3       bayesian query construction neural network mod...
4       neural network ensemble cross validation activ...
                              ...
7236    single transistor learning synapsis paul hasle...
7237    bias variance combination least square estimat...
7238    real time clustering cmos neural engine serran...
7239    learning direction global motion class psychop...
7240    correlation interpolation network real time ex...
Name: paper_text, Length: 7241, dtype: object
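# Optional spot check (sketch): compare the start of the raw and preprocessed text for one paper.
print(df['paper_text'][0][:200])
print(documents[0][:200])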
# Importing CountVectorizer from the scikit-learn library
from sklearn.feature_extraction.text import CountVectorizer
# Creating the vocabulary: unigrams to trigrams, at most 10,000 features,
# ignoring terms that appear in more than 95% of the documents
countvector = CountVectorizer(max_df=0.95, max_features=10000, ngram_range=(1, 3))
# Building the word-count (document-term) matrix
wcvector = countvector.fit_transform(documents)
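# The result is a sparse document-term matrix: one row per paper, one column per
# vocabulary term (capped at 10,000 by max_features).
print(wcvector.shape)   # expected: (7241, 10000)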
# Importing TfidfTransformer from the scikit-learn library
# and computing the inverse document frequency (IDF) weights
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
transformer.fit(wcvector)
TfidfTransformer()
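# The fitted transformer holds one IDF weight per vocabulary term; with smooth_idf=True,
# scikit-learn computes idf(t) = ln((1 + n) / (1 + df(t))) + 1, where n is the number of
# documents and df(t) is the number of documents containing term t.
print(len(transformer.idf_))   # should equal the vocabulary size
print(transformer.idf_[:5])    # IDF weights of the first five terms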
# Defining a function that returns the feature names and tf-idf scores of the top n items
def extractVector(featurenames, sorteditems, top=10):
    # keeping only the top items
    sorteditems = sorteditems[:top]
    scorevals = []
    featurevals = []
    for idx, score in sorteditems:
        # feature name and its corresponding score
        fname = featurenames[idx]
        scorevals.append(round(score, 3))
        featurevals.append(fname)
    # creating a dictionary mapping each feature to its score
    results = {}
    for idx in range(len(featurevals)):
        results[featurevals[idx]] = scorevals[idx]
    return results
# Defining a function that sorts the entries of a COO sparse matrix by tf-idf score in descending order
def sortedcoo(matrix):
    tuples = zip(matrix.col, matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)
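# Tiny illustration (sketch) of sortedcoo on a toy sparse row with made-up values.
from scipy.sparse import csr_matrix
toy = csr_matrix([[0.0, 0.3, 0.0, 0.7]]).tocoo()
print(sortedcoo(toy))   # column 3 (score 0.7) comes before column 1 (score 0.3)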
# Getting the feature names from the vectorizer
feature_names=countvector.get_feature_names_out()
# Defining a function to get the keywords for a single document
def getKeywords(id, doc):
    # generating the tf-idf vector for the document
    tfidfvector = transformer.transform(countvector.transform([doc[id]]))
    # sorting the vector entries in descending order of score
    sorteditems = sortedcoo(tfidfvector.tocoo())
    # extracting the top n terms
    keywords = extractVector(feature_names, sorteditems, 10)
    return keywords
# Defining a function to print the results
def printResults(id, keyword, df):
    # printing the title, abstract, and extracted keywords
    print("\n=====Title=====")
    print(df['title'][id])
    print("\n=====Abstract=====")
    print(df['abstract'][id])
    print("\n===Keywords===")
    for k in keyword:
        print(k, keyword[k])
# Extracting and printing the keywords for an example paper
id = 941
keywords = getKeywords(id, documents)
printResults(id, keywords, df)
=====Title=====
Algorithms for Non-negative Matrix Factorization

=====Abstract=====
Non-negative matrix factorization (NMF) has previously been shown to be a useful decomposition for multivariate data. Two different multiplicative algorithms for NMF are analyzed. They differ only slightly in the multiplicative factor used in the update rules. One algorithm can be shown to minimize the conventional least squares error while the other minimizes the generalized Kullback-Leibler divergence. The monotonic convergence of both algorithms can be proven using an auxiliary function analogous to that used for proving convergence of the Expectation-Maximization algorithm. The algorithms can also be interpreted as diagonally rescaled gradient descent, where the rescaling factor is optimally chosen to ensure convergence.

===Keywords===
update rule 0.344
update 0.285
auxiliary 0.212
non negative matrix 0.21
negative matrix 0.209
rule 0.192
nmf 0.183
multiplicative 0.175
matrix factorization 0.163
matrix 0.163
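# A possible follow-up (sketch): extract keywords for every paper and attach them to the
# dataframe. This re-runs the tf-idf transform once per document, so it can take a while.
df['keywords'] = [list(getKeywords(i, documents).keys()) for i in range(len(documents))]
df[['title', 'keywords']].head()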