import math
import string
from collections import Counter

import nltk
from nltk.corpus import stopwords
# Only PorterStemmer is used from this module; import it explicitly
# instead of `import *`.
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

# Three sample documents about different things named "Python":
# a movie, a snake genus, and a revolver. Implicit string concatenation
# replaces the original backslash line-continuations (note: "final" and
# "girl" were joined with no space in the original, yielding "finalgirl").
text1 = (
    "Python is a 2000 made-for-TV horror movie directed by Richard "
    "Clabaugh. The film features several cult favorite actors, including William "
    "Zabka of The Karate Kid fame, Wil Wheaton, Casper Van Dien, Jenny McCarthy, "
    "Keith Coogan, Robert Englund (best known for his role as Freddy Krueger in the "
    "A Nightmare on Elm Street series of films), Dana Barron, David Bowe, and Sean "
    "Whalen. The film concerns a genetically engineered snake, a python, that "
    "escapes and unleashes itself on a small town. It includes the classic final"
    "girl scenario evident in films like Friday the 13th. It was filmed in Los Angeles, "
    "California and Malibu, California. Python was followed by two sequels: Python "
    "II (2002) and Boa vs. Python (2004), both also made-for-TV films."
)
text2 = (
    "Python, from the Greek word (πύθων/πύθωνας), is a genus of "
    "nonvenomous pythons[2] found in Africa and Asia. Currently, 7 species are "
    "recognised.[2] A member of this genus, P. reticulatus, is among the longest "
    "snakes known."
)
text3 = (
    "The Colt Python is a .357 Magnum caliber revolver formerly "
    "manufactured by Colt's Manufacturing Company of Hartford, Connecticut. "
    "It is sometimes referred to as a \"Combat Magnum\".[1] It was first introduced "
    "in 1955, the same year as Smith & Wesson's M29 .44 Magnum. The now discontinued "
    "Colt Python targeted the premium revolver market segment. Some firearm "
    "collectors and writers such as Jeff Cooper, Ian V. Hogg, Chuck Hawks, Leroy "
    "Thompson, Renee Smeets and Martin Dougherty have described the Python as the "
    "finest production revolver ever made."
)
def get_tokens(text):
    """Lower-case *text*, delete ASCII punctuation, and return NLTK word tokens.

    Punctuation removal uses str.translate with a deletion table
    (str.maketrans's third argument), one C-level pass over the string.
    """
    lowers = text.lower()
    no_punctuation = lowers.translate(str.maketrans('', '', string.punctuation))
    return nltk.word_tokenize(no_punctuation)
# Raw token frequencies for text1 — stopwords like "the"/"a" dominate,
# motivating the filtering step below.
tokens = get_tokens(text1)
count = Counter(tokens)
print(count.most_common(10))
[('the', 6), ('python', 5), ('a', 5), ('and', 4), ('films', 3), ('in', 3),
('madefortv', 2), ('on', 2), ('by', 2), ('was', 2)]
def stem_tokens(tokens, stemmer):
    """Return a list with ``stemmer.stem`` applied to each token in *tokens*.

    *stemmer* is any object exposing a ``stem(str) -> str`` method
    (e.g. ``nltk.stem.porter.PorterStemmer``).
    """
    return [stemmer.stem(token) for token in tokens]
# Frequencies after dropping English stopwords. The stopword list is
# materialized once as a set: `stopwords.words('english')` returns a list,
# and calling it (plus an O(n) membership test) per token is wasteful.
tokens = get_tokens(text1)
stop_words = set(stopwords.words('english'))
filtered = [w for w in tokens if w not in stop_words]
count = Counter(filtered)
print(count.most_common(10))
[('python', 5), ('films', 3), ('film', 2), ('california', 2), ('madefortv', 2),
('genetically', 1), ('horror', 1), ('krueger', 1), ('filmed', 1), ('sean', 1)]
# Frequencies after stopword removal AND Porter stemming — "film"/"films"
# now collapse into the single stem "film".
tokens = get_tokens(text1)
stop_words = set(stopwords.words('english'))
filtered = [w for w in tokens if w not in stop_words]
stemmer = PorterStemmer()
stemmed = stem_tokens(filtered, stemmer)
count = Counter(stemmed)
print(count)
Counter({'film': 6, 'python': 5, 'madefortv': 2, 'california': 2, 'includ': 2, '2004': 1,
'role': 1, 'casper': 1, 'robert': 1, 'sequel': 1, 'two': 1, 'krueger': 1,
'ii': 1, 'sean': 1, 'lo': 1, 'clabaugh': 1, 'finalgirl': 1, 'wheaton': 1,
'concern': 1, 'whalen': 1, 'cult': 1, 'boa': 1, 'mccarthi': 1, 'englund': 1,
'best': 1, 'direct': 1, 'known': 1, 'favorit': 1, 'movi': 1, 'keith': 1,
'karat': 1, 'small': 1, 'classic': 1, 'coogan': 1, 'like': 1, 'elm': 1,
'fame': 1, 'malibu': 1, 'sever': 1, 'richard': 1, 'scenario': 1, 'town': 1,
'friday': 1, 'david': 1, 'unleash': 1, 'vs': 1, '2000': 1, 'angel': 1, 'nightmar': 1,
'zabka': 1, '13th': 1, 'jenni': 1, 'seri': 1, 'horror': 1, 'william': 1,
'street': 1, 'wil': 1, 'escap': 1, 'van': 1, 'snake': 1, 'evid': 1, 'freddi': 1,
'bow': 1, 'dien': 1, 'follow': 1, 'engin': 1, 'also': 1})
def tf(word, count):
    """Term frequency: *word*'s share of all token occurrences in *count*.

    *count* is a Counter (or dict) mapping token -> occurrence count.
    """
    return count[word] / sum(count.values())


def n_containing(word, count_list):
    """Number of documents in *count_list* whose counter contains *word*."""
    return sum(1 for count in count_list if word in count)


def idf(word, count_list):
    """Inverse document frequency of *word* across *count_list*.

    The +1 in the denominator smooths against words appearing in every
    document (which would otherwise give log(1) only, never a zero division).
    """
    return math.log(len(count_list) / (1 + n_containing(word, count_list)))


def tfidf(word, count, count_list):
    """TF-IDF score of *word* in document *count* relative to *count_list*."""
    return tf(word, count) * idf(word, count_list)
# Print the three highest-scoring TF-IDF words for each document.
# NOTE(review): count1/count2/count3 are assumed to be the per-document
# Counters built from text1..text3 — they are not defined in this excerpt;
# confirm against the full script.
countlist = [count1, count2, count3]
for i, count in enumerate(countlist):
    print("Top words in document {}".format(i + 1))
    scores = {word: tfidf(word, count, countlist) for word in count}
    sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    for word, score in sorted_words[:3]:
        print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))
Top words in document 1 Word: film, TF-IDF: 0.02829 Word: madefortv, TF-IDF: 0.00943 Word: california, TF-IDF: 0.00943 Top words in document 2 Word: genu, TF-IDF: 0.03686 Word: 7, TF-IDF: 0.01843 Word: among, TF-IDF: 0.01843 Top words in document 3 Word: revolv, TF-IDF: 0.02097 Word: colt, TF-IDF: 0.02097 Word: manufactur, TF-IDF: 0.01398
>>> corpus = ['This is the first document.',
'This is the second second document.',
'And the third one.',
'Is this the first document?',]
>>> vectorizer = TfidfVectorizer(min_df=1)
>>> vectorizer.fit_transform(corpus)
<4x9 sparse matrix of type '<class 'numpy.float64'>'
with 19 stored elements in Compressed Sparse Row format>
>>> vectorizer.get_feature_names()
['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
>>> vectorizer.fit_transform(corpus).toarray()
array([[ 0. , 0.43877674, 0.54197657, 0.43877674, 0. ,
0. , 0.35872874, 0. , 0.43877674],
[ 0. , 0.27230147, 0. , 0.27230147, 0. ,
0.85322574, 0.22262429, 0. , 0.27230147],
[ 0.55280532, 0. , 0. , 0. , 0.55280532,
0. , 0.28847675, 0.55280532, 0. ],
[ 0. , 0.43877674, 0.54197657, 0.43877674, 0. ,
0. , 0.35872874, 0. , 0.43877674]])
机械节能产品生产企业官网模板...
大气智能家居家具装修装饰类企业通用网站模板...
礼品公司网站模板
宽屏简约大气婚纱摄影影楼模板...
蓝白WAP手机综合医院类整站源码(独立后台)...苏ICP备2024110244号-2 苏公网安备32050702011978号 增值电信业务经营许可证编号:苏B2-20251499 | Copyright 2018 - 2025 源码网商城 (www.ymwmall.com) 版权所有