# Install the Python dependencies (prefer a virtualenv over sudo where possible).
sudo pip install nltk
sudo pip install pandas
sudo pip install ipython

# Create a working directory and launch the notebook server.
# BUG in the original: the three commands were fused into one word-soup line
# ("mkdir Codes cd Codes ipython notebook"); they must be chained explicitly.
mkdir Codes && cd Codes && ipython notebook
from pandas import DataFrame
import pandas as pd

# Sample search keywords to be normalized and counted.
# NOTE(review): the stray '"' inside 'pet insurance"' is reproduced from the
# original data — presumably intentional (it exercises the tokenizer); confirm.
d = ['pets insurance', 'pets insure', 'pet insurance',
     'pet insur', 'pet insurance"', 'pet insu']
df = DataFrame(d)
df.columns = ['Words']
df
import nltk

# Tokenize on runs of word characters.
# BUG in the original: the pattern was r'w+' (a literal letter 'w'), which
# would only ever emit runs of 'w' — it must be the escape r'\w+'.
tokenizer = nltk.RegexpTokenizer(r'\w+')
# Add a column with the Porter-stemmed form of each keyword, plus a Count of 1
# per row so that a later groupby().sum() yields per-stem frequencies.
df["Stemming Words"] = ""
df["Count"] = 1

# Create the stemmer once; the original built a new PorterStemmer per word.
stemmer = nltk.PorterStemmer()
# Iterate over every row instead of the original hard-coded `while j <= 5`,
# so the code keeps working if the sample data changes size.
for j in range(len(df)):
    # `stem` replaces the removed NLTK API `stem_word` used by the original.
    stems = [stemmer.stem(word) for word in tokenizer.tokenize(df["Words"][j])]
    # The original accumulated '" " + stem' per word, yielding a leading space
    # when any token exists and "" otherwise — reproduce that exactly.
    joined = (" " + " ".join(stems)) if stems else ""
    # .loc avoids pandas chained assignment, which may silently write to a copy.
    df.loc[j, "Stemming Words"] = joined
df
# Aggregate identical stems and order by frequency.
# BUG in the original: DataFrame.sort() was removed in pandas 0.20 —
# sort_values() is the supported replacement.
uniqueWords = df.groupby(['Stemming Words'], as_index=False).sum().sort_values(['Count'])
uniqueWords
# BUG in the original: the PyPI package that provides the `enchant` module is
# named `pyenchant`; `pip install enchant` does not install it.
sudo pip install pyenchant
import enchant
from nltk.metrics import edit_distance
class SpellingReplacer(object):
    """Replace misspelled words with enchant's top suggestion.

    A suggestion is accepted only when its edit distance from the original
    word does not exceed ``max_dist``; otherwise the word is returned as-is.
    """

    def __init__(self, dict_name='en', max_dist=2):
        # Spell-checking dictionary for the given language code.
        self.spell_dict = enchant.Dict(dict_name)
        # BUG in the original: it hard-coded `self.max_dist = 2`,
        # silently ignoring the `max_dist` parameter.
        self.max_dist = max_dist

    def replace(self, word):
        """Return `word` corrected, or unchanged if no close suggestion exists."""
        # Correctly spelled words pass through untouched.
        if self.spell_dict.check(word):
            return word
        suggestions = self.spell_dict.suggest(word)
        # Only accept the top suggestion when it is close enough.
        if suggestions and edit_distance(word, suggestions[0]) <= self.max_dist:
            return suggestions[0]
        return word
# NOTE(review): the SpellingReplacer class above must be saved as
# replacers.py for this import to resolve — confirm the module path.
from replacers import SpellingReplacer
replacer = SpellingReplacer()
# 'insu' has no suggestion within max_dist, so it comes back unchanged;
# the bare string below is the captured REPL output of the call.
replacer.replace('insu')
'insu'
import Levenshtein

# Merge near-duplicate stems: when two adjacent rows are at least 80% similar
# (Levenshtein ratio), rewrite the first to match the second so that a later
# groupby().sum() collapses them into one row.
minDistance = 0.8  # similarity threshold — a ratio in [0, 1], despite the name
distance = -1

# NOTE(review): the loop bound of 1 compares only rows 0 and 1, exactly as the
# original `while (j < 1)` did — presumably it should scan every adjacent
# pair (range(len(uniqueWords) - 1)); confirm intent before generalizing.
for j in range(1):
    distance = Levenshtein.ratio(uniqueWords["Stemming Words"][j],
                                 uniqueWords["Stemming Words"][j + 1])
    if distance > minDistance:
        # .loc avoids pandas chained assignment (may silently write to a copy).
        # (The original also kept an unused `lastWord` copy — dropped here.)
        uniqueWords.loc[j, "Stemming Words"] = uniqueWords["Stemming Words"][j + 1]
uniqueWords
# Re-aggregate after the merge so now-identical stems are summed together.
# (The original fused both statements onto one line — a syntax error.)
uniqueWords = uniqueWords.groupby(['Stemming Words'], as_index=False).sum()
uniqueWords
机械节能产品生产企业官网模板...
大气智能家居家具装修装饰类企业通用网站模板...
礼品公司网站模板
宽屏简约大气婚纱摄影影楼模板...
蓝白WAP手机综合医院类整站源码(独立后台)...苏ICP备2024110244号-2 苏公网安备32050702011978号 增值电信业务经营许可证编号:苏B2-20251499 | Copyright 2018 - 2025 源码网商城 (www.ymwmall.com) 版权所有