from mailbox import mbox
import pandas as pd
def store_content(message, body=None, frame=None):
    """Add one e-mail message as a row and return the resulting DataFrame.

    Args:
        message: An ``email.message.Message``-style object (for example an
            item yielded by ``mailbox.mbox``). It must support header lookup
            by name, ``len()``, ``get_payload()`` and ``epilogue``.
        body: Body to store for this message. When ``None`` the decoded
            payload of ``message`` is used instead.
        frame: DataFrame to extend. Defaults to the module-level ``df`` so
            existing two-argument callers keep working.

    Returns:
        A new DataFrame holding the rows of ``frame`` plus one row for
        ``message``. If ``message`` has no headers, ``frame`` is returned
        unchanged.
    """
    if frame is None:
        frame = df
    # Compare against None explicitly: an empty body that the caller already
    # extracted is a legitimate value and must not trigger a second lookup.
    if body is None:
        body = message.get_payload(decode=True)
    # len() of a Message is its header count. With no headers there is
    # nothing to record, so hand the frame back instead of failing or
    # returning nothing and clobbering the caller's accumulator.
    if not len(message):
        return frame
    contents = {
        "subject": message['subject'] or "",
        "body": body,
        "from": message['from'],
        "to": message['to'],
        "date": message['date'],
        "labels": message['X-Gmail-Labels'],
        "epilogue": message.epilogue,
    }
    # DataFrame.append was removed in pandas 2.0; concat works on old and
    # new pandas alike and likewise returns a new frame.
    row = pd.DataFrame([contents])
    return pd.concat([frame, row], ignore_index=True)
# Create an empty DataFrame with the relevant columns
df = pd.DataFrame(
    columns=("subject", "body", "from", "to", "date", "labels", "epilogue"))

# Import your downloaded mbox file
box = mbox('All mail Including Spam and Trash.mbox')

# Messages that raised while being processed, kept for later inspection.
fails = []
for message in box:
    try:
        if message.get_content_type() == 'text/plain':
            df = store_content(message)
        elif message.is_multipart():
            # Grab any plaintext from multipart messages
            for part in message.get_payload():
                if part.get_content_type() == 'text/plain':
                    df = store_content(message, part.get_payload(decode=True))
                    # Only the first text/plain part of a message is stored.
                    break
    except Exception:
        # Best-effort import: remember the offending message and carry on.
        # Catching Exception rather than using a bare ``except`` lets
        # Ctrl-C (KeyboardInterrupt) and SystemExit stop a long run.
        fails.append(message)
# Top 10 most common subject words
from collections import Counter

# Lower-case every subject and join them into one space-separated string.
# str.join runs in a single pass and gives "" for an empty frame, whereas
# summing a Series of strings re-copies the growing text on each addition
# and does not produce a string when there are no rows.
subject_word_bag = " ".join(t.lower() for t in df.subject)
# most_common(10) returns the same ten entries as most_common()[:10].
Counter(subject_word_bag.split()).most_common(10)
[('re:', 8508), ('-', 1188), ('the', 819), ('fwd:', 666), ('to', 572), ('new', 530), ('your', 528), ('for', 498), ('a', 463), ('course', 452)]
from nltk.corpus import stopwords

# English stop words plus the mail-specific noise tokens seen in subjects.
# A set makes each membership test below constant-time instead of a scan
# through the whole list. The Python-2-only ``unicode()`` wrapper is dropped
# so the line runs on Python 2 and 3 alike.
# NOTE(review): assumes stopwords.words() already returns text strings - confirm
# against the installed NLTK version.
stops = set(stopwords.words('english'))
stops.update(['re:', 'fwd:', '-'])
subject_words = [word for word in subject_word_bag.split() if word.lower() not in stops]
Counter(subject_words).most_common(10)
[('new', 530), ('course', 452), ('trackmaven', 334), ('question', 334), ('post', 286), ('content', 245), ('payment', 244), ('blog', 241), ('forum', 236), ('update', 220)]
from nltk import collocations

bigram_measures = collocations.BigramAssocMeasures()
bigram_finder = collocations.BigramCollocationFinder.from_words(subject_words)
# Discard bigrams that occur fewer than 20 times (this is a minimum-frequency
# cut-off, not a "top 20" limit); otherwise this will take a LONG time to
# analyze.
bigram_finder.apply_freq_filter(20)
# Show the ten highest-scoring bigrams under the raw-frequency measure.
# The parenthesised print works as a statement on Python 2 and as a function
# call on Python 3, with identical output.
for bigram in bigram_finder.score_ngrams(bigram_measures.raw_freq)[:10]:
    print(bigram)
(('forum', 'content'), 0.005839453284373725)
(('new', 'forum'), 0.005839453284373725)
(('blog', 'post'), 0.00538045695634435)
(('domain', 'names'), 0.004870461036311709)
(('alpha', 'release'), 0.0028304773561811506)
(('default', 'widget.'), 0.0026519787841697267)
(('purechat:', 'question'), 0.0026519787841697267)
(('using', 'default'), 0.0026519787841697267)
(('release', 'third'), 0.002575479396164831)
(('trackmaven', 'application'), 0.002524479804161567)
# Show the five best bigrams under the PMI measure. The parenthesised print
# is valid on both Python 2 and Python 3 and prints the same text.
for bigram in bigram_finder.nbest(bigram_measures.pmi, 5):
    print(bigram)
('4:30pm', '5pm')
('motley', 'fool')
('60,', '900,')
('population', 'cap')
('simple', 'goods')
机械节能产品生产企业官网模板...
大气智能家居家具装修装饰类企业通用网站模板...
礼品公司网站模板
宽屏简约大气婚纱摄影影楼模板...
蓝白WAP手机综合医院类整站源码(独立后台)...苏ICP备2024110244号-2 苏公网安备32050702011978号 增值电信业务经营许可证编号:苏B2-20251499 | Copyright 2018 - 2025 源码网商城 (www.ymwmall.com) 版权所有