>>> from nltk.tokenizer import * >>> t = Token(TEXT='This is my first test sentence') >>> WSTokenizer().tokenize(t, addlocs=True) # break on whitespace >>> print t['TEXT'] This is my first test sentence >>> print t['SUBTOKENS'] [<This>@[0:4c], <is>@[5:7c], <my>@[8:10c], <first>@[11:16c], <test>@[17:21c], <sentence>@[22:30c]] >>> t['foo'] = 'bar' >>> t <TEXT='This is my first test sentence', foo='bar', SUBTOKENS=[<This>@[0:4c], <is>@[5:7c], <my>@[8:10c], <first>@[11:16c], <test>@[17:21c], <sentence>@[22:30c]]> >>> print t['SUBTOKENS'][0] <This>@[0:4c] >>> print type(t['SUBTOKENS'][0]) <class 'nltk.token.SafeToken'>
>>> from nltk.probability import *
>>> article = Token(TEXT=open('cp-b17.txt').read())
>>> WSTokenizer().tokenize(article)
>>> freq = FreqDist()
>>> for word in article['SUBTOKENS']:
... freq.inc(word['TEXT'])
>>> freq.B()
1194
>>> freq.count('Python')
12
>>> cf = ConditionalFreqDist() >>> for word in article['SUBTOKENS']: ... cf[word['TEXT'][0]].inc(len(word['TEXT'])) ... >>> init_letters = cf.conditions() >>> init_letters.sort() >>> for c in init_letters[44:50]: ... print "Init %s:" % c, ... for length in range(1,6): ... print "len %d/%.2f," % (length,cf[c].freq(n)), ... print ... Init a: len 1/0.03, len 2/0.03, len 3/0.03, len 4/0.03, len 5/0.03, Init b: len 1/0.12, len 2/0.12, len 3/0.12, len 4/0.12, len 5/0.12, Init c: len 1/0.06, len 2/0.06, len 3/0.06, len 4/0.06, len 5/0.06, Init d: len 1/0.06, len 2/0.06, len 3/0.06, len 4/0.06, len 5/0.06, Init e: len 1/0.18, len 2/0.18, len 3/0.18, len 4/0.18, len 5/0.18, Init f: len 1/0.25, len 2/0.25, len 3/0.25, len 4/0.25, len 5/0.25,
>>> from nltk.stemmer.porter import PorterStemmer
>>> PorterStemmer().stem_word('complications')
'complic'
>>> from nltk.tokenizer import *
>>> article = Token(TEXT=open('cp-b17.txt').read())
>>> WSTokenizer().tokenize(article)
>>> from nltk.probability import *
>>> from nltk.stemmer.porter import *
>>> stemmer = PorterStemmer()
>>> stems = FreqDist()
>>> for word in article['SUBTOKENS']:
... stemmer.stem(word)
... stems.inc(word['STEM'].lower())
...
>>> word_stems = stems.samples()
>>> word_stems.sort()
>>> word_stems[20:40]
['"generator-bas', '"implement', '"lazili', '"magic"', '"partial',
'"pluggable"', '"primitives"', '"repres', '"secur', '"semi-coroutines."',
'"state', '"understand', '"weightless', '"whatev', '#', '#-----',
'#----------', '#-------------', '#---------------', '#b17:']
>>> article = TS().text_splitter(open('cp-b17.txt').read())
>>> stems = FreqDist()
>>> for word in article:
... stems.inc(stemmer.stem_word(word.lower()))
...
>>> word_stems = stems.samples()
>>> word_stems.sort()
>>> word_stems[60:80]
['bool', 'both', 'boundari', 'brain', 'bring', 'built', 'but', 'byte',
'call', 'can', 'cannot', 'capabl', 'capit', 'carri', 'case', 'cast',
'certain', 'certainli', 'chang', 'charm']
>>> from nltk.parser.chunk import ChunkedTaggedTokenizer >>> chunked = "[ the/DT little/JJ cat/NN ] sat/VBD on/IN [ the/DT mat/NN ]" >>> sentence = Token(TEXT=chunked) >>> tokenizer = ChunkedTaggedTokenizer(chunk_node='NP') >>> tokenizer.tokenize(sentence) >>> sentence['SUBTOKENS'][0] (NP: <the/DT> <little/JJ> <cat/NN>) >>> sentence['SUBTOKENS'][0]['NODE'] 'NP' >>> sentence['SUBTOKENS'][0]['CHILDREN'][0] <the/DT> >>> sentence['SUBTOKENS'][0]['CHILDREN'][0]['TAG'] 'DT' >>> chunk_structure = TreeToken(NODE='S', CHILDREN=sentence['SUBTOKENS']) (S: (NP: <the/DT> <little/JJ> <cat/NN>) <sat/VBD> <on/IN> (NP: <the/DT> <mat/NN>))
>>> rule1 = ChunkRule('<DT>?<JJ.*>*<NN.*>',
... 'Chunk optional det, zero or more adj, and a noun')
>>> chunkparser = RegexpChunkParser([rule1], chunk_node='NP', top_node='S')
>>> chunkparser.parse(sentence)
>>> print sent['TREE']
(S: (NP: <the/DT> <little/JJ> <cat/NN>)
<sat/VBD> <on/IN>
(NP: <the/DT> <mat/NN>))
>>> from nltk.parser.chart import *
>>> grammar = CFG.parse('''
... S -> NP VP
... VP -> V NP | VP PP
... V -> "saw" | "ate"
... NP -> "John" | "Mary" | "Bob" | Det N | NP PP
... Det -> "a" | "an" | "the" | "my"
... N -> "dog" | "cat" | "cookie"
... PP -> P NP
... P -> "on" | "by" | "with"
... ''')
>>> sentence = Token(TEXT='John saw a cat with my cookie')
>>> WSTokenizer().tokenize(sentence)
>>> parser = ChartParser(grammar, BU_STRATEGY, LEAF='TEXT')
>>> parser.parse_n(sentence)
>>> for tree in sentence['TREES']: print tree
(S:
(NP: <John>)
(VP:
(VP: (V: <saw>) (NP: (Det: <a>) (N: <cat>)))
(PP: (P: <with>) (NP: (Det: <my>) (N: <cookie>)))))
(S:
(NP: <John>)
(VP:
(V: <saw>)
(NP:
(NP: (Det: <a>) (N: <cat>))
(PP: (P: <with>) (NP: (Det: <my>) (N: <cookie>))))))
机械节能产品生产企业官网模板...
大气智能家居家具装修装饰类企业通用网站模板...
礼品公司网站模板
宽屏简约大气婚纱摄影影楼模板...
蓝白WAP手机综合医院类整站源码(独立后台)...苏ICP备2024110244号-2 苏公网安备32050702011978号 增值电信业务经营许可证编号:苏B2-20251499 | Copyright 2018 - 2025 源码网商城 (www.ymwmall.com) 版权所有