>>> import nltk >>> nltk.download()
>>> sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
>>> paragraph = "The first time I heard that song was in Hawaii on radio.
... I was just a kid, and loved it very much! What a fantastic song!"
>>> sentences = sent_tokenizer.tokenize(paragraph)
>>> sentences
['The first time I heard that song was in Hawaii on radio.',
'I was just a kid, and loved it very much!',
'What a fantastic song!']
>>> from nltk.tokenize import WordPunctTokenizer >>> sentence = "Are you old enough to remember Michael Jackson attending ... the Grammys with Brooke Shields and Webster sat on his lap during the show?" >>> words = WordPunctTokenizer().tokenize(sentence) >>> words ['Are', 'you', 'old', 'enough', 'to', 'remember', 'Michael', 'Jackson', 'attending', 'the', 'Grammys', 'with', 'Brooke', 'Shields', 'and', 'Webster', 'sat', 'on', 'his', 'lap', 'during', 'the', 'show', '?']
>>> text = 'That U.S.A. poster-print costs $12.40...'
>>> text = 'That U.S.A. poster-print costs $12.40...' >>> pattern = r'''''(?x) # set flag to allow verbose regexps ... ([A-Z]\.)+ # abbreviations, e.g. U.S.A. ... | \w+(-\w+)* # words with optional internal hyphens ... | \$?\d+(\.\d+)?%? # currency and percentages, e.g. $12.40, 82% ... | \.\.\. # ellipsis ... | [][.,;"'?():-_`] # these are separate tokens; includes ], [ ... ''' >>> nltk.regexp_tokenize(text, pattern)
['That', 'U.S.A.', 'poster-print', 'costs', '$12.40', '...']
[('', '', ''),
('A.', '', ''),
('', '-print', ''),
('', '', ''),
('', '', '.40'),
('', '', '')]
pattern = r"""(?x) # set flag to allow verbose regexps
(?:[A-Z]\.)+ # abbreviations, e.g. U.S.A.
|\d+(?:\.\d+)?%? # numbers, incl. currency and percentages
|\w+(?:[-']\w+)* # words w/ optional internal hyphens/apostrophe
|\.\.\. # ellipsis
|(?:[.,;"'?():-_`]) # special characters with meanings
"""
>>> nltk.regexp_tokenize(text, pattern) ['That', 'U.S.A.', 'poster-print', 'costs', '12.40', '...']
机械节能产品生产企业官网模板...
大气智能家居家具装修装饰类企业通用网站模板...
礼品公司网站模板
宽屏简约大气婚纱摄影影楼模板...
蓝白WAP手机综合医院类整站源码(独立后台)...苏ICP备2024110244号-2 苏公网安备32050702011978号 增值电信业务经营许可证编号:苏B2-20251499 | Copyright 2018 - 2025 源码网商城 (www.ymwmall.com) 版权所有