古腾堡语料库:
import nltk

# List every file in the Gutenberg corpus sample (*.txt names).
file_ids = nltk.corpus.gutenberg.fileids()
print(file_ids)  # txt

# Load "Emma" as a token sequence, then wrap it in a Text object.
emma_tokens = nltk.corpus.gutenberg.words('austen-emma.txt')
print(emma_tokens, len(emma_tokens))
emma_text = nltk.Text(emma_tokens)
print(emma_text, len(emma_text))
from nltk.corpus import gutenberg

# For each Gutenberg text, report three stylistic statistics:
# average word length, average sentence length (in words), and the
# average number of times each vocabulary item is used.
for fileid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fileid))    # character count
    num_words = len(gutenberg.words(fileid))  # token count
    num_sents = len(gutenberg.sents(fileid))  # sentence count (each sentence is a list of tokens)
    num_vocab = len(set(w.lower() for w in gutenberg.words(fileid)))
    # Fixed: the original computed these integer ratios but discarded
    # them — print them so the loop actually reports anything.
    print(num_chars // num_words,   # average word length (writing style)
          num_words // num_sents,   # average words per sentence
          num_words // num_vocab,   # average occurrences per vocabulary item
          fileid)
(小说)
网络和聊天文本
from nltk.corpus import nps_chat

# 706 posts collected 2006-10-19 from a chat room for people in their 20s.
posts = nps_chat.posts('10-19-20s_706posts.xml')
print(posts[123])
from nltk.corpus import webtext

# Show the first 30 raw characters of every web-text file.
for fid in webtext.fileids():
    print(fid, webtext.raw(fid)[:30])
布朗语料库
(新闻社论等)
from nltk.corpus import brown
import nltk

print(brown.categories())

news_tokens = brown.words(categories='news')
print(news_tokens)

# Frequency of a few modal verbs in the news category (case-folded).
freq = nltk.FreqDist(w.lower() for w in news_tokens)
for modal in ['can', 'could', 'must', 'will']:
    print(freq[modal])

print(brown.words(fileids=['cg22']))
print(brown.sents(categories=['news', 'editorial', 'reviews']))
import nltk
from nltk.corpus import brown

# Conditional frequency distribution of (genre, word) pairs over the
# whole Brown corpus, so cfd[genre] is a FreqDist of that genre's words.
# (The original snippet was marked "报错"/errors — fixed below.)
cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()            # condition: genre
    for word in brown.words(categories=genre)  # fixed: keyword was misspelled 'catagories'
)

genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance']
modals = ['can', 'could', 'may', 'might', 'must', 'will']  # fixed: 'will' had a stray leading space

# tabulate() prints the table itself and returns None, so there is
# nothing to assign or print.  The keyword names are the plural
# 'conditions' / 'samples' (the singular forms raise TypeError).
cfd.tabulate(conditions=genres, samples=modals)
路透社语料库
(10788新闻文档,130w字,90个主题)
from nltk.corpus import reuters

# 10,788 news documents, ~1.3M words, 90 topic categories.
print(reuters.fileids())
print(reuters.categories())

# Categories of one or several documents.
print(reuters.categories("training/9865"))
print(reuters.categories(["training/9865", "training/9880"]))

# Documents belonging to one or several categories (union).
print(reuters.fileids('barley'))
print(reuters.fileids(['barley', 'corn']))

print(reuters.words("training/9865")[:14])
print(reuters.words(categories=['barley', 'corn']))
就职演说语料库
from nltk.corpus import inaugural

# File ids are '<year>-<president>.txt'; the first 4 chars are the year.
print(inaugural.fileids())
print([fid[:4] for fid in inaugural.fileids()])
from nltk.corpus import inaugural
import nltk

# For each inaugural address (keyed by its 4-digit year), count how
# many words start with 'america' or 'citizen'.
# (The original was marked "报错"/errors: it indexed the undefined
# name `file` instead of the loop variable `fileid`.)
cdf = nltk.ConditionalFreqDist(
    (target, fileid[:4])  # fixed: was `file[:4]`
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america', 'citizen']
    if w.lower().startswith(target)
)
cdf.plot()
标注文本语料库,下载地址
其它语言的语料库
import nltk

# Universal Declaration of Human Rights in 300+ languages/encodings.
#print(nltk.corpus.floresta.words())#error
print(nltk.corpus.udhr.fileids())  # language-encoding file names
print(nltk.corpus.udhr.words('Abkhaz-UTF8')[11:])
from nltk.corpus import udhr
import nltk

# Cumulative word-length distribution, one curve per language.
languages = ['Chickasaw', 'English', 'German_Deutsch',
             'Greenlandic_Inuktikut', 'Hungarian_Magyar']
length_events = ((lang, len(word))
                 for lang in languages
                 for word in udhr.words(lang + '-Latin1'))
cfd = nltk.ConditionalFreqDist(length_events)
cfd.plot(cumulative=True)
文本语料库的结构
from nltk.corpus import udhr, gutenberg
import nltk

# Three access levels for the same text: characters, words, sentences.
raw = gutenberg.raw("burgess-busterbrown.txt")
print(raw[1:20])

words = gutenberg.words("burgess-busterbrown.txt")
print(words[1:20])

sents = gutenberg.sents("burgess-busterbrown.txt")
print(sents[1:20])  # fixed: original re-printed `words` instead of `sents`
载入自己的语料库
from nltk.corpus import PlaintextCorpusReader

# Load your own corpus: every file under corpus_root matching the pattern.
corpus_root = '/usr/share/dict'
wordlists = PlaintextCorpusReader(corpus_root, '.*')
print(wordlists.fileids())
from nltk.corpus import BracketParseCorpusReader

# Read a local copy of the Penn Treebank WSJ .mrg files.
corpus_root = r"C:\corpora\mrg\wsj"
file_pattern = r".*/wsj_.*\.mrg"
ptb = BracketParseCorpusReader(corpus_root, file_pattern)

print(ptb.fileids())
print(len(ptb.sents()))
print(ptb.sents(fileids='20/wsj_2013.mrg')[19])
(条件, 事件)
文体计数词汇
import nltk  # count vocabulary by genre

# (genre, word) events over the Brown corpus; relies on `brown` being
# imported earlier in the file.
genre_word = ((g, w)
              for g in brown.categories()
              for w in brown.words(categories=g))
cfd = nltk.ConditionalFreqDist(genre_word)

print(cfd.conditions())       # the condition keys (first element of each pair)
print(cfd['humor'])           # e.g. <FreqDist with 5017 samples and 21695 outcomes>
print(list(cfd['humor']))     # the sample words themselves
print(cfd['humor']['could'])  # count of 'could' in the humor genre
分布图和分布表
from nltk.corpus import inaugural
import nltk

# One (target, year) event each time a word in an inaugural address
# begins with 'america' or 'citizen'.
targets = ('america', 'citizen')
cfd = nltk.ConditionalFreqDist(
    (t, fid[:4])
    for fid in inaugural.fileids()
    for word in inaugural.words(fid)
    for t in targets
    if word.lower().startswith(t)
)
from nltk.corpus import udhr
import nltk

languages = ['Chickasaw', 'English', 'German_Deutsch']
cfd = nltk.ConditionalFreqDist(
    (lang, len(token))
    for lang in languages
    for token in udhr.words(lang + '-Latin1')
)
# Cumulative counts of words shorter than 10 characters, per language.
cfd.tabulate(conditions=['English', 'German_Deutsch'],
             samples=range(10), cumulative=True)
产生随机文本
def generate_model(cfdist, word, num=15):
    """Greedily generate `num` tokens: print the current word, then step
    to its most likely successor according to `cfdist`."""
    for _ in range(num):
        print(word)
        # max() is the most likely token in this context.
        word = cfdist[word].max()
import nltk

# Condition on the first word of every bigram in Genesis (KJV), so
# cfd[w] is a FreqDist of the words that follow w.
kjv_tokens = nltk.corpus.genesis.words('english-kjv.txt')
cfd = nltk.ConditionalFreqDist(nltk.bigrams(kjv_tokens))
print(cfd['living'])  # successors of 'living'
generate_model(cfd, 'living')
图
linux的词汇语料库,在/usr/dict/words中
词汇列表语料库,删除所有现有词汇表中出现的元素,留下罕见或者拼写错误的词汇。
import nltk

def unusual_words(text):
    """Return the sorted alphabetic tokens of `text` that are absent from
    NLTK's English word list — i.e. rare words or misspellings."""
    seen = {w.lower() for w in text if w.isalpha()}
    known = {w.lower() for w in nltk.corpus.words.words()}
    return sorted(seen - known)

print(unusual_words(nltk.corpus.gutenberg.words('austen-sense.txt')))
print(unusual_words(nltk.corpus.nps_chat.words()))
停用词,包括高频词(the、to、and等)进一步文档过滤
from nltk.corpus import stopwords
import nltk

# High-frequency function words (the, to, and, ...) used for filtering.
english_stops = stopwords.words('english')
print(english_stops)

tokens = nltk.corpus.reuters.words()
non_stop = [t for t in tokens if t.lower() not in english_stops]
# Fraction of the corpus that is NOT stopwords.
print(len(non_stop) / len(tokens))