古腾堡语料库:
import nltk

# List every file in the Gutenberg corpus sample (*.txt names).
file_ids = nltk.corpus.gutenberg.fileids()
print(file_ids)  # txt

# Load "Emma" as a token sequence, then wrap it in a Text object.
emma_tokens = nltk.corpus.gutenberg.words('austen-emma.txt')
print(emma_tokens, len(emma_tokens))
emma_text = nltk.Text(emma_tokens)
print(emma_text, len(emma_text))
from nltk.corpus import gutenberg

# For each Gutenberg text, report three stylistic statistics:
# average word length, average sentence length (in words), and the
# average number of times each vocabulary item is used.
for fileid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fileid))    # character count
    num_words = len(gutenberg.words(fileid))  # token count
    num_sents = len(gutenberg.sents(fileid))  # sentence count (each sentence is a list of tokens)
    num_vocab = len(set(w.lower() for w in gutenberg.words(fileid)))
    # Fixed: the original computed these integer ratios but discarded
    # them — print them so the loop actually reports anything.
    print(num_chars // num_words,   # average word length (writing style)
          num_words // num_sents,   # average words per sentence
          num_words // num_vocab,   # average occurrences per vocabulary item
          fileid)
(小说)
网络和聊天文本
from nltk.corpus import nps_chat

# 706 posts collected 2006-10-19 from a chat room for people in their 20s.
posts = nps_chat.posts('10-19-20s_706posts.xml')
print(posts[123])
from nltk.corpus import webtext

# Show the first 30 raw characters of every web-text file.
for fid in webtext.fileids():
    print(fid, webtext.raw(fid)[:30])
布朗语料库
(新闻社论等)
from nltk.corpus import brown
import nltk

print(brown.categories())

news_tokens = brown.words(categories='news')
print(news_tokens)

# Frequency of a few modal verbs in the news category (case-folded).
freq = nltk.FreqDist(w.lower() for w in news_tokens)
for modal in ['can', 'could', 'must', 'will']:
    print(freq[modal])

print(brown.words(fileids=['cg22']))
print(brown.sents(categories=['news', 'editorial', 'reviews']))
import nltk
from nltk.corpus import brown

# Conditional frequency distribution of (genre, word) pairs over the
# whole Brown corpus, so cfd[genre] is a FreqDist of that genre's words.
# (The original snippet was marked "报错"/errors — fixed below.)
cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()            # condition: genre
    for word in brown.words(categories=genre)  # fixed: keyword was misspelled 'catagories'
)

genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance']
modals = ['can', 'could', 'may', 'might', 'must', 'will']  # fixed: 'will' had a stray leading space

# tabulate() prints the table itself and returns None, so there is
# nothing to assign or print.  The keyword names are the plural
# 'conditions' / 'samples' (the singular forms raise TypeError).
cfd.tabulate(conditions=genres, samples=modals)
路透社语料库
(10788新闻文档,130w字,90个主题)
from nltk.corpus import reuters

# 10,788 news documents, ~1.3M words, 90 topic categories.
print(reuters.fileids())
print(reuters.categories())

# Categories of one or several documents.
print(reuters.categories("training/9865"))
print(reuters.categories(["training/9865", "training/9880"]))

# Documents belonging to one or several categories (union).
print(reuters.fileids('barley'))
print(reuters.fileids(['barley', 'corn']))

print(reuters.words("training/9865")[:14])
print(reuters.words(categories=['barley', 'corn']))
就职演说语料库
from nltk.corpus import inaugural

# File ids are '<year>-<president>.txt'; the first 4 chars are the year.
print(inaugural.fileids())
print([fid[:4] for fid in inaugural.fileids()])
from nltk.corpus import inaugural
import nltk

# For each inaugural address (keyed by its 4-digit year), count how
# many words start with 'america' or 'citizen'.
# (The original was marked "报错"/errors: it indexed the undefined
# name `file` instead of the loop variable `fileid`.)
cdf = nltk.ConditionalFreqDist(
    (target, fileid[:4])  # fixed: was `file[:4]`
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america', 'citizen']
    if w.lower().startswith(target)
)
cdf.plot()
标注文本语料库,下载地址
其它语言的语料库
import nltk

# Universal Declaration of Human Rights in 300+ languages/encodings.
#print(nltk.corpus.floresta.words())#error
print(nltk.corpus.udhr.fileids())  # language-encoding file names
print(nltk.corpus.udhr.words('Abkhaz-UTF8')[11:])
from nltk.corpus import udhr
import nltk

# Cumulative word-length distribution, one curve per language.
languages = ['Chickasaw', 'English', 'German_Deutsch',
             'Greenlandic_Inuktikut', 'Hungarian_Magyar']
length_events = ((lang, len(word))
                 for lang in languages
                 for word in udhr.words(lang + '-Latin1'))
cfd = nltk.ConditionalFreqDist(length_events)
cfd.plot(cumulative=True)
文本语料库的结构
from nltk.corpus import udhr, gutenberg
import nltk

# Three access levels for the same text: characters, words, sentences.
raw = gutenberg.raw("burgess-busterbrown.txt")
print(raw[1:20])

words = gutenberg.words("burgess-busterbrown.txt")
print(words[1:20])

sents = gutenberg.sents("burgess-busterbrown.txt")
print(sents[1:20])  # fixed: original re-printed `words` instead of `sents`
载入自己的语料库
from nltk.corpus import PlaintextCorpusReader

# Load your own corpus: every file under corpus_root matching the pattern.
corpus_root = '/usr/share/dict'
wordlists = PlaintextCorpusReader(corpus_root, '.*')
print(wordlists.fileids())
from nltk.corpus import BracketParseCorpusReader

# Read a local copy of the Penn Treebank WSJ .mrg files.
corpus_root = r"C:\corpora\mrg\wsj"
file_pattern = r".*/wsj_.*\.mrg"
ptb = BracketParseCorpusReader(corpus_root, file_pattern)

print(ptb.fileids())
print(len(ptb.sents()))
print(ptb.sents(fileids='20/wsj_2013.mrg')[19])
(条件, 事件)
文体计数词汇
import nltk  # count vocabulary by genre

# (genre, word) events over the Brown corpus; relies on `brown` being
# imported earlier in the file.
genre_word = ((g, w)
              for g in brown.categories()
              for w in brown.words(categories=g))
cfd = nltk.ConditionalFreqDist(genre_word)

print(cfd.conditions())       # the condition keys (first element of each pair)
print(cfd['humor'])           # e.g. <FreqDist with 5017 samples and 21695 outcomes>
print(list(cfd['humor']))     # the sample words themselves
print(cfd['humor']['could'])  # count of 'could' in the humor genre
分布图和分布表
from nltk.corpus import inaugural
import nltk

# One (target, year) event each time a word in an inaugural address
# begins with 'america' or 'citizen'.
targets = ('america', 'citizen')
cfd = nltk.ConditionalFreqDist(
    (t, fid[:4])
    for fid in inaugural.fileids()
    for word in inaugural.words(fid)
    for t in targets
    if word.lower().startswith(t)
)
from nltk.corpus import udhr
import nltk

languages = ['Chickasaw', 'English', 'German_Deutsch']
cfd = nltk.ConditionalFreqDist(
    (lang, len(token))
    for lang in languages
    for token in udhr.words(lang + '-Latin1')
)
# Cumulative counts of words shorter than 10 characters, per language.
cfd.tabulate(conditions=['English', 'German_Deutsch'],
             samples=range(10), cumulative=True)
产生随机文本
def generate_model(cfdist, word, num=15):
    """Greedily generate `num` tokens: print the current word, then step
    to its most likely successor according to `cfdist`."""
    for _ in range(num):
        print(word)
        # max() is the most likely token in this context.
        word = cfdist[word].max()
import nltk

# Condition on the first word of every bigram in Genesis (KJV), so
# cfd[w] is a FreqDist of the words that follow w.
kjv_tokens = nltk.corpus.genesis.words('english-kjv.txt')
cfd = nltk.ConditionalFreqDist(nltk.bigrams(kjv_tokens))
print(cfd['living'])  # successors of 'living'
generate_model(cfd, 'living')
图
linux的词汇语料库,在/usr/dict/words中
词汇列表语料库,删除所有现有词汇表中出现的元素,留下罕见或者拼写错误的词汇。
import nltk

def unusual_words(text):
    """Return the sorted alphabetic tokens of `text` that are absent from
    NLTK's English word list — i.e. rare words or misspellings."""
    seen = {w.lower() for w in text if w.isalpha()}
    known = {w.lower() for w in nltk.corpus.words.words()}
    return sorted(seen - known)

print(unusual_words(nltk.corpus.gutenberg.words('austen-sense.txt')))
print(unusual_words(nltk.corpus.nps_chat.words()))
停用词,包括高频词(the、to、and等)进一步文档过滤
from nltk.corpus import stopwords
import nltk

# High-frequency function words (the, to, and, ...) used for filtering.
english_stops = stopwords.words('english')
print(english_stops)

tokens = nltk.corpus.reuters.words()
non_stop = [t for t in tokens if t.lower() not in english_stops]
# Fraction of the corpus that is NOT stopwords.
print(len(non_stop) / len(tokens))