# 台大这门深度学习自然语言处理课程，可能被低估了

# 词向量游戏：一些有意思的例子

=======不错的词类比（Word Analogy）例子======

# 玩转腾讯词向量：Game of Words（词语的加减游戏）

get_nns_by_vector(v, n, search_k=-1, include_distances=False): same as get_nns_by_item, but queries by the vector v instead of by an item index.

 In [49]: result = tc_wv_model.most_similar(positive=[u'国王', u'女人'], negative ...: =[u'男人'])   In [50]: print("%s\t%.4f" % result[0]) 王后 0.7050

 In [53]: result = tc_wv_model.most_similar(positive=[u'皇帝', u'女人'], negative ...: =[u'男人'])   In [54]: print("%s\t%.4f" % result[0]) 皇后 0.8759

 In [60]: result = tc_wv_model.most_similar(positive=[u'机场', u'火车'], negative ...: =[u'飞机'])   In [61]: print("%s\t%.4f" % result[0]) 火车站 0.7885

# 腾讯词向量实战：通过Annoy进行索引和快速查询

Annoy (Approximate Nearest Neighbors Oh Yeah) is a C++ library with Python bindings to search for points in space that are close to a given query point. It also creates large read-only file-based data structures that are mmapped into memory so that many processes may share the same data.

 In [1]: import random   In [2]: from annoy import AnnoyIndex   # f是向量维度 In [3]: f = 20   In [4]: t = AnnoyIndex(f)   In [5]: for i in xrange(100): ...: v = [random.gauss(0, 1) for z in xrange(f)] ...: t.add_item(i, v) ...:   In [6]: t.build(10) Out[6]: True   In [7]: t.save('test.ann.index') Out[7]: True   In [8]: print(t.get_nns_by_item(0, 10)) [0, 45, 16, 17, 61, 24, 48, 20, 29, 84]   # 此处测试从硬盘加载索引 In [10]: u = AnnoyIndex(f)   In [11]: u.load('test.ann.index') Out[11]: True   In [12]: print(u.get_nns_by_item(0, 10)) [0, 45, 16, 17, 61, 24, 48, 20, 29, 84]

# 玩转腾讯词向量：词语相似度计算和在线查询

from gensim.models.word2vec import KeyedVectors
# Load the word vectors from a plain-text word2vec-format file (binary=False).
# NOTE(review): `file` is not defined in this excerpt -- presumably the path to
# the downloaded embedding file; confirm before running.
wv_from_text = KeyedVectors.load_word2vec_format(file, binary=False)

# 相似词查询：玩转腾讯 AI Lab 中文词向量

# 维基百科语料中的词语相似度探索

WikiExtractor.py is a Python script that extracts and cleans text from a Wikipedia database dump.
The tool is written in Python and requires Python 2.7 or Python 3.3+ but no additional library.

WikiExtractor是一个Python 脚本，专门用于提取和清洗Wikipedia的dump数据，支持Python 2.7 或者 Python 3.3+，无额外依赖，安装和使用都非常方便：

git clone https://github.com/attardi/wikiextractor.git
cd wikiextractor/
sudo python setup.py install

WikiExtractor.py -o enwiki enwiki-latest-pages-articles.xml.bz2

 ...... INFO: 53665431 Pampapaul INFO: 53665433 Charles Frederick Zimpel INFO: Finished 11-process extraction of 5375019 articles in 8363.5s (642.7 art/s)

-b n[KMG], --bytes n[KMG] maximum bytes per output file (default 1M)

 #!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Pan Yang (panyangnlp@gmail.com)
# Copyright 2017 @ Yu Zhen
"""Train a word2vec model with gensim on a directory of extracted text files.

Usage:
    python train_word2vec_with_gensim.py data_path
"""

import gensim
import logging
import multiprocessing
import os
import re
import sys

from pattern.en import tokenize
from time import time

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

# Non-greedy match of anything between '<' and '>'; compiled once because
# cleanhtml() is called for every non-empty line of the corpus.
TAG_PATTERN = re.compile('<.*?>')


def cleanhtml(raw_html):
    """Return ``raw_html`` with every ``<...>`` span replaced by one space."""
    return TAG_PATTERN.sub(' ', raw_html)


class MySentences(object):
    """Re-iterable stream of token lists read from all files under a directory.

    Each iteration walks ``dirname`` recursively and yields one list of
    lower-cased, purely alphabetic tokens per non-empty input line.
    """

    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        for root, dirs, files in os.walk(self.dirname):
            for filename in files:
                file_path = os.path.join(root, filename)
                # Close each file deterministically instead of relying on
                # garbage collection; a corpus may contain thousands of files.
                # No ``encoding`` argument: Python 2's open() does not accept
                # one, and the original relied on the platform default.
                with open(file_path) as handle:
                    for line in handle:
                        sline = line.strip()
                        if not sline:
                            continue
                        rline = cleanhtml(sline)
                        # tokenize() output is joined back into one string so
                        # the line can be split on whitespace below.
                        tokenized_line = ' '.join(tokenize(rline))
                        # Keep alphabetic tokens only (drops numbers and
                        # punctuation).
                        yield [word
                               for word in tokenized_line.lower().split()
                               if word.isalpha()]


def main():
    """Train on the directory given as the only CLI argument and save results.

    Writes the gensim model to ``data/model/word2vec_gensim`` and the
    text-format vectors and vocabulary to ``data/model/word2vec_org`` and
    ``data/model/vocabulary``.
    """
    if len(sys.argv) != 2:
        print("Please use python train_word2vec_with_gensim.py data_path")
        sys.exit()
    data_path = sys.argv[1]
    begin = time()

    # Create the output directory up front so that a missing directory does
    # not make the save fail only after the (long) training run has finished.
    model_dir = "data/model"
    if not os.path.isdir(model_dir):
        os.makedirs(model_dir)

    sentences = MySentences(data_path)
    # NOTE(review): newer gensim releases renamed ``size`` to ``vector_size``;
    # confirm against the installed gensim version.
    model = gensim.models.Word2Vec(sentences,
                                   size=200,
                                   window=10,
                                   min_count=10,
                                   workers=multiprocessing.cpu_count())
    model.save("data/model/word2vec_gensim")
    model.wv.save_word2vec_format("data/model/word2vec_org",
                                  "data/model/vocabulary",
                                  binary=False)

    end = time()
    # Single-argument print(...) behaves the same on Python 2.7 and Python 3.
    print("Total procesing time: %d seconds" % (end - begin))


if __name__ == '__main__':
    main()

python train_word2vec_with_gensim.py enwiki

 2017-04-22 14:31:04,703 : INFO : collecting all words and their counts 2017-04-22 14:31:04,704 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types 2017-04-22 14:31:06,442 : INFO : PROGRESS: at sentence #10000, processed 480546 words, keeping 33925 word types 2017-04-22 14:31:08,104 : INFO : PROGRESS: at sentence #20000, processed 983240 words, keeping 51765 word types 2017-04-22 14:31:09,685 : INFO : PROGRESS: at sentence #30000, processed 1455218 words, keeping 64982 word types 2017-04-22 14:31:11,349 : INFO : PROGRESS: at sentence #40000, processed 1957479 words, keeping 76112 word types ...... 2017-04-23 02:50:59,844 : INFO : worker thread finished; awaiting finish of 2 more threads 2017-04-23 02:50:59,844 : INFO : worker thread finished; awaiting finish of 1 more threads 2017-04-23 02:50:59,854 : INFO : worker thread finished; awaiting finish of 0 more threads 2017-04-23 02:50:59,854 : INFO : training on 8903084745 raw words (6742578791 effective words) took 37805.2s, 178351 effective words/s 2017-04-23 02:50:59,855 : INFO : saving Word2Vec object under data/model/word2vec_gensim, separately None 2017-04-23 02:50:59,855 : INFO : not storing attribute syn0norm 2017-04-23 02:50:59,855 : INFO : storing np array 'syn0' to data/model/word2vec_gensim.wv.syn0.npy 2017-04-23 02:51:00,241 : INFO : storing np array 'syn1neg' to data/model/word2vec_gensim.syn1neg.npy 2017-04-23 02:51:00,574 : INFO : not storing attribute cum_table 2017-04-23 02:51:13,886 : INFO : saved data/model/word2vec_gensim 2017-04-23 02:51:13,886 : INFO : storing vocabulary in data/model/vocabulary 2017-04-23 02:51:17,480 : INFO : storing 868777x200 projection weights into data/model/word2vec_org Total procesing time: 44476 seconds

 textminer@textminer:/opt/wiki/data$ ipython Python 2.7.12 (default, Nov 19 2016, 06:48:10) Type "copyright", "credits" or "license" for more information.   IPython 2.4.1 -- An enhanced Interactive Python. ? -> Introduction and overview of IPython's features. %quickref -> Quick reference. help -> Python's own help system. object? -> Details about 'object', use 'object??' for extra details.   In [1]: from gensim.models import Word2Vec   In [2]: en_wiki_word2vec_model = Word2Vec.load('data/model/word2vec_gensim')

word:

 In [3]: en_wiki_word2vec_model.most_similar('word') Out[3]: [('phrase', 0.8129693269729614), ('meaning', 0.7311851978302002), ('words', 0.7010501623153687), ('adjective', 0.6805518865585327), ('noun', 0.6461974382400513), ('suffix', 0.6440576314926147), ('verb', 0.6319557428359985), ('loanword', 0.6262609958648682), ('proverb', 0.6240501403808594), ('pronunciation', 0.6105246543884277)]

 In [4]: en_wiki_word2vec_model.most_similar('similarity') Out[4]: [('similarities', 0.8517599701881409), ('resemblance', 0.786037266254425), ('resemblances', 0.7496883869171143), ('affinities', 0.6571112275123596), ('differences', 0.6465682983398438), ('dissimilarities', 0.6212711930274963), ('correlation', 0.6071442365646362), ('dissimilarity', 0.6062943935394287), ('variation', 0.5970577001571655), ('difference', 0.5928016901016235)]

nlp:

 In [5]: en_wiki_word2vec_model.most_similar('nlp') Out[5]: [('neurolinguistic', 0.6698148250579834), ('psycholinguistic', 0.6388964056968689), ('connectionism', 0.6027182936668396), ('semantics', 0.5866401195526123), ('connectionist', 0.5865628719329834), ('bandler', 0.5837364196777344), ('phonics', 0.5733655691146851), ('psycholinguistics', 0.5613113641738892), ('bootstrapping', 0.559638261795044), ('psychometrics', 0.5555593967437744)]

 In [6]: en_wiki_word2vec_model.most_similar('learn') Out[6]: [('teach', 0.7533557415008545), ('understand', 0.71148681640625), ('discover', 0.6749690771102905), ('learned', 0.6599283218383789), ('realize', 0.6390970349311829), ('find', 0.6308424472808838), ('know', 0.6171890497207642), ('tell', 0.6146825551986694), ('inform', 0.6008728742599487), ('instruct', 0.5998791456222534)]

man:

 In [7]: en_wiki_word2vec_model.most_similar('man') Out[7]: [('woman', 0.7243080735206604), ('boy', 0.7029494047164917), ('girl', 0.6441491842269897), ('stranger', 0.63275545835495), ('drunkard', 0.6136815547943115), ('gentleman', 0.6122575998306274), ('lover', 0.6108279228210449), ('thief', 0.609005331993103), ('beggar', 0.6083744764328003), ('person', 0.597919225692749)]

 In [8]: en_wiki_word2vec_model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1) Out[8]: [('queen', 0.7752252817153931)]   In [9]: en_wiki_word2vec_model.similarity('woman', 'man') Out[9]: 0.72430799548282099   In [10]: en_wiki_word2vec_model.doesnt_match("breakfast cereal dinner lunch".split()) Out[10]: 'cereal'

# 斯坦福大学深度学习与自然语言处理第四讲：词窗口分类和神经网络

