├── README.MD ├── cutdata.py ├── data ├── data.txt └── trained.txt ├── model └── classify.model ├── test.py └── train.py /README.MD: -------------------------------------------------------------------------------- 1 | # fasttext简单实践 2 | 3 | 4 | ## 运行环境: 5 | 6 | python3.5 7 | 8 | ## 依赖工具 9 | 10 | * [fastText](https://github.com/facebookresearch/fastText) 11 | * [jieba](https://github.com/fxsjy/jieba) 12 | 13 | 14 | ## 目录结构 15 | 16 | ``` 17 | project 18 | │ README.md 19 | │ cutdata.py #用于把data.txt转换为切割后trained.txt 20 | │ train.py #模型训练部分 21 | │ test.py #测试模型 22 | │ 23 | └───model 24 | │ │ classify.model #训练出来的模型 25 | │ 26 | └───data 27 | │ data.txt #未分词数据 28 | │ trained.txt #分词后的数据 29 | ``` -------------------------------------------------------------------------------- /cutdata.py: -------------------------------------------------------------------------------- 1 | import jieba 2 | 3 | inFile = 'data/data.txt' 4 | outFile = 'data/trained.txt' 5 | 6 | f = open(inFile,'r') 7 | writer = open(outFile,'w') 8 | for line in f.readlines(): 9 | splitor = line.split() 10 | # 默认切割方法 11 | text = jieba.cut_for_search(splitor[0]) 12 | text = " ".join(text) + " " + splitor[1] + '\n' 13 | writer.writelines(text) 14 | 15 | f.close() 16 | writer.close() 17 | -------------------------------------------------------------------------------- /model/classify.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mountainguan/fastText_learning/c5b9135951b7fc6d2ce62a7656fd67cdc75bd7e7/model/classify.model -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import jieba 2 | import fastText 3 | 4 | # input="石碣第十一中学" 5 | input = "东莞理工学院" 6 | # input = "胜和塘贝村村北十二巷1-103号" 7 | 8 | text = jieba.cut_for_search(input) 9 | text = " ".join(text) 10 | 11 | classify = fastText.load_model("model/classify.model") 12 | result = classify.predict(text) 13 | 14 | print("预测词:" + input + "\n") 15 | 16 | print("预测结果:") 17 | print(result) -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf8 -*- 2 | import logging 3 | import fastText 4 | import jieba 5 | # help(fastText.FastText) 6 | # exit() 7 | input="清华大学" 8 | 9 | text = jieba.cut_for_search(input) 10 | text = " ".join(text) 11 | 12 | # logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) 13 | 14 | classifier = fastText.train_supervised("data/trained.txt",epoch=10,ws=2 , lr=0.5, wordNgrams=1, dim=1000,label=u"__label__",loss=u'softmax') 15 | 16 | classifier.save_model("model/classify.model") 17 | 18 | result = classifier.predict(text) 19 | 20 | print("预测词:" + input + "\n") 21 | 22 | print("预测结果:") 23 | print( result) --------------------------------------------------------------------------------