├── predictive ├── __init__.py ├── tests.py ├── management │ ├── __init__.py │ └── commands │ │ ├── __init__.py │ │ ├── import_vocabulary.py │ │ └── analyze_articles.py ├── migrations │ ├── __init__.py │ ├── 0002_delete_vocabularyrelation.py │ ├── 0004_auto_20180114_1420.py │ ├── 0005_auto_20180205_1051.py │ ├── 0001_initial.py │ └── 0003_auto_20180113_2319.py ├── apps.py ├── urls.py ├── serializers.py ├── cache_keys.py ├── models.py ├── utils.py └── views.py ├── resource_collector ├── tests.py ├── views.py ├── __init__.py ├── crawler │ ├── __init__.py │ ├── multithread_crawler.py │ ├── voa.py │ └── utils.py ├── management │ ├── __init__.py │ └── commands │ │ ├── __init__.py │ │ ├── crawl_todays_article.py │ │ └── init_data.py ├── migrations │ ├── __init__.py │ └── 0001_initial.py ├── apps.py └── models.py ├── demo.gif ├── predictive_text ├── __init__.py ├── urls.py ├── wsgi.py └── settings.py ├── requirements.txt ├── manage.py ├── README.md ├── .gitignore └── api.apib /predictive/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /predictive/tests.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /resource_collector/tests.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /resource_collector/views.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /predictive/management/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /predictive/migrations/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /resource_collector/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /resource_collector/crawler/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /predictive/management/commands/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /resource_collector/management/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /resource_collector/migrations/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /resource_collector/management/commands/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JeffyLu/predictive-text/HEAD/demo.gif -------------------------------------------------------------------------------- /predictive_text/__init__.py: -------------------------------------------------------------------------------- 1 | import pymysql 2 | pymysql.install_as_MySQLdb() 3 | -------------------------------------------------------------------------------- /predictive/apps.py: 
class Command(BaseCommand):
    """Management command that starts the crawler with ``crawl_today=True``."""

    def handle(self, *args, **options):
        # Delegates entirely to run_crawler; no command-line options are
        # read here. NOTE(review): presumably this limits the crawl to the
        # articles published today -- confirm against run_crawler.
        run_crawler(crawl_today=True)
class Command(BaseCommand):
    """Management command that launches the crawler with a ``cpu`` value."""

    def add_arguments(self, parser):
        """Register the optional integer ``-c/--cpu`` option."""
        parser.add_argument('-c', '--cpu', type=int, dest='cpu')

    def handle(self, *args, **options):
        """Call ``run_crawler``; a missing (or zero) ``cpu`` falls back to 4."""
        requested = options.get('cpu')
        worker_count = requested if requested else 4
        run_crawler(cpu=worker_count)
import functools

# Naming convention for key builders: ``key_of_<namespace>``.
_KEY_FUNC_PREFIX = 'key_of_'


def apply_prefix(f):
    """Namespace the cache key returned by a ``key_of_<name>`` function.

    The decorated function's return value is rendered as
    ``'<name>.<value>'``, where ``<name>`` is the function's own name with a
    leading ``key_of_`` removed (the name is used unchanged when it does not
    start with that prefix).

    The prefix is removed by slicing rather than ``str.lstrip``: ``lstrip``
    treats its argument as a *set of characters*, so a name such as
    ``key_of_frequency`` would lose its leading ``f`` as well.
    """
    name = f.__name__
    if name.startswith(_KEY_FUNC_PREFIX):
        name = name[len(_KEY_FUNC_PREFIX):]

    @functools.wraps(f)
    def wrapper(*args, **kwargs):
        return '{}.{}'.format(name, f(*args, **kwargs))
    return wrapper


@apply_prefix
def key_of_vocabulary(word):
    """Return the cache key ``vocabulary.<word>``."""
    return word


@apply_prefix
def key_of_relation(vocab_id, next_vocab_id):
    """Return the cache key ``relation.<vocab_id>.<next_vocab_id>``."""
    return '{}.{}'.format(vocab_id, next_vocab_id)


@apply_prefix
def key_of_vocabulary_queryset(prefix):
    """Return the cache key ``vocabulary_queryset.<prefix>``."""
    return prefix


@apply_prefix
def key_of_phrase_queryset(word):
    """Return the cache key ``phrase_queryset.<word>``."""
    return word


@apply_prefix
def key_of_user_vocabulary(ip):
    """Return the cache key ``user_vocabulary.<ip>``."""
    return ip


@apply_prefix
def key_of_user_relation(ip):
    """Return the cache key ``user_relation.<ip>``."""
    return ip
class Article(models.Model):
    """A crawled article record keyed by a unique hash of its source URL."""

    source_url = models.CharField(max_length=255)
    source_url_hash = models.CharField(max_length=32, unique=True)
    is_used = models.BooleanField(default=False)
    created_at = models.DateTimeField(auto_now_add=True)
    updated_at = models.DateTimeField(auto_now=True)

    @property
    def article_content(self):
        """Return the stored text for this article, or ``''`` if none exists.

        The text lives in a separate ``ArticleContent`` row that is looked
        up by ``article_id`` on every access.
        """
        try:
            stored = ArticleContent.objects.get(article_id=self.pk)
        except ArticleContent.DoesNotExist:
            return ''
        else:
            return stored.content
class Command(BaseCommand):
    """Import words from a text file (one word per line) into ``Vocabulary``."""

    def add_arguments(self, parser):
        """Register the ``-f/--filename`` option naming the word list."""
        parser.add_argument('-f', '--filename', dest='filename')

    def handle(self, *args, **options):
        """Create one ``Vocabulary`` row per usable line of the given file.

        Blank lines and words of 32 or more characters are skipped. Words
        whose insertion raises ``IntegrityError`` (for example duplicates of
        the unique ``word`` column) are ignored. The number of rows actually
        created is printed at the end.
        """
        # When -f is omitted the option is present with value None, so a
        # ``.get(..., '')`` default never applies; normalise before stripping.
        filename = (options.get('filename') or '').strip()
        if not filename:
            print('no filename given (use -f/--filename)')
            return

        try:
            source = open(filename, 'r')
        except FileNotFoundError:
            print('{} not found'.format(filename))
            return

        imported = 0
        with source:
            for line in source:
                word = line.strip()
                if not word or len(word) >= 32:
                    continue
                try:
                    Vocabulary.objects.create(word=word)
                except IntegrityError:
                    # Already present (or otherwise rejected): skip silently,
                    # the import is meant to be re-runnable.
                    continue
                imported += 1
        print('total imported: {}'.format(imported))
class Migration(migrations.Migration):
    """Redefine ``vocab_id`` as a plain ``BigIntegerField`` on shards 0-3."""

    # Must run after the previous auto-generated migration of this app.
    dependencies = [
        ('predictive', '0003_auto_20180113_2319'),
    ]

    # The same AlterField is repeated once per ``vocabularyrelation_N`` model.
    operations = [
        migrations.AlterField(
            model_name='vocabularyrelation_0',
            name='vocab_id',
            field=models.BigIntegerField(),
        ),
        migrations.AlterField(
            model_name='vocabularyrelation_1',
            name='vocab_id',
            field=models.BigIntegerField(),
        ),
        migrations.AlterField(
            model_name='vocabularyrelation_2',
            name='vocab_id',
            field=models.BigIntegerField(),
        ),
        migrations.AlterField(
            model_name='vocabularyrelation_3',
            name='vocab_id',
            field=models.BigIntegerField(),
        ),
    ]
class Migration(migrations.Migration):
    """Initial schema: the ``Vocabulary`` and ``VocabularyRelation`` tables."""

    initial = True

    dependencies = [
    ]

    operations = [
        # One row per word; ``word`` is unique and at most 32 characters.
        migrations.CreateModel(
            name='Vocabulary',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('word', models.CharField(max_length=32, unique=True)),
                ('frequency', models.IntegerField(default=0)),
                ('created_at', models.DateTimeField(auto_now_add=True)),
                ('updated_at', models.DateTimeField(auto_now=True)),
            ],
        ),
        # Word-pair rows referencing vocabulary ids as plain integers
        # (no ForeignKey); ``vocab_id`` is indexed.
        migrations.CreateModel(
            name='VocabularyRelation',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('vocab_id', models.BigIntegerField(db_index=True)),
                ('next_vocab_id', models.BigIntegerField()),
                ('frequency', models.IntegerField(default=0)),
                ('created_at', models.DateTimeField(auto_now_add=True)),
                ('updated_at', models.DateTimeField(auto_now=True)),
            ],
        ),
    ]
class Migration(migrations.Migration):
    """Initial schema: the ``Article`` and ``ArticleContent`` tables."""

    initial = True

    dependencies = [
    ]

    operations = [
        # Article metadata; ``source_url_hash`` is unique.
        migrations.CreateModel(
            name='Article',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('source_url', models.CharField(max_length=255)),
                ('source_url_hash', models.CharField(max_length=32, unique=True)),
                ('is_used', models.BooleanField(default=False)),
                ('created_at', models.DateTimeField(auto_now_add=True)),
                ('updated_at', models.DateTimeField(auto_now=True)),
            ],
        ),
        # Article text, linked by a unique integer ``article_id``
        # (no ForeignKey).
        migrations.CreateModel(
            name='ArticleContent',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('article_id', models.BigIntegerField(unique=True)),
                ('content', models.TextField()),
                ('created_at', models.DateTimeField(auto_now_add=True)),
                ('updated_at', models.DateTimeField(auto_now=True)),
            ],
        ),
    ]
class MultiThreadCrawler(threading.Thread):
    """Worker thread that drains a shared queue of crawl tasks.

    Every queue item is a ``(task, task_type)`` tuple:

    * ``TASK_TYPE_ARTICLE``  -- ``task`` is an article URL; its content is
      fetched through ``crawler`` and stored with ``save_article``.
    * ``TASK_TYPE_ARCHIVER`` -- ``task`` is passed to
      ``crawler.get_archiver_article_urls`` and each returned URL is queued
      as a new article task.

    The thread exits as soon as it finds the queue empty.
    """

    TASK_TYPE_ARTICLE = 0
    TASK_TYPE_ARCHIVER = 1

    def __init__(self, crawler, task_queue):
        super(MultiThreadCrawler, self).__init__()
        self.task_queue = task_queue
        self.crawler = crawler
        # md5 hashes of URLs whose save already failed once in this thread;
        # used so that a failing URL is re-queued at most one time.
        self.error_task = set()

    def save_article(self, url, content):
        """Persist *content* for *url*; return True on success.

        On a database ``InternalError`` the URL is put back on the queue
        (only the first time it fails in this thread) and False is returned.
        """
        url_hash = md5(url.encode('utf-8')).hexdigest()
        try:
            article, _ = Article.objects.get_or_create(
                source_url_hash=url_hash, defaults={'source_url': url})
            ArticleContent.objects.update_or_create(
                article_id=article.pk, defaults={'content': content})
            return True
        except InternalError as e:
            print('db internal error: {}'.format(str(e)))
            if url_hash not in self.error_task:
                self.task_queue.put((url, self.TASK_TYPE_ARTICLE))
                self.error_task.add(url_hash)
            return False

    def _handle_task(self, task, task_type):
        """Process one task; return False only when an article save failed."""
        if task_type == self.TASK_TYPE_ARTICLE:
            content = self.crawler.get_article_content_by_url(task)
            return self.save_article(task, content)
        if task_type == self.TASK_TYPE_ARCHIVER:
            for url in self.crawler.get_archiver_article_urls(task):
                self.task_queue.put((url, self.TASK_TYPE_ARTICLE))
        return True

    def run(self):
        """Consume tasks until the queue is empty.

        ``task_done()`` is called exactly once for every item taken from the
        queue -- including failed saves and tasks that raise -- so that
        ``Queue.join()`` on the shared queue cannot hang on this worker.
        """
        import queue  # local import: only needed for the Empty exception

        while True:
            # A non-blocking get replaces "check empty(), then get()": with
            # several workers another thread may take the last item between
            # the two calls, leaving a blocking get() waiting forever.
            try:
                task, task_type = self.task_queue.get_nowait()
            except queue.Empty:
                break

            try:
                succeeded = self._handle_task(task, task_type)
            finally:
                self.task_queue.task_done()

            if succeeded:
                print('remain: {}, current: {}'.format(
                    self.task_queue.qsize(), task))
class VocabularyRelation(models.Model):
    """Abstract "word -> next word" frequency row, sharded over several tables.

    Concrete models are produced by ``get_sharding_model`` -- one per shard,
    ``SHARDING_PIECE`` shards in total -- and memoised in ``SHARDING_MODEL``.
    Words are referenced by integer id only (no ForeignKey).
    """

    vocab_id = models.BigIntegerField()
    next_vocab_id = models.BigIntegerField()
    frequency = models.IntegerField(default=0)
    created_at = models.DateTimeField(auto_now_add=True)
    updated_at = models.DateTimeField(auto_now=True)

    # Number of shard tables, and the cache of generated model classes
    # keyed by class name.
    SHARDING_PIECE = 4
    SHARDING_MODEL = {}

    class Meta:
        unique_together = ['vocab_id', 'next_vocab_id']
        ordering = ['-frequency']
        abstract = True

    def __str__(self):
        return '{} {}: {}'.format(self.word, self.next_word, self.frequency)

    @property
    def word(self):
        """Text of the first word, or ``''`` if its Vocabulary row is missing."""
        first = Vocabulary.objects.filter(pk=self.vocab_id).first()
        if not first:
            return ''
        return first.word

    @property
    def next_word(self):
        """Text of the following word, or ``''`` if its row is missing."""
        following = Vocabulary.objects.filter(pk=self.next_vocab_id).first()
        if not following:
            return ''
        return following.word

    @classmethod
    def get_sharding_model(cls, sharding_key):
        """Return the concrete model for the shard ``sharding_key`` maps to.

        The shard index is ``sharding_key % SHARDING_PIECE``; the model class
        for an index is created on first request and reused afterwards.
        """
        shard = sharding_key % cls.SHARDING_PIECE
        model_name = 'VocabularyRelation_{}'.format(shard)

        existing = cls.SHARDING_MODEL.get(model_name)
        if existing is not None:
            return existing

        class Meta:
            db_table = 'predictive_vocabularyrelation_{}'.format(shard)
            unique_together = ['vocab_id', 'next_vocab_id']
            ordering = ['-frequency']

        namespace = {'__module__': cls.__module__, 'Meta': Meta}
        cls.SHARDING_MODEL[model_name] = type(model_name, (cls, ), namespace)
        return cls.SHARDING_MODEL[model_name]

    @classmethod
    def count(cls):
        """Return the total number of rows across all shard tables."""
        total = 0
        for shard in range(cls.SHARDING_PIECE):
            total += cls.get_sharding_model(shard).objects.count()
        return total
24 | 25 | MIDDLEWARE = [ 26 | 'django.middleware.security.SecurityMiddleware', 27 | 'django.contrib.sessions.middleware.SessionMiddleware', 28 | 'corsheaders.middleware.CorsMiddleware', 29 | 'django.middleware.common.CommonMiddleware', 30 | 'django.middleware.csrf.CsrfViewMiddleware', 31 | 'django.contrib.auth.middleware.AuthenticationMiddleware', 32 | 'django.middleware.clickjacking.XFrameOptionsMiddleware', 33 | ] 34 | 35 | REST_FRAMEWORK = { 36 | 'DEFAULT_PAGINATION_CLASS': 'predictive.utils.PageNumberPaginationExt', 37 | 'DEFAULT_RENDERER_CLASSES': ('rest_framework.renderers.JSONRenderer',), 38 | 'PAGE_SIZE': 10, 39 | } 40 | 41 | ROOT_URLCONF = 'predictive_text.urls' 42 | 43 | WSGI_APPLICATION = 'predictive_text.wsgi.application' 44 | 45 | DATABASES = { 46 | 'default': { 47 | 'ENGINE': 'django.db.backends.mysql', 48 | 'NAME': 'predictive_text', 49 | 'USER': 'root', 50 | 'PASSWORD': 'root123456', 51 | 'HOST': 'localhost', 52 | 'PORT': '3306', 53 | } 54 | } 55 | 56 | CACHES = { 57 | "default": { 58 | "BACKEND": "django_redis.cache.RedisCache", 59 | "LOCATION": "redis://127.0.0.1:6379/1", 60 | "KEY_PREFIX": "django.predictive_text.caches", 61 | "TIMEOUT": 60 * 60 * 24, 62 | "OPTIONS": { 63 | "CLIENT_CLASS": "django_redis.client.DefaultClient", 64 | "PICKLE_VERSION": -1, 65 | } 66 | } 67 | } 68 | 69 | CRONJOBS = [ 70 | ( 71 | '0 0 * * *', 72 | 'django.core.management.call_command', 73 | ['crawl_todays_article'], 74 | {}, 75 | '>> %s/tmp/cronjobs/crawl_todays_article.log' % BASE_DIR, 76 | ), 77 | ( 78 | '0 1 * * *', 79 | 'django.core.management.call_command', 80 | ['analyze_articles'], 81 | {}, 82 | '>> %s/tmp/cronjobs/analyze_articles.log' % BASE_DIR, 83 | ), 84 | ] 85 | 86 | LANGUAGE_CODE = 'zh-hans' 87 | 88 | TIME_ZONE = 'Asia/Shanghai' 89 | 90 | USE_I18N = True 91 | 92 | USE_L10N = True 93 | 94 | USE_TZ = False 95 | -------------------------------------------------------------------------------- /predictive/management/commands/analyze_articles.py: 
class Command(BaseCommand):
    """Fold word and word-pair counts from unused articles into the database."""

    def add_arguments(self, parser):
        """Register ``-a/--articles``: how many unused articles to process."""
        parser.add_argument('-a', '--articles', dest='articles', type=int)

    def analyze(self, article_content):
        """Yield ``(words, relations)`` for each sentence of *article_content*.

        ``words`` is the list of lower-cased tokens matching
        ``[0-9a-zA-Z']+``; ``relations`` is the list of adjacent
        ``(word, next_word)`` pairs within that sentence.
        """
        for sent in nltk.sent_tokenize(article_content):
            words = re.findall(r'[0-9a-zA-Z\']+', sent.lower())
            relations = list(zip(words, words[1:]))
            yield words, relations

    def handle(self, *args, **options):
        """Count words/pairs in up to N unused articles and persist the totals.

        Articles are marked ``is_used`` as they are read. Only words that
        already exist in ``Vocabulary`` are counted; pairs are stored only
        when both words are known. Updated rows are written to the cache.
        """
        task_num = options.get('articles') or 100
        vocab_counter = Counter()
        vocab_relation_counter = Counter()
        articles = Article.objects.filter(is_used=False)[:task_num]
        for article in articles:
            for words, relations in self.analyze(article.article_content):
                vocab_counter.update(words)
                vocab_relation_counter.update(relations)
            article.is_used = True
            article.save()

        # word -> Vocabulary pk, for the words that exist in the vocabulary.
        vocab_id_dict = {}
        for word, count in vocab_counter.items():
            key = key_of_vocabulary(word)
            vocab = cache.get(key)
            if not vocab:
                try:
                    vocab = Vocabulary.objects.get(word=word)
                except Vocabulary.DoesNotExist:
                    continue
            vocab.frequency = vocab.frequency + count
            vocab.save()
            cache.set(key, vocab)
            vocab_id_dict[word] = vocab.pk

        for (word, next_word), count in vocab_relation_counter.items():
            vocab_id = vocab_id_dict.get(word)
            next_vocab_id = vocab_id_dict.get(next_word)
            if not all([vocab_id, next_vocab_id]):
                continue
            key = key_of_relation(vocab_id, next_vocab_id)
            relation = cache.get(key)
            if not relation:
                model = VocabularyRelation.get_sharding_model(vocab_id)
                # get_or_create, not update_or_create: on a cache miss the
                # row may already exist in the database, and its stored
                # frequency must be added to rather than overwritten.
                relation, _ = model.objects.get_or_create(
                    vocab_id=vocab_id, next_vocab_id=next_vocab_id,
                    defaults={'frequency': 0})
            relation.frequency = relation.frequency + count
            relation.save()
            cache.set(key, relation)

        print('tasks: {}, vocabs: {}, relations: {}'.format(
            task_num, len(vocab_counter), len(vocab_relation_counter)))
task_queue.put( 43 | (self.url_root + url, MultiThreadCrawler.TASK_TYPE_ARTICLE) 44 | ) 45 | return task_queue 46 | 47 | def get_archiver_article_urls(self, page): 48 | url = self.url_standard.format(page) 49 | html = get_html(url, retry=self.retry, headers=self.headers, 50 | encoding=self.encoding) 51 | urls = re.findall( 52 | r'href="(/VOA_Standard_English/[a-zA-Z0-9_-]*?.html)"', html, re.S) 53 | return [self.url_root + u for u in urls] 54 | 55 | def get_article_content_by_url(self, url): 56 | html = get_html(url, retry=self.retry, headers=self.headers, 57 | encoding=self.encoding) 58 | contents = re.findall(r'