├── .gitignore ├── Daily_crawler ├── daily_crawler.cron ├── news_crawl.sh ├── push_urls.py └── start_crawl.sh ├── ETL ├── __init__.py ├── auto_embedding.py ├── auto_embedding_simhash.py ├── model_update.py ├── stop_words ├── train_model.py ├── train_step1.py └── train_step2.py ├── News_scrapy ├── __init__.py ├── __init__.pyc ├── __pycache__ │ ├── __init__.cpython-36.pyc │ ├── items.cpython-36.pyc │ ├── pipelines.cpython-36.pyc │ └── settings.cpython-36.pyc ├── items.py ├── items.pyc ├── middlewares.py ├── pipelines.py ├── pipelines.pyc ├── settings.py ├── settings.pyc └── spiders │ ├── 36dsj.py │ ├── 36kr.py │ ├── 36kr.pyc │ ├── NetEasetech.py │ ├── NetEasetech.pyc │ ├── __init__.py │ ├── __init__.pyc │ ├── __pycache__ │ ├── 36dsj.cpython-36.pyc │ ├── 36kr.cpython-36.pyc │ ├── NetEasetech.cpython-36.pyc │ ├── __init__.cpython-36.pyc │ ├── aliresearch.cpython-36.pyc │ ├── caixin.cpython-36.pyc │ ├── cyzone.cpython-36.pyc │ ├── dgtle.cpython-36.pyc │ ├── donews.cpython-36.pyc │ ├── economist.cpython-36.pyc │ ├── ftchinese.cpython-36.pyc │ ├── guokr.cpython-36.pyc │ ├── huxiu.cpython-36.pyc │ ├── ifeng.cpython-36.pyc │ ├── ifeng_finace.cpython-36.pyc │ ├── infoQ.cpython-36.pyc │ ├── jiqizhixin.cpython-36.pyc │ ├── lijiresearch.cpython-36.pyc │ ├── rsarxiv.cpython-36.pyc │ ├── sougou.cpython-36.pyc │ ├── syncedreview.cpython-36.pyc │ ├── tech2opi.cpython-36.pyc │ ├── technode.cpython-36.pyc │ ├── techqq.cpython-36.pyc │ ├── techreview.cpython-36.pyc │ ├── techsina.cpython-36.pyc │ ├── techsohu.cpython-36.pyc │ ├── tmtpost.cpython-36.pyc │ ├── vcbeat.cpython-36.pyc │ ├── xtecher.cpython-36.pyc │ ├── zaker.cpython-36.pyc │ └── zhidx.cpython-36.pyc │ ├── aitists.py │ ├── aliresearch.py │ ├── aliresearch.pyc │ ├── caixin.py │ ├── caixin.pyc │ ├── cyzone.py │ ├── cyzone.pyc │ ├── dgtle.py │ ├── dgtle.pyc │ ├── donews.py │ ├── economist.py │ ├── economist.pyc │ ├── ftchinese.py │ ├── ftchinese.pyc │ ├── guokr.py │ ├── huxiu.py │ ├── huxiu.pyc │ ├── ifeng.py │ ├── ifeng.pyc │ ├── ifeng_finace.py │ ├── ifeng_finace.pyc │ ├── infoQ.py │ ├── infoQ.pyc │ ├── jiqizhixin.py │ ├── jiqizhixin.pyc │ ├── lijiresearch.py │ ├── rsarxiv.py │ ├── rsarxiv.pyc │ ├── sougou.py │ ├── syncedreview.py │ ├── syncedreview.pyc │ ├── tech2opi.py │ ├── tech2opi.pyc │ ├── technode.py │ ├── techqq.py │ ├── techqq.pyc │ ├── techreview.py │ ├── techreview.pyc │ ├── techsina.py │ ├── techsina.pyc │ ├── techsohu.py │ ├── techsohu.pyc │ ├── tmtpost.py │ ├── vcbeat.py │ ├── vcbeat.pyc │ ├── xtecher.py │ ├── xtecher.pyc │ ├── zaker.py │ ├── zaker.pyc │ ├── zhidx.py │ └── zhidx.pyc ├── News_simhash ├── __init__.py ├── automatic_simhash.py ├── content_index.pkl ├── generate_simhash_index.py ├── near_duplicates.py ├── test.py └── title_index.pkl ├── News_statistics ├── news_count.json └── news_statistics.py ├── README.md ├── __init__.py ├── log ├── auto_embedding_simhash.log ├── crawler.log ├── news_count.log └── run_time.txt ├── scrapy.cfg ├── utilities.py └── 资讯来源_v1.xlsx /.gitignore: -------------------------------------------------------------------------------- 1 | /ETL/Model 2 | /News_data 3 | *.pyc 4 | -------------------------------------------------------------------------------- /Daily_crawler/daily_crawler.cron: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/Daily_crawler/daily_crawler.cron -------------------------------------------------------------------------------- /Daily_crawler/news_crawl.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/Daily_crawler/news_crawl.sh -------------------------------------------------------------------------------- /Daily_crawler/push_urls.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/Daily_crawler/push_urls.py -------------------------------------------------------------------------------- /Daily_crawler/start_crawl.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/Daily_crawler/start_crawl.sh -------------------------------------------------------------------------------- /ETL/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ETL/auto_embedding.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/ETL/auto_embedding.py -------------------------------------------------------------------------------- /ETL/auto_embedding_simhash.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/ETL/auto_embedding_simhash.py -------------------------------------------------------------------------------- /ETL/model_update.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ETL/stop_words: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/ETL/stop_words -------------------------------------------------------------------------------- /ETL/train_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/ETL/train_model.py -------------------------------------------------------------------------------- /ETL/train_step1.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/ETL/train_step1.py -------------------------------------------------------------------------------- /ETL/train_step2.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/ETL/train_step2.py -------------------------------------------------------------------------------- /News_scrapy/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /News_scrapy/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/__init__.pyc -------------------------------------------------------------------------------- /News_scrapy/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /News_scrapy/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /News_scrapy/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- /News_scrapy/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /News_scrapy/items.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/items.py -------------------------------------------------------------------------------- /News_scrapy/items.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/items.pyc -------------------------------------------------------------------------------- /News_scrapy/middlewares.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/middlewares.py -------------------------------------------------------------------------------- /News_scrapy/pipelines.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/pipelines.py -------------------------------------------------------------------------------- /News_scrapy/pipelines.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/pipelines.pyc -------------------------------------------------------------------------------- /News_scrapy/settings.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/settings.py -------------------------------------------------------------------------------- /News_scrapy/settings.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/settings.pyc -------------------------------------------------------------------------------- /News_scrapy/spiders/36dsj.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/36dsj.py -------------------------------------------------------------------------------- /News_scrapy/spiders/36kr.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/36kr.py -------------------------------------------------------------------------------- /News_scrapy/spiders/36kr.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/36kr.pyc -------------------------------------------------------------------------------- /News_scrapy/spiders/NetEasetech.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/NetEasetech.py -------------------------------------------------------------------------------- /News_scrapy/spiders/NetEasetech.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/NetEasetech.pyc -------------------------------------------------------------------------------- /News_scrapy/spiders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/__init__.py -------------------------------------------------------------------------------- /News_scrapy/spiders/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/__init__.pyc -------------------------------------------------------------------------------- /News_scrapy/spiders/__pycache__/36dsj.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/__pycache__/36dsj.cpython-36.pyc -------------------------------------------------------------------------------- /News_scrapy/spiders/__pycache__/36kr.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/__pycache__/36kr.cpython-36.pyc -------------------------------------------------------------------------------- /News_scrapy/spiders/__pycache__/NetEasetech.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/__pycache__/NetEasetech.cpython-36.pyc -------------------------------------------------------------------------------- /News_scrapy/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /News_scrapy/spiders/__pycache__/aliresearch.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/__pycache__/aliresearch.cpython-36.pyc -------------------------------------------------------------------------------- /News_scrapy/spiders/__pycache__/caixin.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/__pycache__/caixin.cpython-36.pyc -------------------------------------------------------------------------------- /News_scrapy/spiders/__pycache__/cyzone.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/__pycache__/cyzone.cpython-36.pyc -------------------------------------------------------------------------------- /News_scrapy/spiders/__pycache__/dgtle.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/__pycache__/dgtle.cpython-36.pyc -------------------------------------------------------------------------------- /News_scrapy/spiders/__pycache__/donews.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/__pycache__/donews.cpython-36.pyc -------------------------------------------------------------------------------- /News_scrapy/spiders/__pycache__/economist.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/__pycache__/economist.cpython-36.pyc -------------------------------------------------------------------------------- /News_scrapy/spiders/__pycache__/ftchinese.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/__pycache__/ftchinese.cpython-36.pyc -------------------------------------------------------------------------------- /News_scrapy/spiders/__pycache__/guokr.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/__pycache__/guokr.cpython-36.pyc -------------------------------------------------------------------------------- /News_scrapy/spiders/__pycache__/huxiu.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/__pycache__/huxiu.cpython-36.pyc -------------------------------------------------------------------------------- /News_scrapy/spiders/__pycache__/ifeng.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/__pycache__/ifeng.cpython-36.pyc -------------------------------------------------------------------------------- /News_scrapy/spiders/__pycache__/ifeng_finace.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/__pycache__/ifeng_finace.cpython-36.pyc -------------------------------------------------------------------------------- /News_scrapy/spiders/__pycache__/infoQ.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/__pycache__/infoQ.cpython-36.pyc -------------------------------------------------------------------------------- /News_scrapy/spiders/__pycache__/jiqizhixin.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/__pycache__/jiqizhixin.cpython-36.pyc -------------------------------------------------------------------------------- /News_scrapy/spiders/__pycache__/lijiresearch.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/__pycache__/lijiresearch.cpython-36.pyc -------------------------------------------------------------------------------- /News_scrapy/spiders/__pycache__/rsarxiv.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/__pycache__/rsarxiv.cpython-36.pyc -------------------------------------------------------------------------------- /News_scrapy/spiders/__pycache__/sougou.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/__pycache__/sougou.cpython-36.pyc -------------------------------------------------------------------------------- /News_scrapy/spiders/__pycache__/syncedreview.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/__pycache__/syncedreview.cpython-36.pyc -------------------------------------------------------------------------------- /News_scrapy/spiders/__pycache__/tech2opi.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/__pycache__/tech2opi.cpython-36.pyc -------------------------------------------------------------------------------- /News_scrapy/spiders/__pycache__/technode.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/__pycache__/technode.cpython-36.pyc -------------------------------------------------------------------------------- /News_scrapy/spiders/__pycache__/techqq.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/__pycache__/techqq.cpython-36.pyc -------------------------------------------------------------------------------- /News_scrapy/spiders/__pycache__/techreview.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/__pycache__/techreview.cpython-36.pyc -------------------------------------------------------------------------------- /News_scrapy/spiders/__pycache__/techsina.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/__pycache__/techsina.cpython-36.pyc -------------------------------------------------------------------------------- /News_scrapy/spiders/__pycache__/techsohu.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/__pycache__/techsohu.cpython-36.pyc -------------------------------------------------------------------------------- /News_scrapy/spiders/__pycache__/tmtpost.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/__pycache__/tmtpost.cpython-36.pyc -------------------------------------------------------------------------------- /News_scrapy/spiders/__pycache__/vcbeat.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/__pycache__/vcbeat.cpython-36.pyc -------------------------------------------------------------------------------- /News_scrapy/spiders/__pycache__/xtecher.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/__pycache__/xtecher.cpython-36.pyc -------------------------------------------------------------------------------- /News_scrapy/spiders/__pycache__/zaker.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/__pycache__/zaker.cpython-36.pyc -------------------------------------------------------------------------------- /News_scrapy/spiders/__pycache__/zhidx.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/__pycache__/zhidx.cpython-36.pyc -------------------------------------------------------------------------------- /News_scrapy/spiders/aitists.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/aitists.py -------------------------------------------------------------------------------- /News_scrapy/spiders/aliresearch.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/aliresearch.py -------------------------------------------------------------------------------- /News_scrapy/spiders/aliresearch.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/aliresearch.pyc -------------------------------------------------------------------------------- /News_scrapy/spiders/caixin.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/caixin.py -------------------------------------------------------------------------------- /News_scrapy/spiders/caixin.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/caixin.pyc -------------------------------------------------------------------------------- /News_scrapy/spiders/cyzone.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/cyzone.py -------------------------------------------------------------------------------- /News_scrapy/spiders/cyzone.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/cyzone.pyc -------------------------------------------------------------------------------- /News_scrapy/spiders/dgtle.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/dgtle.py -------------------------------------------------------------------------------- /News_scrapy/spiders/dgtle.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/dgtle.pyc -------------------------------------------------------------------------------- /News_scrapy/spiders/donews.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/donews.py -------------------------------------------------------------------------------- /News_scrapy/spiders/economist.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/economist.py -------------------------------------------------------------------------------- /News_scrapy/spiders/economist.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/economist.pyc -------------------------------------------------------------------------------- /News_scrapy/spiders/ftchinese.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/ftchinese.py -------------------------------------------------------------------------------- /News_scrapy/spiders/ftchinese.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/ftchinese.pyc -------------------------------------------------------------------------------- /News_scrapy/spiders/guokr.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/guokr.py -------------------------------------------------------------------------------- /News_scrapy/spiders/huxiu.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/huxiu.py -------------------------------------------------------------------------------- /News_scrapy/spiders/huxiu.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/huxiu.pyc -------------------------------------------------------------------------------- /News_scrapy/spiders/ifeng.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/ifeng.py -------------------------------------------------------------------------------- /News_scrapy/spiders/ifeng.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/ifeng.pyc -------------------------------------------------------------------------------- /News_scrapy/spiders/ifeng_finace.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/ifeng_finace.py -------------------------------------------------------------------------------- /News_scrapy/spiders/ifeng_finace.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/ifeng_finace.pyc -------------------------------------------------------------------------------- /News_scrapy/spiders/infoQ.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/infoQ.py -------------------------------------------------------------------------------- /News_scrapy/spiders/infoQ.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/infoQ.pyc -------------------------------------------------------------------------------- /News_scrapy/spiders/jiqizhixin.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/jiqizhixin.py -------------------------------------------------------------------------------- /News_scrapy/spiders/jiqizhixin.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/jiqizhixin.pyc -------------------------------------------------------------------------------- /News_scrapy/spiders/lijiresearch.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/lijiresearch.py -------------------------------------------------------------------------------- /News_scrapy/spiders/rsarxiv.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/rsarxiv.py -------------------------------------------------------------------------------- /News_scrapy/spiders/rsarxiv.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/rsarxiv.pyc -------------------------------------------------------------------------------- /News_scrapy/spiders/sougou.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/sougou.py -------------------------------------------------------------------------------- /News_scrapy/spiders/syncedreview.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/syncedreview.py -------------------------------------------------------------------------------- /News_scrapy/spiders/syncedreview.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/syncedreview.pyc -------------------------------------------------------------------------------- /News_scrapy/spiders/tech2opi.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/tech2opi.py -------------------------------------------------------------------------------- /News_scrapy/spiders/tech2opi.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/tech2opi.pyc -------------------------------------------------------------------------------- /News_scrapy/spiders/technode.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/technode.py -------------------------------------------------------------------------------- /News_scrapy/spiders/techqq.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/techqq.py -------------------------------------------------------------------------------- /News_scrapy/spiders/techqq.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/techqq.pyc -------------------------------------------------------------------------------- /News_scrapy/spiders/techreview.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/techreview.py -------------------------------------------------------------------------------- /News_scrapy/spiders/techreview.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/techreview.pyc -------------------------------------------------------------------------------- /News_scrapy/spiders/techsina.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/techsina.py -------------------------------------------------------------------------------- /News_scrapy/spiders/techsina.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/techsina.pyc -------------------------------------------------------------------------------- /News_scrapy/spiders/techsohu.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/techsohu.py -------------------------------------------------------------------------------- /News_scrapy/spiders/techsohu.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/techsohu.pyc -------------------------------------------------------------------------------- /News_scrapy/spiders/tmtpost.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/tmtpost.py -------------------------------------------------------------------------------- /News_scrapy/spiders/vcbeat.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/vcbeat.py -------------------------------------------------------------------------------- /News_scrapy/spiders/vcbeat.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/vcbeat.pyc -------------------------------------------------------------------------------- /News_scrapy/spiders/xtecher.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/xtecher.py -------------------------------------------------------------------------------- /News_scrapy/spiders/xtecher.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/xtecher.pyc -------------------------------------------------------------------------------- /News_scrapy/spiders/zaker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/zaker.py -------------------------------------------------------------------------------- /News_scrapy/spiders/zaker.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/zaker.pyc -------------------------------------------------------------------------------- /News_scrapy/spiders/zhidx.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/zhidx.py -------------------------------------------------------------------------------- /News_scrapy/spiders/zhidx.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_scrapy/spiders/zhidx.pyc -------------------------------------------------------------------------------- /News_simhash/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /News_simhash/automatic_simhash.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_simhash/automatic_simhash.py -------------------------------------------------------------------------------- /News_simhash/content_index.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_simhash/content_index.pkl -------------------------------------------------------------------------------- /News_simhash/generate_simhash_index.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_simhash/generate_simhash_index.py -------------------------------------------------------------------------------- /News_simhash/near_duplicates.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_simhash/near_duplicates.py -------------------------------------------------------------------------------- /News_simhash/test.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_simhash/test.py -------------------------------------------------------------------------------- /News_simhash/title_index.pkl: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /News_statistics/news_count.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_statistics/news_count.json -------------------------------------------------------------------------------- /News_statistics/news_statistics.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/News_statistics/news_statistics.py -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/README.md -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /log/auto_embedding_simhash.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/log/auto_embedding_simhash.log -------------------------------------------------------------------------------- /log/crawler.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/log/crawler.log -------------------------------------------------------------------------------- /log/news_count.log: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /log/run_time.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/log/run_time.txt -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/scrapy.cfg -------------------------------------------------------------------------------- /utilities.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/utilities.py -------------------------------------------------------------------------------- /资讯来源_v1.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Johnson0722/News_scrapy_redis/HEAD/资讯来源_v1.xlsx --------------------------------------------------------------------------------