├── .gitignore ├── .idea ├── crawler.iml ├── misc.xml ├── modules.xml ├── vcs.xml └── workspace.xml ├── README.md ├── crawler_log └── log.txt ├── crawler_sys ├── __init__ .py ├── framework │ ├── __init__.py │ ├── config │ │ ├── high_fre.ini │ │ ├── key_customer.ini │ │ ├── legal_platforms.ini │ │ ├── list_page_urls.ini │ │ ├── redis_key_register.ini │ │ ├── search_keywords - 副本.ini │ │ └── search_keywords.ini │ ├── update_data_in_redis_multi_process_auto_task.py │ ├── update_data_in_target_releasers_single_thread.py │ ├── video_fields_std.py │ ├── write_releaserUrl_to_crawler_redis.py │ └── write_releasers_to_redis.py ├── hot_words_crawler │ ├── __init__.py │ ├── crawler_douyin.py │ ├── crawler_haokan.py │ ├── crawler_kwai.py │ ├── crawler_main_task.py │ ├── crawler_new_tudou.py │ ├── crawler_qq_news.py │ ├── crawler_toutiao.py │ ├── crawler_v_qq.py │ └── crawler_wangyi_news.py ├── proxy_pool │ └── func_get_proxy_form_kuaidaili.py ├── site_crawler │ ├── __init__.py │ ├── chromedriver │ ├── crawler.new_tudouAPP.py │ ├── crawler.pepper.py │ ├── crawler_Mango.py │ ├── crawler_baidu.py │ ├── crawler_baijiahao.py │ ├── crawler_bilibili.py │ ├── crawler_bing.py │ ├── crawler_douyin.py │ ├── crawler_haokan.py │ ├── crawler_iqiyi.py │ ├── crawler_kwai.py │ ├── crawler_mango.py │ ├── crawler_miaopai.py │ ├── crawler_pear.py │ ├── crawler_peoplevideo.py │ ├── crawler_shenma.py │ ├── crawler_sogou.py │ ├── crawler_tencent_news.py │ ├── crawler_toutiao.py │ ├── crawler_tudou.py │ ├── crawler_v_qq.py │ ├── crawler_v_qq_doc.py │ ├── crawler_wangyi_news.py │ ├── crawler_watermelon.py │ ├── crawler_weibo │ │ ├── cookie_pool │ │ ├── crawler_weibo.py │ │ ├── ghostdriver.log │ │ └── login.py │ ├── crawler_ximalaya.py │ ├── crawler_xinhua.py │ ├── crawler_youku.py │ ├── func_get_releaser_id.py │ └── toutiao_get_signature.py ├── site_crawler_by_redis │ ├── __init__.py │ ├── chromedriver │ ├── crawler_baijiahao.py │ ├── crawler_douyin.py │ ├── crawler_haokan.py │ ├── crawler_kwai.py │ ├── crawler_tencent_news.py │ ├── crawler_toutiao.py │ ├── crawler_tudou.py │ ├── crawler_v_qq.py │ ├── crawler_wangyi_news.py │ └── toutiao_article.py ├── site_crawler_test │ ├── __init__.py │ ├── crawler_baijiahao.py │ ├── crawler_douyin.py │ ├── crawler_haokan.py │ ├── crawler_kwai.py │ ├── crawler_tencent_news.py │ ├── crawler_toutiao.py │ ├── crawler_tudou.py │ ├── crawler_v_qq.py │ └── crawler_wangyi_news.py ├── special_propose │ ├── __init__.py │ ├── crawler.v_qq_comment.py │ ├── crawler.v_qq_danmu.py │ ├── eastnews.py │ ├── get_weibo_user_info.py │ ├── tocsv.py │ ├── watcheast.py │ └── whether_user_exists.py ├── tools │ ├── add_releasers_video_data.py │ ├── add_releasers_video_data_multi_process.py │ ├── add_search_keywords_to_es.py │ ├── add_target_releasers_by_file.py │ ├── count_releaser_publish_num_and_refresh_crawler_frequency.py │ ├── craw_data_and_write_into_alltime_index.py │ ├── craw_data_and_write_into_alltime_index_by_date.py │ ├── craw_data_and_write_into_monthly_index.py │ ├── craw_data_and_write_into_monthly_index_remove_duplication.py │ ├── craw_data_and_write_into_weekly_index.py │ ├── craw_data_and_write_into_weekly_index_remove_duplication.py │ ├── crawler_url_video_info.py │ ├── error.log │ ├── find_high_fre_releaser.py │ ├── match_play_count_by_video_title.py │ ├── match_play_count_by_video_url.py │ ├── ocr_by_aliyun.py │ ├── ocr_by_img.py │ ├── tudou_selenium.py │ ├── video_num_count.rar │ ├── video_num_count │ │ ├── .idea │ │ │ ├── .gitignore │ │ │ ├── misc.xml │ │ │ ├── modules.xml │ │ │ ├── vcs.xml 
│ │ │ └── video_num_count.iml │ │ ├── 2019-09-10.csv │ │ ├── count.csv │ │ ├── count_main.py │ │ ├── crawler │ │ │ ├── 2019-09-10.csv │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-37.pyc │ │ │ │ ├── crawler_tencent_news.cpython-37.pyc │ │ │ │ ├── crawler_wangyi_news.cpython-37.pyc │ │ │ │ ├── func_get_releaser_id.cpython-37.pyc │ │ │ │ ├── haokan_count.cpython-37.pyc │ │ │ │ ├── qq_video_count.cpython-37.pyc │ │ │ │ ├── toutiao_count.cpython-37.pyc │ │ │ │ ├── trans_duration_str_to_second.cpython-37.pyc │ │ │ │ ├── trans_strtime_to_timestamp.cpython-37.pyc │ │ │ │ └── tudou_count.cpython-37.pyc │ │ │ ├── crawler_tencent_news.py │ │ │ ├── crawler_wangyi_news.py │ │ │ ├── func_get_releaser_id.py │ │ │ ├── haokan_count.py │ │ │ ├── haokan_青春旅社.csv │ │ │ ├── qq_video_count.py │ │ │ ├── toutiao_count.py │ │ │ ├── trans_duration_str_to_second.py │ │ │ ├── trans_strtime_to_timestamp.py │ │ │ ├── tudou_count.py │ │ │ └── 首次运行.bat │ │ └── haokan_看看新闻Knews.csv │ ├── write_data_to_targert_doc.py │ ├── write_feihua_json_to_es.py │ └── 头条重复数据导出.py └── utils │ ├── __init__ .py │ ├── connect_with_es.py │ ├── connect_with_redis.py │ ├── date_calculator.py │ ├── delete_repeat_data.py │ ├── extract_data_from_es.py │ ├── fakechrome.py │ ├── fix_album_play_count_by_releaser_page.py │ ├── func_verification_code.py │ ├── get_toutiao_as_cp_signature.py │ ├── output_log.py │ ├── output_results.py │ ├── parse_bool_for_args.py │ ├── releaser_url_check.py │ ├── remove_unrelated_parameter_tudou.py │ ├── trans_duration_str_to_second.py │ ├── trans_format.py │ ├── trans_str_play_count_to_int.py │ ├── trans_strtime_to_timestamp.py │ ├── util_logging.py │ ├── write_into_database.py │ ├── write_into_file.py │ ├── write_lsturl_info_conf.py │ ├── write_releasers_into_ini_from_csv.py │ └── write_search_keyword_info_conf.py ├── dev ├── CCTV_live │ ├── bilibili.py │ ├── toutiao.py │ └── yangshipin.py ├── Documentary_project │ ├── bilibili.py │ ├── iqiyi.py │ ├── tengxun.py │ └── youku.py ├── Movies_rank_project │ ├── crawler_douban.py │ └── crawler_maoyan.py ├── iqiyi_debug │ ├── test_aiohttp.py │ └── test_iqiyi_list_url.py └── post_dict.txt ├── tasks ├── check_high_play_count_data_source_v_qq.py ├── crontab │ └── 192.168.18.11 │ │ └── etc │ │ └── crontab ├── download_video_page.py ├── fix_album_play_count_by_getting_releaser_page.py ├── follower_num.csv ├── from_es_find_crawler_task.py ├── parse_list_page.py ├── parse_video_page.py ├── produce_releaser_report_from_es.py ├── sync_mayun_code_to_100.97.py ├── the_key_of_pid_in_redis.txt ├── update_DU_ATU_from_crawler_raw.py ├── write_key_releaser_to_week_doc_weekly.py ├── yangjingshu.csv ├── zhangminghui_dec.csv └── zhangminghui_dec2.csv └── test ├── qq_news └── remove_unrelated_parameter.py └── test_read_config.py /.gitignore: -------------------------------------------------------------------------------- 1 | /crawler_sys/framework/check_wrong_url_daily.py 2 | *.pyc 3 | -------------------------------------------------------------------------------- /.idea/crawler.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 16 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /.idea/modules.xml: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # crawler 2 | 爬虫项目: 3 | 主要爬取抖音,好看,快手,头条,土豆,网易新闻,qq视频等短视频数据 4 | 5 | 爬虫主要逻辑代码在 6 | ~~~ 7 | crawler_sys/site_crawler_by_redis 8 | ~~~ 9 | 10 | 调度工具 11 | ~~~ 12 | crawler_sys/framework 13 | ~~~ 14 | -------------------------------------------------------------------------------- /crawler_log/log.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_log/log.txt -------------------------------------------------------------------------------- /crawler_sys/__init__ .py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue June 6 18:02:43 2018 4 | 5 | @author: hanye 6 | """ 7 | 8 | -------------------------------------------------------------------------------- /crawler_sys/framework/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue May 15 13:59:44 2018 4 | 5 | 6 | @author: hanye 7 | """ 8 | -------------------------------------------------------------------------------- /crawler_sys/framework/config/high_fre.ini: -------------------------------------------------------------------------------- 1 | [haokan] 2 | 看看新闻knews = https://haokan.baidu.com/haokan/wiseauthor?app_id=1565285080839434 3 | 一手video = https://haokan.baidu.com/haokan/wiseauthor?app_id=1546617034936582 4 | 生活欢乐汇 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1567462926329612 5 | 消息直升机 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1589022360094300 6 | 万物杂谈 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1589022359827373 7 | 横漂一姐 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1611821585605765 8 | 吃瓜少女萌 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1611822155649253 9 | 人人视频 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1563996947927117 10 | 追剧小师妹 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1611822302181315 11 | 新娱乐萌主 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1611553722549281 12 | 探剧全能王 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1610928916930156 13 | 青春影剧场 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1611822943891552 14 | 肥仔电影圈 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1601813907799169 15 | 芒果tv = https://haokan.baidu.com/haokan/wiseauthor?app_id=1549963812551792 16 | 科技观察猿 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1567462927568986 17 | 撩剧大师 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1607132398047134 18 | 欧阳华北 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1547961620896856 19 | 澎湃新闻 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1574072008111128 20 | 娱乐不晚点 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1589022346878545 21 | 看剧小资姐 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1594361771699213 22 | 热剧宅急送 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1594353698233619 23 | 毒舌影视街 = 
https://haokan.baidu.com/haokan/wiseauthor?app_id=1611195426989529 24 | 江西网络广播电视台 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1577139229517527 25 | 热剧乐翻天 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1594362729953997 26 | 全球视频大魔王 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1597149386529756 27 | 精彩剧集大放送 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1593533634618523 28 | 影视水煮鱼 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1594257498968349 29 | 财经新鲜事 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1555591483551002 30 | 酷哥撩大剧 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1593525911983865 31 | 咸鱼说片 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1594356024003023 32 | 安徽海豚播报 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1611201539330357 33 | 看剧大球球 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1593542564661281 34 | 长沙政法频道 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1552122607183011 35 | 体坛先锋报 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1589022347820284 36 | 综艺杂货铺 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1611195671591267 37 | 视频展览馆 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1611195775370668 38 | 钱江视频 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1602058994708441 39 | 动漫铲屎官 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1594353045050780 40 | 荔枝新闻 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1558731555412280 41 | 武侠超新世界 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1613833957799396 -------------------------------------------------------------------------------- /crawler_sys/framework/config/key_customer.ini: -------------------------------------------------------------------------------- 1 | [haokan] 2 | 经视大直播 = https://haokan.hao123.com/haokan/wiseauthor?app_id=1610033451422491 3 | 湖北卫视资讯站 = https://haokan.hao123.com/haokan/wiseauthor?app_id=3513 4 | 湖北经视 = https://haokan.hao123.com/haokan/wiseauthor?app_id=1573243419235544 5 | 湖北卫视长江新闻号 = https://haokan.hao123.com/haokan/wiseauthor?app_id=1588754674509461 6 | 湖北卫视非正式会谈 = https://haokan.hao123.com/haokan/wiseauthor?app_id=1609570087894225 7 | 非正式课堂 = https://haokan.hao123.com/haokan/wiseauthor?app_id=1596696077980564 8 | 大王小湖北卫视 = https://haokan.hao123.com/haokan/wiseauthor?app_id=1609563211193403 9 | 长江云 = https://haokan.hao123.com/haokan/wiseauthor?app_id=1604613511192710 10 | 调解面对面life = https://haokan.hao123.com/haokan/wiseauthor?app_id=1578861535741379 11 | 我为喜剧疯狂 = https://haokan.hao123.com/haokan/wiseauthor?app_id=1610026230407548 12 | 湖北调解现场 = https://haokan.hao123.com/haokan/wiseauthor?app_id=1609575047119026 -------------------------------------------------------------------------------- /crawler_sys/framework/config/legal_platforms.ini: -------------------------------------------------------------------------------- 1 | [legal_platforms_to_update_production] 2 | legal_platforms = 腾讯新闻,kwai,miaopai,new_tudou,toutiao,haokan,腾讯视频,网易新闻,pearvideo,央视新闻+,人民日报,看了吗,youtube,facebook,新华社,youku,iqiyi,,bilibili,抖音,toutiao_microheadlines,toutiao_article 3 | -------------------------------------------------------------------------------- /crawler_sys/framework/config/list_page_urls.ini: -------------------------------------------------------------------------------- 1 | [new_tudou] 2 | 推荐 = http://www.tudou.com/api/getfeeds?secCateId=10016&utdid=T8v9EQPOimUCAXL%2FAz0YrDOB&page_size=24 3 | 乐活 = http://www.tudou.com/api/getfeeds?secCateId=10195&utdid=XA2EFIGslWoCAWp4y3KXcZh7&page_size=24 4 | 搞笑 = 
http://www.tudou.com/api/getfeeds?secCateId=622736331&utdid=XA2EFIGslWoCAWp4y3KXcZh7&page_size=24 5 | 影视 = http://www.tudou.com/api/getfeeds?secCateId=622769673&utdid=XA2EFIGslWoCAWp4y3KXcZh7&page_size=24 6 | 动漫 = http://www.tudou.com/api/getfeeds?secCateId=10116&utdid=XA2EFIGslWoCAWp4y3KXcZh7&page_size=24 7 | 身边 = http://www.tudou.com/api/getfeeds?secCateId=622621940&utdid=XA2EFIGslWoCAWp4y3KXcZh7&page_size=24 8 | 综娱 = http://www.tudou.com/api/getfeeds?secCateId=10198&utdid=XA2EFIGslWoCAWp4y3KXcZh7&page_size=24 9 | 音乐 = http://www.tudou.com/api/getfeeds?secCateId=622336449&utdid=XA2EFIGslWoCAWp4y3KXcZh7&page_size=24 10 | 游戏 = http://www.tudou.com/api/getfeeds?secCateId=10051&utdid=XA2EFIGslWoCAWp4y3KXcZh7&page_size=24 11 | 12 | [腾讯视频] 13 | 音乐 = http://v.qq.com/x/list/music 14 | 新闻 = http://v.qq.com/x/list/news 15 | 军事 = http://v.qq.com/x/list/military 16 | 娱乐 = http://v.qq.com/x/list/ent 17 | 体育 = http://v.qq.com/x/list/sports 18 | 游戏 = http://v.qq.com/x/list/games 19 | 搞笑 = http://v.qq.com/x/list/fun 20 | 时尚 = http://v.qq.com/x/list/fashion 21 | 生活 = http://v.qq.com/x/list/life 22 | 母婴 = http://v.qq.com/x/list/baby 23 | 汽车 = http://v.qq.com/x/list/auto 24 | 科技 = http://v.qq.com/x/list/tech 25 | 教育 = http://v.qq.com/x/list/education 26 | 财经 = http://v.qq.com/x/list/finance 27 | 房产 = http://v.qq.com/x/list/house 28 | 旅游 = http://v.qq.com/x/list/travel 29 | 王者荣耀 = http://v.qq.com/x/list/kings 30 | 31 | [toutiao] 32 | #only one list page and don't rely on list page url 33 | list_url = toutiao 34 | 35 | [iqiyi] 36 | 纪录片 = http://list.iqiyi.com/www/3/-------------4-fangyucheng-2-iqiyi--.html 37 | 游戏 = http://list.iqiyi.com/www/8/-------------4-fangyucheng-2-iqiyi--.html 38 | 资讯 = http://list.iqiyi.com/www/25/-------------4-fangyucheng-2-iqiyi-1-.html 39 | 娱乐 = http://list.iqiyi.com/www/7/-------------4-fangyucheng-2-iqiyi-1-.html 40 | 财经 = http://list.iqiyi.com/www/24/-------------4-fangyucheng-2-iqiyi--.html 41 | 片花-电影 = http://list.iqiyi.com/www/10/1007-------------4-fangyucheng-2--1-.html 42 | 片花-电视剧 = http://list.iqiyi.com/www/10/1006-------------4-fangyucheng-2--1-.html 43 | 音乐 = http://list.iqiyi.com/www/5/-------------4-fangyucheng-2-iqiyi--.html 44 | 军事 = http://list.iqiyi.com/www/28/-------------4-fangyucheng-2-iqiyi-1-.html 45 | 教育 = http://list.iqiyi.com/www/12/-------------4-fangyucheng-2-iqiyi-1-.html 46 | 体育 = http://list.iqiyi.com/www/17/-------------4-fangyucheng-2-iqiyi--.html 47 | 48 | [youku] 49 | 全部 = http://list.youku.com/category/video/c_0_d_1_s_2_p_fangyucheng.html 50 | 音乐 = http://list.youku.com/category/show/c_95_s_5_d_1_p_fangyucheng.html 51 | 资讯 = http://list.youku.com/category/video/c_91_d_1_s_2_p_fangyucheng.html 52 | 搞笑 = http://list.youku.com/category/video/c_94_d_1_s_2_p_fangyucheng.html 53 | 生活 = http://list.youku.com/category/video/c_103_d_1_s_2_p_fangyucheng.html 54 | 汽车 = http://list.youku.com/category/video/c_104_d_1_s_2_p_fangyucheng.html 55 | 科技 = http://list.youku.com/category/video/c_105_d_1_s_2_p_fangyucheng.html 56 | 时尚 = http://list.youku.com/category/video/c_89_d_1_s_2_p_fangyucheng.html 57 | 亲子 = http://list.youku.com/category/video/c_90_d_1_s_2_p_fangyucheng.html 58 | 旅游 = http://list.youku.com/category/video/c_88_d_1_s_2_p_fangyucheng.html 59 | 微电影 = http://list.youku.com/category/video/c_171_d_1_s_2_p_2_fangyucheng.html 60 | 网剧 = http://list.youku.com/category/video/c_172_d_1_s_2_p_2_fangyucheng.html 61 | 拍客 = http://list.youku.com/category/video/c_174_d_1_s_2_p_2_fangyucheng.html 62 | 创意视频 = 
http://list.youku.com/category/video/c_175_d_1_s_2_p_2_fangyucheng.html 63 | 自拍 = http://list.youku.com/category/video/c_176_d_1_s_2_p_2_fangyucheng.html 64 | 广告 = http://list.youku.com/category/video/c_102_d_1_s_2_p_2_fangyucheng.html -------------------------------------------------------------------------------- /crawler_sys/framework/config/redis_key_register.ini: -------------------------------------------------------------------------------- 1 | [v_qq] 2 | list_page_html = v_qq_list_page_html 3 | -------------------------------------------------------------------------------- /crawler_sys/framework/config/search_keywords - 副本.ini: -------------------------------------------------------------------------------- 1 | # For every keyword, there should be an search_pages value, in the same order 2 | # There should be NO SPACE around comma, because in some cases, there will be space within a keyword, 3 | # space cannot be just stripped out. 4 | 5 | [腾讯新闻] 6 | keywords = 致敬中国英雄 7 | search_pages = 20 8 | 9 | [腾讯视频] 10 | keywords = 致敬中国英雄 11 | search_pages = 20 12 | 13 | [new_tudou] 14 | keywords = 致敬中国英雄 15 | search_pages = 20 16 | 17 | [toutiao] 18 | keywords = 致敬中国英雄 19 | search_pages = 20 20 | 21 | [youku] 22 | keywords = 致敬中国英雄 23 | search_pages = 2 24 | 25 | [pearvideo] 26 | keywords = 任正非 BBC 27 | search_pages = 2 28 | 29 | [bilibili] 30 | keywords = 任正非 BBC 31 | search_pages = 2 32 | -------------------------------------------------------------------------------- /crawler_sys/framework/config/search_keywords.ini: -------------------------------------------------------------------------------- 1 | # For every keyword, there should be an search_pages value, in the same order 2 | # There should be NO SPACE around comma, because in some cases, there will be space within a keyword, 3 | # space cannot be just stripped out. 
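# Illustrative example only (not a real section): each platform block pairs the
# two keys positionally, so the Nth value of search_pages is the search depth
# used for the Nth keyword.
# [some_platform]
# keywords = 关键词A,关键 词B,关键词C
# search_pages = 20,5,2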
4 | 5 | [腾讯新闻] 6 | keywords = 2019东方卫视春晚,2019东方卫视跨年演唱会,BesTV百视通,SMG摄界,SMG阳阳STUDIO,第一财经,第一财经《财经早班车》,东方卫视,东方卫视-东方新闻,东方卫视-看东方,东方午新闻,动感101TV,话匣子,酱紫娱乐,交叉点看,究竟视频,剧说有毒,看东方,看看新闻Knews,可凡倾听,青春旅社,上海电视台新闻综合频道,新闻坊,游戏风云gamefy,最美公路,北京时间,时间财经,时间测评,时间国际视频,时间新闻,时间新闻视频,时间直播,大揭秘,大王小王湖北卫视,非正式课堂,湖北电视台《钓鱼频道》,湖北广电纪录片部,湖北经视,湖北调解面对面,经视大直播,经视好吃佬,经视乐生活,经视人家,调解现场,问新闻,笑啦,长江新闻号,长江新闻号湖北卫视,长江云,《奇兵神犬》,《我是大侦探》,风影车都,湖南电视剧频道,湖南电视台《风影车都》,湖南电影频道,湖南都市频道,湖南广播电视台茶频道,湖南经视,湖南卫视,湖南卫视芒果云,湖南卫视天天向上,湖南娱乐,幻乐之城,金鹰卡通卫视,快乐垂钓频道,芒果V直播,芒果都市,平民英雄,亲爱的客栈,亲爱的客栈第2季,我家那小子,我是未来,我想和你唱,欲望都市show,中餐厅,中餐厅第二季,江苏卫视官方帐号,江苏卫视幸福剧场,江苏新闻,江苏新闻广播,金曲捞,荔枝新闻,南京零距离,无限歌谣季,新闻眼,缘来非诚勿扰,动历史,老板联播,梨北京,梨青岛,梨视频,梨视频ING直播,梨视频微视,梨视频游戏,一手Video,澎湃视频,澎湃新闻,第一现场,深圳卫视,深圳卫视《军情直播间》,深圳卫视《正午30分》,深圳卫视军情直播间,深圳卫视正午30分,深圳卫视直播港澳台,正午30分,直播港澳台,新京报,新京报动新闻,新京报经济新闻,新京报书评周刊,1818黄金眼,2019浙江卫视领跑演唱会,FM988浙江新闻广播,奔跑吧兄弟 第4季,大冰小将,范大姐帮忙,钱江视频,熟悉的味道 第2季,喜剧总动员,喜剧总动员 第2季,小强实验室,异口同声,浙江广播电视台,浙江经视新闻,浙江台车行天下,浙江卫视,浙江卫视《新闻深一度》,浙江新闻频道,浙江之声,中国蓝TV,中国蓝新闻,中国蓝新闻蓝媒视频,看看新闻Knews,任正非 BBC 7 | search_pages = 20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,30,30,2 8 | 9 | [腾讯视频] 10 | keywords = 任正非 BBC 11 | search_pages = 2 12 | 13 | [new_tudou] 14 | keywords = 任正非 BBC 15 | search_pages = 2 16 | 17 | [toutiao] 18 | keywords = 任正非 BBC 19 | search_pages = 2 20 | 21 | [youku] 22 | keywords = 任正非 BBC 23 | search_pages = 2 24 | 25 | [pearvideo] 26 | keywords = 任正非 BBC 27 | search_pages = 2 28 | 29 | [bilibili] 30 | keywords = 任正非 BBC 31 | search_pages = 2 32 | -------------------------------------------------------------------------------- /crawler_sys/framework/update_data_in_target_releasers_single_thread.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon May 14 17:52:02 2018 4 | 5 | Find urls in given releaser page, and write first batch data into es. 6 | Everytime this program runs, two things will happen: 7 | 1 All video urls in given releaser page will be fetched and put into redis url pool, 8 | 2 All data related to 1 will be fetched and stored into es. 9 | 10 | Data in es will be update when run this program once. 
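Illustrative invocation (hypothetical values, using only the argparse options
defined below):

    python update_data_in_target_releasers_single_thread.py -p toutiao -p 腾讯视频 -n 30

-p may be repeated to crawl several platforms in one run; -n caps how many pages
are scrolled on each releaser page.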
11 | 12 | @author: hanye 13 | """ 14 | 15 | import argparse 16 | from crawler_sys.framework.platform_crawler_register import get_crawler 17 | from crawler_sys.framework.es_target_releasers import get_releaserUrls_from_es 18 | from crawler_sys.utils.parse_bool_for_args import parse_bool_for_args 19 | 20 | parser = argparse.ArgumentParser(description='Specify a platform name.') 21 | parser.add_argument('-p', '--platform', default=[], action='append', 22 | help=('Pass platform names, they will be assembled in python list.')) 23 | parser.add_argument('-n', '--max_page', default=30, type=int, 24 | help=('The max page numbers to be scroll for each releaser url, ' 25 | 'must be an int value, default to 30.')) 26 | parser.add_argument('-f', '--output_file_path', default='', type=str, 27 | help=('Specify output file path, default None.')) 28 | parser.add_argument('-r', '--push_to_redis', default='False', type=str, 29 | help=('Write urls to redis or not, default to True')) 30 | parser.add_argument('-w', '--output_to_es_raw', default='True', type=str, 31 | help=('Write data into es or not, default to True')) 32 | parser.add_argument('-g', '--output_to_es_register', default='True', type=str, 33 | help=('Write data into es or not, default to True')) 34 | parser.add_argument('-l', '--releasers', default=[], action='append', 35 | help=('Write data into es or not, default to True')) 36 | parser.add_argument('-t', '--target_index', default="target_releasers", type=str, 37 | help=('target_releasers_org or target_releasers')) 38 | args = parser.parse_args() 39 | 40 | if args.platform != []: 41 | platforms = args.platform 42 | else: 43 | platforms = [ 44 | 'toutiao', 45 | '腾讯视频', 46 | 'iqiyi', 47 | 'youku', 48 | '腾讯新闻', 49 | 'haodkan', 50 | 'new_tudou', 51 | "kwai" 52 | ] 53 | releaser_page_num_max = args.max_page 54 | output_f_path = args.output_file_path 55 | if output_f_path == '': 56 | output_to_file = False 57 | else: 58 | output_to_file = True 59 | 60 | push_to_redis = parse_bool_for_args(args.push_to_redis) 61 | output_to_es_raw = parse_bool_for_args(args.output_to_es_raw) 62 | output_to_es_register = parse_bool_for_args(args.output_to_es_register) 63 | 64 | releaser_Lst = args.releasers 65 | 66 | for platform in platforms: 67 | # 2 get releaserUrl list on each platform from target-releasers index 68 | if not releaser_Lst: 69 | releaserUrl_Lst = get_releaserUrls_from_es(platform=platform,target_index=args.target_index) 70 | else: 71 | releaserUrl_Lst = [] 72 | for releaser in releaser_Lst: 73 | releaserUrl_Lst.extend(get_releaserUrls_from_es(platform=platform,target_index=args.target_index)) 74 | if releaserUrl_Lst == []: 75 | print('Get empty releaserUrl_Lst for platform %s' % platform) 76 | continue 77 | # 3 get crawler for this platform 78 | Platform_crawler = get_crawler(platform) 79 | if Platform_crawler != None: 80 | crawler_instant = Platform_crawler() 81 | else: 82 | print('Failed to get crawler for platform %s' % platform) 83 | continue 84 | # 4 for each releaserUrl, get data on the releaser page identified by this 85 | # releaser url 86 | for releaserUrl in releaserUrl_Lst: 87 | crawler_instant.releaser_page(releaserUrl[0], 88 | output_to_file=output_to_file, 89 | filepath=output_f_path, 90 | releaser_page_num_max=releaser_page_num_max, 91 | output_to_es_raw=output_to_es_raw, 92 | output_to_es_register=output_to_es_register, 93 | push_to_redis=push_to_redis 94 | ) 95 | -------------------------------------------------------------------------------- /crawler_sys/framework/video_fields_std.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Feb 26 17:57:38 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | class Std_fields_video: 9 | def __init__(self, data_provider=None): 10 | if data_provider==None: 11 | data_provider='BDD' 12 | self.video_data={ 13 | 'platform': None, 14 | 'channel': None, 15 | # 'channel_url': None, 16 | # 'channel_subdomain': None, 17 | 'describe': None, 18 | 'title': None, 19 | 'url': None, 20 | 'duration': 0, 21 | 'releaser': None, 22 | 'play_count': None, 23 | 'favorite_count': 0, 24 | 'comment_count': 0, 25 | # 'dislike_count': None, 26 | 'repost_count': None, 27 | 'isOriginal': None, 28 | 'data_provider': data_provider, 29 | 'video_id': None, 30 | 31 | 'releaserUrl': None, 32 | 'release_time': 0, 33 | 'fetch_time': 0, 34 | } 35 | 36 | def field_type_correct(self): 37 | def none_is_allowed(field, field_type): 38 | if field not in self.video_data: 39 | return 'field "%s" is absent' % field 40 | is_correct=(isinstance(self.video_data[field], field_type) 41 | or self.video_data[field]==None) 42 | if is_correct: 43 | return True 44 | else: 45 | return 'field "%s" should be of [%s] type or None' % (field, field_type.__name__) 46 | 47 | def none_is_not_allowed(field, field_type): 48 | if field not in self.video_data: 49 | return 'FATAL: field "%s" is NOT FOUND!' % field 50 | else: 51 | is_correct=isinstance(self.video_data[field], field_type) 52 | if is_correct: 53 | return True 54 | else: 55 | return 'field "%s" should be of [%s] type' % (field, field_type.__name__) 56 | 57 | platform_chk=none_is_not_allowed('platform', str) 58 | duration_chk=none_is_not_allowed('duration', int) 59 | play_count_chk=none_is_not_allowed('play_count', int) 60 | favorite_count_chk=none_is_allowed('favorite_count', int) 61 | comment_count_chk=none_is_allowed('comment_count', int) 62 | # dislike_count_chk=none_is_allowed('dislike_count', int) 63 | repost_count_chk=none_is_allowed('repost_count', int) 64 | isOriginal_chk=none_is_allowed('isOriginal', bool) 65 | release_time_chk=none_is_allowed('release_time', int) 66 | fetch_time_chk=none_is_not_allowed('fetch_time', int) 67 | 68 | type_chk={ 69 | 'platform': platform_chk, 70 | 'duration': duration_chk, 71 | 'play_count': play_count_chk, 72 | 'favorite_count': favorite_count_chk, 73 | 'comment_count': comment_count_chk, 74 | # 'dislike_count': dislike_count_chk, 75 | 'repost_count': repost_count_chk, 76 | 'isOriginal': isOriginal_chk, 77 | 'release_time': release_time_chk, 78 | 'fetch_time': fetch_time_chk, 79 | } 80 | 81 | false_time_warning=[] 82 | for field in type_chk: 83 | if type_chk[field]!=True: 84 | false_time_warning.append(type_chk[field]) 85 | 86 | if false_time_warning==[]: 87 | return True 88 | else: 89 | warn_msg='WARNING: ' + '\n'.join(false_time_warning) 90 | return warn_msg 91 | 92 | 93 | 94 | if __name__=='__main__': 95 | a=Std_fields_video() 96 | a.video_data['platform']='腾讯视频' 97 | a.video_data['duration']=62 98 | a.video_data['title']='This is test video' 99 | 100 | # a.video_data={'duration': 62} 101 | print(a.field_type_correct()) 102 | -------------------------------------------------------------------------------- /crawler_sys/framework/write_releaserUrl_to_crawler_redis.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # @Time : 2019/9/11 11:48 3 | # @Author : litao 4 | import redis 5 | import json 6 | import datetime 7 | from redis.sentinel import 
Sentinel 8 | 9 | sentinel = Sentinel([('192.168.17.65', 26379), 10 | ('192.168.17.66', 26379), 11 | ('192.168.17.67', 26379) 12 | ], socket_timeout=0.5) 13 | # 查看master节点 14 | master = sentinel.discover_master('ida_redis_master') 15 | # 查看slave 节点 16 | slave = sentinel.discover_slaves('ida_redis_master') 17 | # 连接数据库 18 | rds = sentinel.master_for('ida_redis_master', socket_timeout=0.5, db=2, decode_responses=True) 19 | 20 | 21 | def write_project_to_redis(project): 22 | rds.rpush("project", project) 23 | 24 | 25 | def write_releaserUrl_to_redis(project, data,email_dic): 26 | project_name, duration = project.split("/") 27 | data_dict_for_redis = {"duration": duration, "data": json.dumps(data),"email":json.dumps(email_dic)} 28 | rds.hmset(project, data_dict_for_redis) 29 | write_project_to_redis(project) 30 | 31 | 32 | def read_csv_write_into_redis(project_name, csv_file, crawler_days,email_dic=None): 33 | try: 34 | crawler_lis = [] 35 | with open(csv_file, 'r', encoding="gb18030")as f: 36 | header_Lst = f.readline().strip().split(',') 37 | for line in f: 38 | line_Lst = line.strip().split(',') 39 | line_dict = dict(zip(header_Lst, line_Lst)) 40 | platform = line_dict['platform'] 41 | releaserUrl = line_dict['releaserUrl'] 42 | crawler_lis.append(platform+"&"+releaserUrl) 43 | write_releaserUrl_to_redis("{0}/{1}".format(project_name, str(crawler_days)), crawler_lis,email_dic=email_dic) 44 | return True 45 | except: 46 | return False 47 | 48 | 49 | def down_task(): 50 | res = rds.hgetall("task_down") 51 | if res: 52 | rds.delete("task_down") 53 | return res 54 | else: 55 | return None 56 | 57 | 58 | if __name__ == "__main__": 59 | # 传入的email_dic 格式如下 60 | mapping_dic = { 61 | "taskname": "127869453", 62 | "file_path": None, 63 | "data_str": None, 64 | "email_group": ["litao@csm.com.cn"], 65 | "email_msg_body_str": "任务已完成", 66 | "title_str": "任务已完成", 67 | "cc_group": [], 68 | "sender": "litao@csm.com.cn" 69 | } 70 | crawler_lis = [] 71 | file = r'D:\work_file\发布者账号\一次性需求附件\大客户总台垂类及主持人增加账号.csv' 72 | read_csv_write_into_redis("task15", file, "1577808000000",email_dic=mapping_dic) 73 | print(down_task()) 74 | # with open(file, 'r')as f: 75 | # header_Lst = f.readline().strip().split(',') 76 | # for line in f: 77 | # line_Lst = line.strip().split(',') 78 | # line_dict = dict(zip(header_Lst,line_Lst)) 79 | # platform = line_dict['platform'] 80 | # releaserUrl = line_dict['releaserUrl'] 81 | # crawler_lis.append(platform+"_"+releaserUrl) 82 | # write_releaserUrl_to_redis("project_name/1564588800000/3",crawler_lis) 83 | -------------------------------------------------------------------------------- /crawler_sys/framework/write_releasers_to_redis.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # @Time : 2020/4/24 14:15 3 | # @Author : litao 4 | # -*- coding: utf-8 -*- 5 | """ 6 | Created on Mon May 14 17:52:02 2018 7 | 8 | Find urls in given releaser page, and write first batch data into es. 9 | Everytime this program runs, two things will happen: 10 | 1 All video urls in given releaser page will be fetched and put into redis url pool, 11 | 2 All data related to 1 will be fetched and stored into es. 12 | 13 | Data in es will be update when run this program once. 
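Illustrative invocation (hypothetical values, using only the argparse options
defined below):

    python write_releasers_to_redis.py -p toutiao -fre 3 -s 8 -d 3

-p may be repeated; when -fre is 3 or higher the task dicts are pushed with a
non-zero proxies_num so that downstream crawlers fetch through the proxy pool.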
14 | 15 | @author: hanye 16 | """ 17 | 18 | import sys 19 | import argparse,copy 20 | from crawler.crawler_sys.framework.es_target_releasers import get_releaserUrls_from_es 21 | from crawler.crawler_sys.framework.platform_crawler_register import platform_crawler_reg 22 | import redis,json 23 | 24 | 25 | from redis.sentinel import Sentinel 26 | sentinel = Sentinel([('192.168.17.65', 26379), 27 | ('192.168.17.66', 26379), 28 | ('192.168.17.67', 26379) 29 | ],socket_timeout=0.5) 30 | # 查看master节点 31 | master = sentinel.discover_master('ida_redis_master') 32 | # 查看slave 节点 33 | slave = sentinel.discover_slaves('ida_redis_master') 34 | # 连接数据库 35 | rds = sentinel.master_for('ida_redis_master', socket_timeout=0.5, db=1, decode_responses=True) 36 | # rds = redis.StrictRedis(host='192.168.17.60', port=6379, db=1, decode_responses=True) 37 | 38 | parser = argparse.ArgumentParser(description='Specify a platform name.') 39 | parser.add_argument('-p', '--platform', default=[], action='append', 40 | help=('Pass platform names, they will be assembled in python list.')) 41 | parser.add_argument('-pj', '--project_tags', default=[], action='append', 42 | help=('Pass platform names, they will be assembled in python list.')) 43 | parser.add_argument('-n', '--max_page', default=2, type=int, 44 | help=('The max page numbers to be scroll for each releaser url, ' 45 | 'must be an int value, default to 30.')) 46 | parser.add_argument('-fre', '--frequency', default=1, type=int, 47 | help=('choose a frequency to retrieve releaserUrl,' 48 | '1, 3 or 9 is legal number, default 1')) 49 | parser.add_argument('-proxies', '--proxies', default=0, type=int, 50 | help=('Crawler proxies_num')) 51 | parser.add_argument('-d', '--date', default=3, type=int, 52 | help=('Crawler backtracking data time')) 53 | parser.add_argument('-s', '--processes_num', default=5, type=int, 54 | help=('Processes number to be used in multiprocessing')) 55 | parser.add_argument('-article', '--article', default=0, type=int, 56 | help=('is article page')) 57 | args = parser.parse_args() 58 | 59 | 60 | if args.platform != []: 61 | platforms = args.platform 62 | else: 63 | print('platform must be input') 64 | sys.exit(0) 65 | 66 | 67 | releaser_page_num_max = args.max_page 68 | frequency = args.frequency 69 | if frequency == '': 70 | frequency = None 71 | 72 | processes_num = args.processes_num 73 | frequency = args.frequency 74 | print(frequency) 75 | if frequency == 0: 76 | frequency = None 77 | 78 | 79 | kwargs_dict = { 80 | "proxies_num": 0, 81 | "date":args.date, 82 | } 83 | if frequency: 84 | if frequency >= 3: 85 | kwargs_dict["proxies_num"] = 3 86 | if args.proxies: 87 | kwargs_dict["proxies_num"] = args.proxies 88 | is_article = args.article 89 | 90 | def write_project_to_redis(platform, data): 91 | rds.rpush(platform, data) 92 | 93 | 94 | def write_releaserUrl_to_redis(data_dic): 95 | write_project_to_redis(data_dic["platform"], json.dumps(data_dic)) 96 | 97 | 98 | for platform in platforms: 99 | # 2 get releaserUrl list on each platform from target-releasers index 100 | releaserUrl_Lst = get_releaserUrls_from_es(platform=platform, frequency=frequency,target_index="target_releasers",project_tags=args.project_tags) 101 | if is_article: 102 | platform = platform + "_article" 103 | rds.hset("process_num",platform,processes_num) 104 | if releaserUrl_Lst == []: 105 | 106 | print('Get empty releaserUrl_Lst for platform %s' % platform) 107 | continue 108 | # 3 get crawler for this platform 109 | for releaserUrl,releaser in releaserUrl_Lst: 110 | 
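# Each (releaserUrl, releaser) pair becomes one task dict. write_releaserUrl_to_redis()
# below serializes it to JSON and rpush-es it onto a redis list keyed by platform name,
# from which (presumably) the site_crawler_by_redis workers pop tasks for that platform.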
push_dic = { 111 | "releaserUrl":releaserUrl, 112 | "releaser":releaser, 113 | "platform":platform, 114 | } 115 | push_dic.update(kwargs_dict) 116 | write_releaserUrl_to_redis(push_dic) 117 | -------------------------------------------------------------------------------- /crawler_sys/hot_words_crawler/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # @Time : 2020/2/25 15:20 3 | # @Author : litao 4 | -------------------------------------------------------------------------------- /crawler_sys/hot_words_crawler/crawler_main_task.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # @Time : 2020/3/5 15:34 3 | # @Author : litao 4 | import datetime 5 | from crawler.crawler_sys.hot_words_crawler.crawler_wangyi_news import Crawler_WangYi_News 6 | from crawler.crawler_sys.hot_words_crawler.crawler_v_qq import Crawler_v_qq 7 | from crawler.crawler_sys.hot_words_crawler.crawler_haokan import CrawlerHaoKan 8 | from crawler.crawler_sys.hot_words_crawler.crawler_douyin import Crawler_douyin 9 | from crawler.crawler_sys.hot_words_crawler.crawler_qq_news import Crawler_Qq_News 10 | from crawler.crawler_sys.hot_words_crawler.crawler_new_tudou import CrawlerNewTudou 11 | from crawler.crawler_sys.hot_words_crawler.crawler_toutiao import Crawler_toutiao 12 | from crawler.crawler_sys.hot_words_crawler.crawler_kwai import Crawler_kwai 13 | 14 | from elasticsearch import Elasticsearch 15 | from elasticsearch.helpers import scan 16 | 17 | es = Elasticsearch(hosts='192.168.17.11', port=80, 18 | http_auth=('crawler', 'XBcasfo8dgfs')) 19 | 20 | now = int(datetime.datetime.now().timestamp() * 1e3) - 86400000 21 | platform_dic = { 22 | "kwai": Crawler_kwai(), 23 | "toutiao": Crawler_toutiao(), 24 | "haokan": CrawlerHaoKan(), 25 | "抖音": Crawler_douyin(), 26 | "腾讯视频": Crawler_v_qq(), 27 | "腾讯新闻": Crawler_Qq_News(), 28 | "new_tudou": CrawlerNewTudou(), 29 | "网易新闻": Crawler_WangYi_News() 30 | } 31 | 32 | for platform in platform_dic: 33 | res = platform_dic[platform].get_hot_words() 34 | print(platform, res) 35 | 36 | search_body = { 37 | "query": { 38 | "bool": { 39 | "filter": [ 40 | # {"term":{"platform.keyword":"腾讯视频"}}, 41 | {"range": {"fetch_time": {"gte": now}}} 42 | ] 43 | } 44 | } 45 | } 46 | keyword_scan = scan(client=es, index='short-video-hotwords', query=search_body, doc_type="doc") 47 | res_list = [] 48 | for res in keyword_scan: 49 | res_list.append(res["_source"]) 50 | 51 | for res in res_list: 52 | try: 53 | platform_dic[res["platform"]].get_hot_videos(**res) 54 | except Exception as e: 55 | print(res["platform"], e) 56 | continue 57 | -------------------------------------------------------------------------------- /crawler_sys/hot_words_crawler/crawler_new_tudou.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # @Time : 2020/2/25 15:23 3 | # @Author : litao 4 | import requests 5 | import json, re, datetime,urllib 6 | from crawler.crawler_sys.utils.output_results import retry_get_url 7 | from crawler.crawler_sys.utils.output_results import hot_words_output_result 8 | 9 | 10 | class CrawlerNewTudou(object): 11 | def __init__(self): 12 | self.platform = "new_tudou" 13 | self.headers = { 14 | "Cookie": "", 15 | "User-Agent": "Tudou;6.39.1;Android;5.1.1;OPPO R11", 16 | "Accept-Encoding": "gzip,deflate", 17 | "Connection": "close", 18 | "Host": "apis.tudou.com", 19 | } 20 | def get_hot_words(self): 21 | 
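# Fetch the current hot-search keywords from Tudou's search/v1/hot endpoint, wrap each
# one as {platform, title, fetch_time}, and write the batch via hot_words_output_result().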
bulk_list = [] 22 | 23 | url = "https://apis.tudou.com/search/v1/hot?_t_={0}&e=md5&_s_=9a4abf3a92efad0605f8e31481327014&operator=CHINA+MOBILE_46007&network=WIFI".format( 24 | int(datetime.datetime.now().timestamp())) 25 | res = retry_get_url(url,proxies=3,headers=self.headers) 26 | res_json = res.json() 27 | for title in res_json["result"]["search"]["data"]: 28 | dic = { 29 | "platform": self.platform, 30 | "title": title["keyword"], 31 | "fetch_time":int(datetime.datetime.now().timestamp()*1e3) 32 | } 33 | bulk_list.append(dic) 34 | hot_words_output_result(bulk_list) 35 | return True 36 | 37 | def get_hot_videos(self,title=None,max_page=10,**kwargs): 38 | page = 1 39 | while page <= max_page: 40 | get_dic = { 41 | "keyword": title, 42 | # "pid": "6c23a6957198fad2", 43 | # "guid": "2139ff131a8a7d9ef7d3014cc8b97010", 44 | "mac": "", 45 | "imei": "null", 46 | "ver": "6.39.1", 47 | "_t_": int(datetime.datetime.now().timestamp()), 48 | "e": "md5", 49 | # "_s_": "b905d3a9738d7d2f815687428563d8f7", 50 | "operator": "CHINA+MOBILE_46007", 51 | "network": "WIFI", 52 | "ftype": "0", 53 | "cateId": "0", 54 | "seconds": "0", 55 | "seconds_end": "0", 56 | "ob": "", 57 | "pg": str(page), 58 | "pz": "30", 59 | # "aaid": "1.58259884569785E+20", 60 | "brand": "OPPO", 61 | "btype": "OPPO+R11", 62 | "sdkver": "2", 63 | "apad": "0", 64 | # "utdid": "XkjV9GsfBysDACyQ2%2BiF8MOw", 65 | "srid": "1", 66 | "userType": "guest", 67 | } 68 | requests_res = retry_get_url("https://apis.tudou.com/search/v2/integration?%s"%urllib.parse.urlencode(get_dic),headers=self.headers,proxies=3) 69 | requests_json = requests_res.json() 70 | page += 1 71 | print(requests_json) 72 | for data in requests_json["results"]["ugc"]["data"]: 73 | print(data) 74 | 75 | 76 | if __name__ == "__main__": 77 | crawler = CrawlerNewTudou() 78 | # crawler.get_hot_words() 79 | crawler.get_hot_videos("范冰冰蛋糕裙") -------------------------------------------------------------------------------- /crawler_sys/hot_words_crawler/crawler_wangyi_news.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # @Time : 2020/3/2 11:07 3 | # @Author : litao 4 | # -*- coding:utf-8 -*- 5 | # @Time : 2020/2/28 12:09 6 | # @Author : litao 7 | 8 | 9 | import requests 10 | import json, re, datetime, urllib 11 | from crawler.crawler_sys.utils.output_results import retry_get_url 12 | from crawler.crawler_sys.utils.output_results import hot_words_output_result, output_result 13 | from crawler.crawler_sys.utils.trans_strtime_to_timestamp import trans_strtime_to_timestamp 14 | from write_data_into_es.func_cal_doc_id import * 15 | import base64 16 | from crawler.crawler_sys.site_crawler.crawler_wangyi_news import Crawler_wangyi_news as Crawler_wy 17 | crawler_qq_video_page = Crawler_wy().video_page 18 | 19 | 20 | class Crawler_WangYi_News(object): 21 | def __init__(self): 22 | self.platform = "网易新闻" 23 | timestamp = int(datetime.datetime.now().timestamp() * 1e3) 24 | self.headers = { 25 | "data4-Sent-Millis": str(timestamp), 26 | "Add-To-Queue-Millis": str(timestamp), 27 | "User-D": "2zx5YfHmoBb72ayxYpQVUg==", 28 | "User-N": "HPcUw15+Yla9nvIP1c9vbqrHfvh/PCmpfK2DVDjsFFGhp4IV17bdU7hTwNc3Kfe3gVZiNSrnIe+bsZBFoMsbZQ==", 29 | "httpDNSIP": "101.71.145.130", 30 | "User-C": "5aS05p2h", 31 | "User-Agent": "NewsApp/32.1 Android/5.1.1 (OPPO/OPPO R11)", 32 | "X-NR-Trace-Id": "%s_355730601_866174725888628" % timestamp, 33 | "Host": "c.m.163.com", 34 | "Connection": "Keep-Alive", 35 | "Accept-Encoding": "gzip", 36 | } 37 | 38 | def 
get_hot_words(self): 39 | bulk_list = [] 40 | url = "http://c.m.163.com/nc/search/hotWord.html" 41 | page_res = retry_get_url(url, headers=self.headers, proxies=3, timeout=5) 42 | page_json = page_res.json() 43 | for data in page_json["hotWordList"]: 44 | title = data["searchWord"] 45 | if title: 46 | dic = { 47 | "platform": self.platform, 48 | "title": title, 49 | "fetch_time": int(datetime.datetime.now().timestamp() * 1e3), 50 | } 51 | bulk_list.append(dic) 52 | hot_words_output_result(bulk_list) 53 | return True 54 | 55 | 56 | def search_page(self,title): 57 | data_list = [] 58 | encodestr = base64.b64encode(title.encode('utf-8')) 59 | encodestr = str(encodestr, 'utf-8') 60 | url = "http://c.m.163.com/search/comp2/Kg%3D%3D/20/{0}.html?".format(encodestr) 61 | para = "deviceId=2zx5YfHmoBb72ayxYpQVUg%3D%3D&version=newsclient.32.1.android&channel=VDEzNDg2NDc5MDkxMDc%3D&canal=bmV3c19sZl9jcGFfMg%3D%3D&dtype=0&tabname=shipin&position=5YiX6KGo6aG26YOo&ts={0}&sign=Di3opZw%2FFIPDdgreSK4VCKlnMSpm6FPoel5LeY88RgZ48ErR02zJ6%2FKXOnxX046I&spever=FALSE&open=scheme_%E9%BB%98%E8%AE%A4&openpath=/video/VT5O1KVCO".format(str(int(datetime.datetime.now().timestamp()))) 62 | res = retry_get_url(url+para, headers=self.headers, timeout=5, proxies=3) 63 | page_text = res.json() 64 | for data in page_text["doc"]["result"]: 65 | print(data) 66 | data_list.append(data) 67 | output_result(result_Lst=data_list, 68 | platform=self.platform, 69 | output_to_es_raw=True, 70 | ) 71 | data_list.clear() 72 | ## sign和ts为加密字段 无法解决 73 | 74 | def get_hot_videos(self, max_page=10,**kwargs): 75 | pass 76 | 77 | 78 | if __name__ == "__main__": 79 | crawler = Crawler_WangYi_News() 80 | crawler.get_hot_words() 81 | crawler.search_page("患者私自出院散步") 82 | # crawler.get_hot_videos("https://v.qq.com/x/search/?q=%E6%95%99%E8%82%B2%E9%83%A8%E5%9B%9E%E5%BA%94%E6%89%A9%E5%A4%A7%E7%A1%95%E5%A3%AB%E5%92%8C%E4%B8%93%E5%8D%87%E6%9C%AC%E6%8B%9B%E7%94%9F&stag=12",channel="教育部回应扩大硕士和专升本招生") -------------------------------------------------------------------------------- /crawler_sys/proxy_pool/func_get_proxy_form_kuaidaili.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # @Time : 2019/9/12 10:19 3 | # @Author : litao 4 | 5 | """ 6 | 私密代理使用示例 7 | 接口鉴权说明: 8 | 目前支持的鉴权方式有 "simple" 和 "hmacsha1" 两种,默认使用 "simple"鉴权。 9 | 所有方法均可添加关键字参数sign_type修改鉴权方式。 10 | """ 11 | import redis,random 12 | import kdl,requests 13 | 14 | from redis.sentinel import Sentinel 15 | 16 | sentinel = Sentinel([('192.168.17.65', 26379), 17 | ('192.168.17.66', 26379), 18 | ('192.168.17.67', 26379) 19 | ], socket_timeout=0.5) 20 | # 查看master节点 21 | master = sentinel.discover_master('ida_redis_master') 22 | # 查看slave 节点 23 | slave = sentinel.discover_slaves('ida_redis_master') 24 | # 连接数据库 25 | rds = sentinel.master_for('ida_redis_master', socket_timeout=0.5, db=7, decode_responses=True) 26 | # rds = redis.StrictRedis(host='192.168.17.60', port=6378, db=7, decode_responses=True) 27 | def get_proxy_from_redis(): 28 | try: 29 | one_proxy = rds.randomkey() 30 | username = "hanye" 31 | password = "i9mmu0a3" 32 | 33 | proxies = { 34 | "http": "http://%(user)s:%(pwd)s@%(ip)s/" % {'user': username, 'pwd': password, 'ip': one_proxy}, 35 | "https": "http://%(user)s:%(pwd)s@%(ip)s/" % {'user': username, 'pwd': password, 'ip': one_proxy} 36 | } 37 | return proxies 38 | except Exception as e: 39 | print(e) 40 | return None 41 | 42 | def func_get_proxy_to_redis(): 43 | # chance = random.random() 44 | auth = 
kdl.Auth("997803479675913", "er2siw6i58c61s387sqljvovoz8zybaq") 45 | client = kdl.Client(auth) 46 | 47 | # 获取订单到期时间, 返回时间字符串 48 | # expire_time = client.get_order_expire_time() 49 | # print("expire time", expire_time) 50 | 51 | # 获取ip白名单, 返回ip列表 52 | # ip_whitelist = client.get_ip_whitelist() 53 | # print("ip whitelist", ip_whitelist) 54 | 55 | # 设置ip白名单,参数类型为字符串或列表或元组 56 | # 成功则返回True, 否则抛出异常 57 | # client.set_ip_whitelist([]) 58 | # client.set_ip_whitelist("127.0.0.1, 192.168.0.139") 59 | # print(client.get_ip_whitelist()) 60 | # client.set_ip_whitelist(tuple()) 61 | 62 | # 提取私密代理ip, 第一个参数为提取的数量, 其他参数以关键字参数的形式传入(不需要传入signature和timestamp) 63 | # 具体有哪些参数请参考帮助中心: "https://help.kuaidaili.com/api/getdps/" 64 | # 返回ip列表 65 | # 注意:若您使用的是python2, 且在终端调用,或在文件中调用且没有加 "# -*- coding: utf-8 -*-" 的话 66 | # 传入area参数时,请传入unicode类型,如 area=u'北京,上海' 67 | # ips = client.get_dps(1, sign_type='simple', format='json', pt=2, area='北京,上海,广东') 68 | # print("dps proxy: ", ips) 69 | 70 | 71 | # 检测私密代理有效性: 返回 ip: true/false 组成的dict 72 | #ips = client.get_dps(1, sign_type='simple', format='json') 73 | # valids = client.check_dps_valid(ips) 74 | # print("valids: ", valids) 75 | 76 | # 获取私密代理剩余时间: 返回 ip: seconds(剩余秒数) 组成的dict 77 | ips = client.get_dps(1, format='json',dedup=1) 78 | seconds = client.get_dps_valid_time(ips) 79 | # print("seconds: ", seconds) 80 | for key in seconds: 81 | rds.set(key, key, ex=int(seconds[key]) - 3) 82 | 83 | # 获取计数版ip余额(仅私密代理计数版) 84 | # balance = client.get_ip_balance(sign_type='hmacsha1') 85 | # print("balance: ", balance) 86 | def proxy_test(proxies): 87 | page_url = "http://dev.kdlapi.com/testproxy/" 88 | headers = { 89 | "Accept-Encoding": "Gzip", # 使用gzip压缩传输数据让访问更快 90 | } 91 | 92 | res = requests.get(url=page_url, proxies=proxies, headers=headers) 93 | # print(res.status_code) # 获取Reponse的返回码 94 | if res.status_code == 200: 95 | print(res.content.decode('utf-8')) # 获取页面内容 96 | 97 | def get_proxy_dic(max_proxies=None): 98 | if not max_proxies: 99 | max_proxies = 8 100 | try: 101 | res = rds.dbsize() 102 | except Exception as e: 103 | print("redis error") 104 | return None 105 | if res is None: 106 | return None 107 | if res < max_proxies: 108 | func_get_proxy_to_redis() 109 | return get_proxy_from_redis() 110 | else: 111 | return get_proxy_from_redis() 112 | 113 | def get_proxy(proxies_num=None): 114 | if proxies_num: 115 | proxies = get_proxy_dic(max_proxies=proxies_num) 116 | # print("get a IP %s" % str(proxies)) 117 | return proxies 118 | else: 119 | return None 120 | 121 | if __name__ == "__main__": 122 | proxy_pool_dic = get_proxy(11) 123 | print(proxy_pool_dic) 124 | proxy_test(proxy_pool_dic) 125 | print(get_proxy_from_redis()) -------------------------------------------------------------------------------- /crawler_sys/site_crawler/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue May 15 13:59:43 2018 4 | 5 | @author: hanye 6 | """ 7 | -------------------------------------------------------------------------------- /crawler_sys/site_crawler/chromedriver: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_sys/site_crawler/chromedriver -------------------------------------------------------------------------------- /crawler_sys/site_crawler/crawler.new_tudouAPP.py: -------------------------------------------------------------------------------- 1 | # 
-*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Apr 10 17:52:08 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | 9 | import datetime 10 | import json 11 | import requests 12 | from bs4 import BeautifulSoup 13 | 14 | def get_video(target): 15 | result = [] 16 | count = 0 17 | while len(result) < target and count < 100: 18 | listurl = 'http://apis.tudou.com/homepage/v2/index/get_push.json?secCateId=622736331' 19 | get_page = requests.get(listurl) 20 | get_page.encoding = 'utf-8' 21 | page = get_page.text 22 | print('get one page') 23 | page = page.replace('true', 'True') 24 | page = page.replace('false', 'False') 25 | page_dic = json.loads(page)['entity'] 26 | for line in page_dic: 27 | midstep = line['detail'] 28 | title = midstep['base_detail']['title'] 29 | playcount = midstep['video_detail']['vv_desc'] 30 | releaser = midstep['user_detail']['name'] 31 | releaserid = midstep['user_detail']['id'] 32 | videoid = midstep['video_detail']['video_id'] 33 | duration = midstep['video_detail']['duration'] 34 | url = 'http://new-play.tudou.com/v/'+videoid 35 | get_page = requests.get(url) 36 | page = get_page.text 37 | soup = BeautifulSoup(page, 'html.parser') 38 | rt_step1 = soup.find('div', {'class':'td-play__videoinfo__details-box__time'}) 39 | rt_step2 = rt_step1.text[:-2] 40 | release_time = int(datetime.datetime.strptime(rt_step2, 41 | '%Y-%m-%d %H:%M:%S').timestamp()*1e3) 42 | D0 = {"title":title, "releaser":releaser, "release_time":release_time, 43 | "duration":duration, 'releaserid':releaserid, 'playcount':playcount} 44 | if D0 not in result: 45 | result.append(D0) 46 | print('added one video') 47 | else: 48 | count += 1 49 | print('repetition') 50 | return result 51 | 52 | 53 | 54 | if __name__=='__main__': 55 | try1 = get_video(target=200) 56 | #{'旅行':'http://apis.tudou.com/homepage/v2/index/get_push.json?secCateId=10293', 57 | #'科技':'http://apis.tudou.com/homepage/v2/index/get_push.json?secCateId=10199', 58 | #'娱乐':'http://apis.tudou.com/homepage/v2/index/get_push.json?secCateId=622726317', 59 | #'萌物':'http://apis.tudou.com/homepage/v2/index/get_push.json?secCateId=622485153'} 60 | -------------------------------------------------------------------------------- /crawler_sys/site_crawler/crawler.pepper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Apr 10 11:06:53 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | import requests 9 | import datetime 10 | 11 | 12 | 13 | class crawler_pepper(): 14 | 15 | def except_followers(self): 16 | rank=[] 17 | urls=['http://webh.huajiao.com/rank/recv?&type=day','http://webh.huajiao.com/rank/recv?&type=week','http://webh.huajiao.com/rank/recv?&type=all','http://webh.huajiao.com/rank/sun?&type=day','http://webh.huajiao.com/rank/sun?&type=week','http://webh.huajiao.com/rank/sun?&type=all','http://webh.huajiao.com/rank/user?&type=day','http://webh.huajiao.com/rank/user?&type=all','http://webh.huajiao.com/rank/send?&type=day','http://webh.huajiao.com/rank/send?&type=week','http://webh.huajiao.com/rank/send?&type=all'] 18 | for url in urls: 19 | get_page=requests.get(url) 20 | get_page.encoding='utf-8' 21 | page = get_page.text 22 | print(url) 23 | page_dic = eval(page)['data']['rank'] 24 | one_rank=[] 25 | for one_video in page_dic: 26 | category=url[29:33] 27 | timespan=url[-3:] 28 | name=one_video['nickname'] 29 | authorlevel=one_video['authorlevel'] 30 | level=one_video['level'] 31 | try: 32 | score=one_video['score'] 33 | except KeyError: 34 | score=None 35 | 
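# 'category' and 'timespan' are sliced straight out of the request URL above; 'acttime'
# below records the fetch time as a millisecond timestamp.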
uid=one_video['uid'] 36 | currenttime=datetime.datetime.timestamp(datetime.datetime.now())*1e3 37 | D0={'name':name,'level':level,'authorlevel':authorlevel,'score':score,'uid':uid,'category':category,'timespan':timespan,'acttime':currenttime} 38 | one_rank.append(D0) 39 | rank.append(one_rank) 40 | return rank 41 | 42 | def get_followers(self): 43 | rank=[] 44 | urls=['http://webh.huajiao.com/rank/followers?&type=day','http://webh.huajiao.com/rank/followers?&type=all'] 45 | for url in urls: 46 | get_page=requests.get(url) 47 | get_page.encoding='utf-8' 48 | page = get_page.text 49 | print(url) 50 | midstep = eval(page) 51 | page_dic = midstep['data']['rank'] 52 | one_rank=[] 53 | for one_video in page_dic: 54 | category='followers' 55 | timespan=url[-3:] 56 | name=one_video['nickname'] 57 | authorlevel=one_video['authorlevel'] 58 | level=one_video['level'] 59 | followers=one_video['followers'] 60 | uid=one_video['uid'] 61 | currenttime=datetime.datetime.timestamp(datetime.datetime.now())*1e3 62 | D0={'name':name,'level':level,'authorlevel':authorlevel,'followers':followers,'uid':uid,'category':category,'timespan':timespan,'acttime':currenttime} 63 | one_rank.append(D0) 64 | rank.append(one_rank) 65 | return rank 66 | 67 | 68 | if __name__=='__main__': 69 | ttt =crawler_pepper() 70 | followers=ttt.get_followers() 71 | others=ttt.except_followers() -------------------------------------------------------------------------------- /crawler_sys/site_crawler/crawler_baidu.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Aug 21 16:58:37 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | import time 9 | import requests 10 | from bs4 import BeautifulSoup 11 | from crawler.crawler_sys.utils.trans_strtime_to_timestamp import trans_strtime_to_timestamp 12 | 13 | 14 | headers = {'Host': 'www.baidu.com', 15 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0', 16 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 17 | 'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2', 18 | 'Accept-Encoding': 'gzip, deflate, br', 19 | 'Cookie': 'BAIDUID=5EBFCC8E193341115A4A3C71960B63E7:FG=1; BIDUPSID=BD339F6B0442001D2528C4BFBCE098DB; PSTM=1500974423; BDUSS=RCY0lFRmJ4MDlMMU5xfkp4NWU3bUlTckJOZU03ZTB4UHdJbUpUeWlVZmhlT3haSVFBQUFBJCQAAAAAAAAAAAEAAABM1D8MZmFuZzExMDExNAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAOHrxFnh68RZc3; MCITY=-131%3A; BD_UPN=1352; H_PS_PSSID=; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; delPer=1; BD_CK_SAM=1; PSINO=2; BDRCVFR[gltLrB7qNCt]=mk3SLVN4HKm; pgv_pvi=525595648; pgv_si=s2288931840; Hm_lvt_9f14aaa038bbba8b12ec2a4a3e51d254=1534841172; Hm_lpvt_9f14aaa038bbba8b12ec2a4a3e51d254=1534841172; BD_HOME=1; sug=3; sugstore=0; ORIGIN=0; bdime=21110; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; BDSVRTM=225; BDRCVFR[C0p6oIjvx-c]=I67x6TjHwwYf0; BAIDUPH=tn=§rn=§ct=0', 20 | 'Connection': 'keep-alive', 21 | 'Upgrade-Insecure-Requests': '1', 22 | 'Cache-Control': 'max-age=0'} 23 | 24 | 25 | def baidu_info_page(keyword, max_page_num): 26 | result_lst = [] 27 | for page_num in range(0, max_page_num): 28 | search_url = ('https://www.baidu.com/s?tn=news&rtt=4&bsst=1&cl=2&wd='+keyword+ 29 | '&x_bfe_rqs=03E80&tngroupname=organic_news&pn='+str(page_num*10)) 30 | get_page = requests.get(search_url, headers=headers) 31 | get_page.encoding = 'utf-8' 32 | page = get_page.text 33 | soup = BeautifulSoup(page, 'html.parser') 34 
        info_lst = soup.find_all('div', {'class': 'result'})
        print_page_num = page_num+1
        for line in info_lst:
            title = line.h3.a.text
            title = title.replace('\n', '')
            url = line.h3.a['href']
            source_and_release_time = line.find('p', {'class': 'c-author'}).text
            source_and_release_time_lst = source_and_release_time.split('\xa0')
            source = source_and_release_time_lst[0]
            release_time_str = source_and_release_time_lst[-1]
            release_time = trans_strtime_to_timestamp(release_time_str)
            midstep_content = line.find('div', {'class': 'c-summary'}).text
            content = midstep_content.replace(source, '').replace(' ', '')
            content = content.replace('\xa0', '')
            source = source.replace('\n', '').replace('\t', '')
            content = content.replace('\n', '').replace('\t', '')
            whole_page = line.find('a', {'class': 'c-cache'})['href']
            fast_open_whole_page = whole_page + '&fast=y'
            get_whole_page = requests.get(fast_open_whole_page, headers=headers)
            get_whole_page.encoding = 'gb18030'
            whole_page_html = get_whole_page.text
            fetch_time = int(time.time()*1000)
            info_dic = {'title': title,
                        'url': url,
                        'source': source,
                        'content': content,
                        'release_time': release_time,
                        'keyword': keyword,
                        'whole_page_html': whole_page_html,
                        'fetch_time': fetch_time}
            result_lst.append(info_dic)
        print('get data at page %s' % print_page_num)
    return result_lst

if __name__ == '__main__':
    keyword = '中超'
    test_data = baidu_info_page(keyword, max_page_num=10)
--------------------------------------------------------------------------------
/crawler_sys/site_crawler/crawler_bing.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 22 09:30:20 2018

@author: fangyucheng
"""


import time
import requests
from bs4 import BeautifulSoup
from crawler_sys.utils.trans_strtime_to_timestamp import trans_strtime_to_timestamp


def bing_page(keyword, max_page_num):
    result_lst = []
    for page_num in range(0, max_page_num):
        # the result offset must advance with page_num; using max_page_num here
        # would request the same offset on every iteration
        search_url = ('https://cn.bing.com/search?q=' + keyword + '&pc=MOZI&first='
                      + str(page_num*10) + '&FORM=PERE1')
        get_page = requests.get(search_url)
        get_page.encoding = 'utf-8'
        page = get_page.text
        soup = BeautifulSoup(page, 'html.parser')
        info_lst = soup.find_all('li', {'class': 'b_algo'})
        print_page_num = page_num+1
        for line in info_lst:
            title = line.h2.a.text
            title = title.replace('\n', '')
            url = line.h2.a['href']
            release_time_and_content = line.find('p').text
            release_time_and_content_lst = release_time_and_content.split('\u2002·\u2002')
            content = release_time_and_content_lst[-1]
            release_time_str = release_time_and_content_lst[0]
            release_time = trans_strtime_to_timestamp(release_time_str)
            get_whole_page_str = line.find('div', {'class': 'b_attribution'})['u']
            get_whole_page_lst = get_whole_page_str.split('|')
            d_number = get_whole_page_lst[2]
            w_number = get_whole_page_lst[3]
            get_whole_page_url = ('http://cncc.bingj.com/cache.aspx?q=' + keyword +
                                  '&d=' + d_number + '&mkt=zh-CN&setlang=zh-CN&w='
                                  + w_number)
            get_whole_page = requests.get(get_whole_page_url)
            whole_page_html = get_whole_page.text
            fetch_time = int(time.time()*1000)
            info_dic = {'title': title,
                        'url': url,
                        'content': content,
'release_time': release_time, 49 | 'keyword': keyword, 50 | 'whole_page_html': whole_page_html, 51 | 'fetch_time': fetch_time} 52 | result_lst.append(info_dic) 53 | print('get data at page %s' % print_page_num) 54 | return result_lst 55 | 56 | 57 | if __name__ == '__main__': 58 | keyword = '中超' 59 | test_data = bing_page(keyword, max_page_num=10) -------------------------------------------------------------------------------- /crawler_sys/site_crawler/crawler_miaopai.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Sep 28 10:28:45 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | 9 | import urllib 10 | import requests 11 | import json 12 | from bs4 import BeautifulSoup 13 | from crawler_sys.utils.output_results import retry_get_url 14 | from crawler_sys.utils.trans_str_play_count_to_int import trans_play_count 15 | from crawler.crawler_sys.utils.util_logging import logged 16 | try: 17 | from .func_get_releaser_id import * 18 | except: 19 | from func_get_releaser_id import * 20 | 21 | class Crawler_miaopai(): 22 | 23 | def get_releaser_follower_num(self, releaserUrl): 24 | if "www.yixia.com" in releaserUrl: 25 | get_page = retry_get_url(releaserUrl) 26 | get_page.encoding = 'utf-8' 27 | page = get_page.text 28 | soup = BeautifulSoup(page, 'html.parser') 29 | try: 30 | midstep_1 = soup.find('ul', {'class': 'bottomInfor'}) 31 | midstep_2 = midstep_1.find_all('li') 32 | for line in midstep_2: 33 | line_text = line.text 34 | if '粉丝' in line_text: 35 | follower_str = line_text.replace('粉丝', '') 36 | follower_num = trans_play_count(follower_str) 37 | print('%s follower number is %s' % (releaserUrl, follower_num)) 38 | return follower_num 39 | except: 40 | print("can't can followers") 41 | elif "n.miaopai.com" in releaserUrl: 42 | try: 43 | split_url = releaserUrl.split("personal/") 44 | suid = split_url[-1].replace('.htm', '').replace('.html', '').replace('htm', '') 45 | url = "https://n.miaopai.com/api/aj_user/space.json?suid=%s" % suid 46 | get_page = urllib.request.urlopen(url) 47 | page_bytes = get_page.read() 48 | page_str = page_bytes.decode("utf-8") 49 | page_dic = json.loads(page_str) 50 | follower_num = page_dic['data']['followers_count'] 51 | return follower_num 52 | except: 53 | print("can't can followers") 54 | 55 | 56 | def get_releaser_id(self, releaserUrl): 57 | return get_releaser_id(platform=self.platform, releaserUrl=releaserUrl) 58 | 59 | 60 | #encoding method 61 | @logged 62 | def releaser_page(self, releaserUrl, releaser_page_num_max=30): 63 | headers = {'Host': 'n.miaopai.com', 64 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0', 65 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 66 | 'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2', 67 | 'Accept-Encoding': 'gzip, deflate, br', 68 | 'Connection': 'keep-alive', 69 | 'Cookie': 'aliyungf_tc=AQAAAIVvfVl0CgQAysVBfBViNUJYGG5C; Hm_lvt_e8fa5926bca558076246d7fb7ca12071=1545124849; Hm_lpvt_e8fa5926bca558076246d7fb7ca12071=1545124849', 70 | 'Upgrade-Insecure-Requests': '1', 71 | 'Cache-Control': 'max-age=0'} 72 | releaser_id = self.get_releaser_id(releaserUrl) 73 | page_num = 1 74 | while page_num <= releaser_page_num_max: 75 | url = ('https://n.miaopai.com/api/aj_user/medias.json?suid=%s&page=%s' 76 | % (releaser_id, page_num)) 77 | get_page = requests.get(url, headers=headers) 78 | get_page.encoding = 'utf-8' 79 | page = get_page.text 
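# releaser_page above builds the paged medias.json URL but, as shown here, never advances
# page_num or yields anything, so the while loop cannot make progress. A hedged sketch of how
# the pagination presumably continues -- the 'data'/'medias' field names are assumptions about
# the JSON layout, not confirmed by this file:
import requests

def miaopai_media_pages_sketch(suid, headers=None, max_pages=30):
    page_num = 1
    while page_num <= max_pages:
        url = ('https://n.miaopai.com/api/aj_user/medias.json?suid=%s&page=%s'
               % (suid, page_num))
        page_dic = requests.get(url, headers=headers).json()
        medias = page_dic.get('data', {}).get('medias', [])   # assumed field names
        if not medias:
            break                                              # stop on an empty page
        for media in medias:
            yield media
        page_num += 1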
80 | page_dic = get_page.json() 81 | 82 | if __name__ == "__main__": 83 | releaserUrl = 'http://n.miaopai.com/personal/h~NjA~vSfoYLz1pchtm' 84 | test = Crawler_miaopai() 85 | p = test.get_releaser_follower_num(releaserUrl) 86 | -------------------------------------------------------------------------------- /crawler_sys/site_crawler/crawler_peoplevideo.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Mar 19 10:32:52 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | 9 | import requests 10 | from bs4 import BeautifulSoup 11 | import json 12 | 13 | 14 | #两会:lh/_cl/0/30/ 15 | #访谈:ft/_cl/6/30/ 16 | #资讯:zx/_cl/29/20/ 17 | #视点:sd/_cl/29/20/ 18 | #全球:qq/_cl/7/30/ 19 | 20 | class people_video(): 21 | 22 | def video_page(self,url): 23 | get_page=requests.get(url) 24 | get_page.encoding='utf-8' 25 | page = get_page.text 26 | midstep1=json.loads(page) 27 | midstep2=midstep1['data']['article'] 28 | midstep3=midstep2['publish'] 29 | title=midstep2['title'] 30 | author=midstep2['author'] 31 | release_time=midstep2['publishTime'] 32 | playcount=midstep2['playNum'] 33 | dura=midstep2['duration'] 34 | duration_str=dura 35 | dl=duration_str.split(':') 36 | dl_int=[] 37 | for v in dl: 38 | v=int(v) 39 | dl_int.append(v) 40 | if len(dl_int) == 2: 41 | duration=dl_int[0]*60+dl_int[1] 42 | else: 43 | duration=dl_int[0]*3660+dl_int[1]*60+dl_int[2] 44 | releaser=midstep3['name'] 45 | D0={'title':title,'playcount':playcount,'releaser':releaser,'release_time':release_time,'duration':duration,'author':author,'url':url} 46 | return D0 47 | 48 | def list_page(self,partofurl,totalpage): 49 | urls=['http://mobilevideo.people.com.cn/movie_pub/News/publishfile/'+partofurl+'list_{}.json'.format(str(i)) for i in range(1,totalpage)] 50 | list_page=[] 51 | for url in urls: 52 | get_page=requests.get(url) 53 | get_page.encoding='utf-8' 54 | page=get_page.text 55 | soup=BeautifulSoup(page,'html.parser') 56 | try: 57 | selection=soup.html.head.title.text 58 | print ('no more page') 59 | except AttributeError: 60 | print(url) 61 | midstep1=json.loads(page) 62 | midstep2=midstep1['data']['newsList'] 63 | for one_line in midstep2: 64 | url=one_line['articleLink'] 65 | one_video_dic=self.video_page(url) 66 | list_page.append(one_video_dic) 67 | return list_page 68 | 69 | 70 | 71 | if __name__=='__main__': 72 | people_crawler = people_video() 73 | #video_page=people_video.video_page(url='http://mobilevideo.people.com.cn/movie_pub/News/publishfile/spk/_cd/10/18/4154954.json') 74 | list_page2=people_crawler.list_page(partofurl="qq/_cl/7/30/",totalpage=20) 75 | #search_page=iqiyi_crawler.search_page(keyword="国家相册") 76 | -------------------------------------------------------------------------------- /crawler_sys/site_crawler/crawler_sogou.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Aug 22 09:30:20 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | 9 | import time 10 | import requests 11 | from bs4 import BeautifulSoup 12 | from crawler.crawler_sys.utils.trans_strtime_to_timestamp import trans_strtime_to_timestamp 13 | 14 | 15 | cookie = ('YYID=2FFBDAA6D4FBA37438F4067C8123E98B; IMEVER=8.5.0.1322;' 16 | 'SUID=3D03FF723865860A59795A5F000BB71F;' 17 | 'SUV=00C039A172FF033D5993ADBD770E7410; usid=lF0F7il0yWbXF5c9;' 18 | 'IPLOC=CN1100; sct=11; SMYUV=1512954490386200;' 19 | 'ad=19fxxkllll2zKxvnlllllVHr6$UllllltsDRlyllll9llllljgDll5@@@@@@@@@@;' 20 | 
'SNUID=D0DE5A671A1E68C31FB628911B8277A5; wuid=AAGPcSphIAAAAAqLE2OSTQgAGwY=;' 21 | 'UM_distinctid=16449b02797449-0c5d9293f4a833-143f7040-1fa400-16449b02799881;' 22 | 'CXID=794EC592A14CE76F5DF3F3A3BDDDD787;' 23 | 'ld=Kyllllllll2bWX10QTIdJOHDsvSbWX1uK94Vhkllll9lllllVklll5@@@@@@@@@@;' 24 | 'cd=1534754086&17502a3f56c02f72dfd43a17cbb19663;' 25 | 'rd=Vyllllllll2bBEqoQLWCNCHfKv2bWX1uzX0atkllllwllllRVllll5@@@@@@@@@@;' 26 | 'LSTMV=173%2C72; LCLKINT=1570') 27 | 28 | 29 | headers = {'Host': 'news.sogou.com', 30 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0', 31 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 32 | 'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2', 33 | 'Accept-Encoding': 'gzip, deflate', 34 | 'Cookie': cookie, 35 | 'Connection': 'keep-alive', 36 | 'Upgrade-Insecure-Requests': '1', 37 | 'Cache-Control': 'max-age=0'} 38 | 39 | 40 | def sogou_info_page(keyword): 41 | result_lst = [] 42 | for page_num in range(1,11): 43 | search_url = 'http://news.sogou.com/news?&query='+keyword+'&page='+str(page_num) 44 | get_page = requests.get(search_url, headers=headers) 45 | page = get_page.text 46 | soup = BeautifulSoup(page, 'html.parser') 47 | news_lst = soup.find_all('div', {'class': 'vrwrap'}) 48 | for line in news_lst: 49 | try: 50 | title = line.div.h3.a.text 51 | url = line.div.h3.a['href'] 52 | source_and_release_time = line.find('p', {'class': 'news-from'}).text 53 | source_and_release_time_lst = source_and_release_time.split('\xa0') 54 | source = source_and_release_time_lst[0] 55 | release_time_str = source_and_release_time_lst[-1] 56 | release_time = trans_strtime_to_timestamp(release_time_str) 57 | try: 58 | content = line.find('span').text 59 | except: 60 | print('no content at %s' % title) 61 | content = 'missing' 62 | fetch_time = int(time.time()*1000) 63 | try: 64 | similar_news = line.find('a', {'id': 'news_similar'}).text 65 | except: 66 | print('no similar news at %s' % title) 67 | similar_news = 'missing' 68 | news_info = {'title': title, 69 | 'url': url, 70 | 'source': source, 71 | 'release_time': release_time, 72 | 'fetch_time': fetch_time, 73 | 'content': content, 74 | 'similar_news': similar_news, 75 | 'keyword': keyword} 76 | result_lst.append(news_info) 77 | print('get data at page %s' % page_num) 78 | except: 79 | ('the error occured at position %s' % news_lst.index(line)) 80 | return result_lst 81 | 82 | 83 | if __name__=='__main__': 84 | keyword = '中超' 85 | test_sogou = sogou_info_page(keyword) 86 | 87 | -------------------------------------------------------------------------------- /crawler_sys/site_crawler/crawler_watermelon.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon May 28 10:29:57 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | 9 | 10 | 11 | import requests 12 | import json 13 | import datetime 14 | import re 15 | from framework.video_fields_std import Std_fields_video 16 | #from . 
import bulk_write_into_es 17 | import js2py 18 | import hashlib 19 | import time 20 | from selenium import webdriver 21 | 22 | 23 | class Crawler_Watermelon(Std_fields_video): 24 | 25 | def write_into_file(self, data_dict, file_obj): 26 | json_str=json.dumps(data_dict) 27 | file_obj.write(json_str) 28 | file_obj.write('\n') 29 | file_obj.flush() 30 | 31 | 32 | def feed_url_into_redis(self, dict_Lst): 33 | pass 34 | 35 | 36 | def output_result(self, result_Lst, output_to_file=False, filepath=None): 37 | # write data into es crawler-raw index 38 | #bulk_write_into_es(result_Lst) 39 | 40 | # feed url into redis 41 | self.feed_url_into_redis(result_Lst) 42 | 43 | # output into file according to passed in parameters 44 | if output_to_file==True and filepath!=None: 45 | output_fn='crawler_watermelon_%s_json' % datetime.datetime.now().isoformat()[:10] 46 | output_f=open(filepath+'/'+output_fn, 'a', encoding='utf-8') 47 | self.write_into_file(result_Lst, output_f) 48 | else: 49 | pass 50 | 51 | 52 | def get_list_video(self,output_to_file=False, filepath=None): 53 | result_Lst = [] 54 | max_behot_time = 0 55 | count = 0 56 | 57 | headers = {'Host': 'ic.snssdk.com', 58 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0', 59 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 60 | 'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2', 61 | 'Accept-Encoding': 'gzip, deflate', 62 | 'Cookie': 'odin_tt=5b54e47f71b1963502fe03c4028f5672c887a0b739ce2302481beda2a4388a0a538ade820b54b4589da13d18dde9d245', 63 | 'Connection': 'keep-alive', 64 | 'Upgrade-Insecure-Requests': '1', 65 | 'Cache-Control': 'max-age=0'} 66 | 67 | 68 | while count <= 0: 69 | time_now = int(time.time()) 70 | listurl = 'http://ic.snssdk.com/video/app/stream/v51/?category=subv_xg_society&refer=1&count=20&max_behot_time='+str(max_behot_time)+'&list_entrance=main_tab&last_refresh_sub_entrance_interval='+str(time_now) 71 | #http://ic.snssdk.com/video/app/stream/v51/?category=subv_xg_society&refer=1&count=20&list_entrance=main_tab&last_refresh_sub_entrance_interval=1527473360&loc_mode=5&tt_from=refresh_auto&play_param=codec_type%3A0&iid=33815381012&device_id=52965120460&ac=wifi&channel=wandoujia&aid=32&app_name=video_article&version_code=653&version_name=6.5.3&device_platform=android&ab_version=359940%2C344692%2C353539%2C356329%2C361439%2C324397%2C361311%2C358091%2C358364%2C356602%2C350431%2C354439%2C325211%2C346575%2C342302%2C361530%2C320651%2C361551&ssmix=a&device_type=MuMu&device_brand=Android&language=zh&os_api=19&os_version=4.4.4&uuid=008796749793280&openudid=54767d8bf41ac9a4&manifest_version_code=253&resolution=1280*720&dpi=240&update_version_code=65307&_rticket=1527473360674&rom_version=cancro-eng+4.4.4+V417IR+eng.root.20180201.174500+release-keys&fp=i2T_FYmuPzL5Fl4ZcrU1FYFeL2FW 72 | 73 | get_page = requests.get(listurl,headers=headers) 74 | page = get_page.text 75 | page = page.replace('true','True') 76 | page = page.replace('false','False') 77 | page = page.replace('null','"Null"') 78 | page_dic = eval(page) 79 | video_agg = page_dic['data'] 80 | count += 1 81 | for line in video_agg: 82 | try: 83 | video_str=line['content'] 84 | video_dic=eval(video_str) 85 | if video_dic['has_video']==True: 86 | title = video_dic['title'] 87 | url = video_dic['display_url'] 88 | browser = webdriver.Chrome() 89 | browser.get(url) 90 | pc_midstep = browser.find_element_by_class_name('num').text 91 | play_count = ' '.join(re.findall('\d+',pc_midstep)) 92 | 
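# The feed response above is parsed by rewriting 'true'/'false'/'null' and calling eval(),
# a pattern that recurs across these crawlers and executes whatever string the server returns.
# A small sketch of the safer equivalent with json.loads, which already maps true/false/null
# to True/False/None (json is imported at the top of this module):
import json

def parse_feed_page(page_text):
    page_dic = json.loads(page_text)          # no string rewriting, no eval()
    video_agg = page_dic.get('data', [])
    # each entry's 'content' field is itself a JSON string in the feed handled above
    return [json.loads(line['content']) for line in video_agg if 'content' in line]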
93 | release_time = int(video_dic['publish_time']*1e3) 94 | play_count2 = video_dic['read_count'] 95 | releaser = video_dic['media_name'] 96 | max_behot_time = video_dic['behot_time'] 97 | video_id = video_dic['item_id'] 98 | releaser_id = video_dic['user_info']['user_id'] 99 | fetch_time = int(datetime.datetime.now().timestamp()*1e3) 100 | 101 | D0={'title':title,'url':url,'release_time':release_time,'releaser':releaser,'play_count':play_count, 102 | 'video_id':video_id,'releaser_id':releaser_id,'fetch_time':fetch_time,'play_count2':play_count2} 103 | 104 | result_Lst.append(D0) 105 | print ('get one video') 106 | except: 107 | pass 108 | browser.close() 109 | self.output_result(result_Lst,output_to_file=output_to_file,filepath=filepath) 110 | return result_Lst 111 | #result_Lst.clear() 112 | 113 | if __name__=='__main__': 114 | test=Crawler_Watermelon() 115 | output_to_file = True 116 | filepath = 'D:/CSM3.0/爬虫结果/watermelon' 117 | gogogo = test.get_list_video(output_to_file,filepath) -------------------------------------------------------------------------------- /crawler_sys/site_crawler/crawler_weibo/cookie_pool: -------------------------------------------------------------------------------- 1 | ['_T_WM=9ff1a76df3fb08baf991c83c0ecfdc93', 'SSOLoginState=1565062981', 'SUHB=0S7yXlcJlzydaX', 'SCF=AhdzjhXNYWCBvPKwVWrXOk-g-JBS9ojbzjFzZ-ppbfmgNLRAwcuir4WiJVxmdGuC4EPxXqTVzYsP2sVEbUHXojg.', 'SUB=_2A25wTIcVDeRhGeFM7lcY8ivEwzyIHXVTzildrDV6PUJbkdANLUjikW1NQN3cyJOFrdgYFfLOWkaG5qHFYk_I9rk1'] 2 | -------------------------------------------------------------------------------- /crawler_sys/site_crawler/crawler_weibo/ghostdriver.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_sys/site_crawler/crawler_weibo/ghostdriver.log -------------------------------------------------------------------------------- /crawler_sys/site_crawler/crawler_weibo/login.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # @Time : 2019/8/6 10:47 3 | # @Author : litao 4 | from selenium import webdriver 5 | from selenium.webdriver.common.by import By 6 | from selenium.webdriver.support.ui import WebDriverWait 7 | from selenium.webdriver.support import expected_conditions as EC 8 | import sys,os 9 | 10 | class WeiboLogin(): 11 | def __init__(self, username, password): 12 | # os.system('pkill -f phantom') 13 | chrome_options = webdriver.ChromeOptions() 14 | # chrome_options.add_argument('--headless') 15 | # chrome_options.add_argument('--disable-gpu') 16 | # driver = webdriver.Remote(command_executor='http://192.168.18.11:4444/wd/hub', 17 | # desired_capabilities=DesiredCapabilities.CHROME) 18 | self.url = 'https://passport.weibo.cn/signin/login?entry=mweibo&r=https://weibo.cn/' 19 | self.browser = webdriver.Chrome(r'../chromedriver.exe', options=chrome_options) 20 | #self.browser.set_window_size(1050, 840) 21 | self.wait = WebDriverWait(self.browser, 20) 22 | self.username = username 23 | self.password = password 24 | 25 | def open(self): 26 | """ 27 | 打开网页输入用户名密码并点击 28 | :return: None 29 | """ 30 | self.browser.get(self.url) 31 | username = self.wait.until(EC.presence_of_element_located((By.ID, 'loginName'))) 32 | password = self.wait.until(EC.presence_of_element_located((By.ID, 'loginPassword'))) 33 | submit = self.wait.until(EC.element_to_be_clickable((By.ID, 'loginAction'))) 34 | username.send_keys(self.username) 35 | 
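# WeiboLogin.run() further down returns a '; '-joined cookie string and __main__ appends it
# to ./cookie_pool (the file shown above). A minimal sketch of how a crawler might pull one
# of those saved cookies into its request headers -- the path and the random-rotation policy
# are assumptions, not part of this module:
import random

def pick_weibo_cookie(pool_path="./cookie_pool"):
    with open(pool_path, "r", encoding="utf-8") as f:
        cookies = [line.strip() for line in f if line.strip()]
    return random.choice(cookies) if cookies else ""

# example: headers = {"Cookie": pick_weibo_cookie(), "User-Agent": "Mozilla/5.0 ..."}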
password.send_keys(self.password) 36 | submit.click() 37 | 38 | def run(self): 39 | """ 40 | 破解入口 41 | :return: 42 | """ 43 | self.open() 44 | WebDriverWait(self.browser, 30).until( 45 | EC.title_is('我的首页') 46 | ) 47 | cookies = self.browser.get_cookies() 48 | cookie = [item["name"] + "=" + item["value"] for item in cookies] 49 | cookie_str = '; '.join(item for item in cookie) 50 | self.browser.quit() 51 | return str(cookie_str) 52 | 53 | if __name__ == '__main__': 54 | user_name = '13910233534' 55 | password = 'Lemo1995' 56 | cookie_str = WeiboLogin(user_name, password).run() 57 | print(cookie_str,type(cookie_str)) 58 | with open("./cookie_pool","a",encoding="utf-8") as f: 59 | f.write(cookie_str+"\n") 60 | -------------------------------------------------------------------------------- /crawler_sys/site_crawler/crawler_xinhua.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Dec 6 21:50:08 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | 9 | import requests 10 | 11 | url = 'https://xhpfmapi.zhongguowangshi.com/v500/core/indexlist' 12 | 13 | headers = {"Accept-Encoding": "*", 14 | "Content-Type": "application/json; charset=UTF-8", 15 | "Content-Length": "870", 16 | "Host": "xhpfmapi.zhongguowangshi.com", 17 | "Connection": "Keep-Alive", 18 | "User-Agent": "okhttp/3.10.0"} 19 | 20 | 21 | post_dict = ('{"param":"vPCnptKLNQB9Bldt193BqMWKnVNcVmaQ4pcJ6k8iV5wmESJSidV85U3rbdOK' 22 | 'e+jmLQUDdj8Ca9QuYHizMT6l7Vq3CKOBgtNieEbRQihk10GoWKkS+L9BWI9M ' 23 | 'd7Iq3IT/HL4saSsu6fkEe3PYjuA6EXKGPF6EThfdaQ3CyGPP+KhVFGp3C1Fk ' 24 | '9U50pbC0HflUGTK1iH7U7A9ZKNjhNd07/U49uddfqZ8OFbZjBw4mwRij0tGP ' 25 | 'F5jXpIBKoNsSnpWNTQnL86VCLd7+9jmQ+PzCkdmEiYNqRPYzXv+ihAhvBNY9 ' 26 | 'yg/18dVE1+zwHR685iDwabVLWppSxacQJr7iz1uuc6O0hoIWiUBkhDcUsD8S ' 27 | 'O3/DXZ8PgLMEvagO4TfS4AqMy/n5rW5UwzYr+x8jKDpr0kNjrUUH2vvXlQr6 ' 28 | '+TfqwS5qHm6+nGCLoyv7HyMoOmvSraDmg3OD66rGTol/Ri9NYlSrNpyYemGG ' 29 | 'fAuUuZ1pIzNeTzpF02TvzF2OQ8T2iROkmUK8iSXAuViE4I+KNKq959APFJ6Y ' 30 | 'Fs+i7nfZ0d2/5jonslonNYfHqAtotAuciwEW+fqy9aZpHMRQaU5XjUeVV0Sk ' 31 | 'dKiY2SveT59VTiOBgi8fy+q096BYfC+vPBp780AE0A8UnpgWz6LdyyP4GR0J ' 32 | 'JTVN6F4TiPnD5mA7Lr4fpUPlVtNt8BfGKS/AkIGD+BaLVxArmlbC/6A9/caI ' 33 | 'ldYWwg4yIu+CLjkDtif2NUwsqgfbgfbj/pbhPZGVzC+KjCP382OmQHKM4HoO ' 34 | 'tnJueIoPIb14EwBSz98qmJ6tMBJa2BxsSVbKV076QBE7qNiJF6ZFBimSwob8 ' 35 | 'upM="}') 36 | 37 | get_page = requests.post(url, data=post_dict, headers=headers, verify=False) 38 | 39 | page = get_page.json() -------------------------------------------------------------------------------- /crawler_sys/site_crawler/toutiao_get_signature.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # @Time : 2019/4/10 12:05 3 | # @Author : litao 4 | import time 5 | import hashlib 6 | # import execjs 7 | 8 | # def get_js(id): 9 | # # f = open(r"D:\work_file\test\index.js", 'r', encoding='UTF-8') 10 | # f = open(r"D:\work_file\work_file\crawler\crawler_sys\site_crawler\toutiao_test_js.js", 'r', encoding='UTF-8') 11 | # line = f.readline() 12 | # htmlstr = '' 13 | # while line: 14 | # htmlstr = htmlstr + line 15 | # line = f.readline() 16 | # ctx = execjs.compile(str(htmlstr)) 17 | # return ctx.call('get_as_cp_signature',id) 18 | 19 | def getHoney(): # 根据JS脚本破解as ,cp 20 | t = int(time.time()) # 获取当前时间 21 | # t=1534389637 22 | # print(t) 23 | e = str('%X' % t) # 格式化时间 24 | # print(e) 25 | m1 = hashlib.md5() # MD5加密 26 | m1.update(str(t).encode(encoding='utf-8')) # 转化格式 27 | i = 
str(m1.hexdigest()).upper() # 转化大写 28 | # print(i) 29 | n = i[0:5] # 获取前5位字符 30 | a = i[-5:] # 获取后5位字符 31 | s = '' 32 | r = '' 33 | for x in range(0, 5): # 交叉组合字符 34 | s += n[x] + e[x] 35 | r += e[x + 3] + a[x] 36 | eas = 'A1' + s + e[-3:] 37 | ecp = e[0:3] + r + 'E1' 38 | # print(eas) 39 | # print(ecp) 40 | return eas, ecp 41 | 42 | 43 | -------------------------------------------------------------------------------- /crawler_sys/site_crawler_by_redis/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # @Time : 2019/6/20 13:52 3 | # @Author : litao 4 | -------------------------------------------------------------------------------- /crawler_sys/site_crawler_by_redis/chromedriver: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_sys/site_crawler_by_redis/chromedriver -------------------------------------------------------------------------------- /crawler_sys/site_crawler_by_redis/toutiao_article.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # @Time : 2020/5/15 15:11 3 | # @Author : litao 4 | from crawler.crawler_sys.site_crawler_by_redis.crawler_toutiao import Crawler_toutiao 5 | 6 | 7 | class Crawler_toutiao_article(Crawler_toutiao): 8 | def __init__(self): 9 | super().__init__() 10 | 11 | def releaser_page_by_time(self, start_time=None, end_time=None, url=None, allow=None, **kwargs): 12 | count_false = 0 13 | for res in self.article_page(url, proxies_num=kwargs.get("proxies_num")): 14 | video_time = res["release_time"] 15 | # print(res) 16 | if video_time: 17 | if start_time < video_time: 18 | if video_time < end_time: 19 | yield res 20 | else: 21 | count_false += 1 22 | if count_false > allow: 23 | break 24 | else: 25 | yield res 26 | count_false = 0 27 | for res in self.microheadlines_page(url, proxies_num=kwargs.get("proxies_num")): 28 | video_time = res["release_time"] 29 | print(video_time) 30 | if video_time: 31 | if start_time < video_time: 32 | if video_time < end_time: 33 | yield res 34 | else: 35 | count_false += 1 36 | if count_false > allow: 37 | break 38 | else: 39 | yield res 40 | 41 | if __name__ == "__main__": 42 | test = Crawler_toutiao_article() 43 | for a in test.article_page("https://www.toutiao.com/c/user/5821222208/#mid=5821222208"): 44 | print(a) -------------------------------------------------------------------------------- /crawler_sys/site_crawler_test/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # @Time : 2019/6/20 13:52 3 | # @Author : litao 4 | -------------------------------------------------------------------------------- /crawler_sys/special_propose/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue May 15 13:59:43 2018 4 | 5 | @author: hanye 6 | """ 7 | 8 | -------------------------------------------------------------------------------- /crawler_sys/special_propose/crawler.v_qq_comment.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Mar 27 15:09:29 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | import urllib.request 9 | 10 | def get_comment(last): 11 | comment_lst=[] 12 | while (type(last)!=bool): 13 | url = 
"http://coral.qq.com/article/"+str(targetid)+"/comment/v2?callback=_article"+str(targetid)+"commentv2&oriorder=o&pageflag=1&cursor="+str(last) 14 | headers = ('User-Agent','Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11') 15 | opener = urllib.request.build_opener() 16 | opener.addheaders = [headers] 17 | data = opener.open(url).read() 18 | page=str(data,encoding='utf-8') 19 | useful_infor=page.split('commentv2')[1] 20 | str_to_dic=useful_infor[1:-1] 21 | str_to_dic=str_to_dic.replace('true','True') 22 | str_to_dic=str_to_dic.replace('false','False') 23 | dic_all_infor=eval(str_to_dic) 24 | ttt=dic_all_infor['data'] 25 | last=ttt['last'] 26 | print(last) 27 | repcomment=ttt['repCommList'] 28 | if type(repcomment)==list: 29 | repcomment_lst=repcomment 30 | print('repcomment') 31 | elif type(repcomment)==dict: 32 | list(repcomment.values()) 33 | print('repcomment') 34 | else: 35 | repcomment=None 36 | if repcomment!=None: 37 | for yyy in repcomment_lst: 38 | if type(yyy)==list: 39 | for uu in yyy: 40 | content=uu['content'] 41 | parent=uu['parent'] 42 | publishdate=uu['time'] 43 | userid=uu['userid'] 44 | upcount=uu['up'] 45 | contentid=uu['id'] 46 | dadorson=2 47 | D2={'content':content,'userid':userid,'upcount':upcount,'publishdate':publishdate,'parent':parent,'contentid':contentid,'dadorson':dadorson} 48 | comment_lst.append(D2) 49 | else: 50 | content=uu['content'] 51 | parent=uu['parent'] 52 | publishdate=uu['time'] 53 | userid=uu['userid'] 54 | upcount=uu['up'] 55 | contentid=uu['id'] 56 | dadorson=2 57 | D2={'content':content,'userid':userid,'upcount':upcount,'publishdate':publishdate,'parent':parent,'contentid':contentid,'dadorson':dadorson} 58 | comment_lst.append(D2) 59 | else: 60 | print('no repcomment') 61 | comment=ttt['oriCommList'] 62 | for zzz in comment: 63 | content=zzz['content'] 64 | contentid=zzz['id'] 65 | upcount=zzz['up'] 66 | publishdate=zzz['time'] 67 | userid=zzz['userid'] 68 | parent=contentid 69 | dadorson=1 70 | D0={'content':content,'userid':userid,'upcount':upcount,'publishdate':publishdate,'contentid':contentid,'parent':parent,'dadorson':dadorson} 71 | comment_lst.append(D0) 72 | print('get all comment') 73 | return comment_lst 74 | 75 | if __name__=='__main__': 76 | last=0 77 | targetid=2426229062 78 | comment=get_comment(last) -------------------------------------------------------------------------------- /crawler_sys/special_propose/crawler.v_qq_danmu.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Mar 26 13:50:00 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | import requests 9 | 10 | 11 | def danmu(x,jyid,targetid): 12 | danmu_lst=[] 13 | pagenum_lst=[] 14 | for i in range(0,x): 15 | ttt=15*(1+i) 16 | pagenum_lst.append(ttt) 17 | for pagenum in pagenum_lst: 18 | try: 19 | url='https://mfm.video.qq.com/danmu?otype=json&callback=jQuery'+jyid+'×tamp='+str(pagenum)+'&target_id='+targetid+'&count=500&second_count=6&session_key=0%2C0%2C0' 20 | get_page=requests.get(url) 21 | get_page.encoding='utf-8' 22 | page=get_page.text 23 | length=len(jyid)+7 24 | prepage=page[length:-1] 25 | prepage=prepage.replace('\r','') 26 | prepage=prepage.replace('/n','') 27 | dicdicdic=eval(prepage) 28 | danmu_count=dicdicdic['count'] 29 | if danmu_count>1000: 30 | print(danmu_count) 31 | print(pagenum) 32 | print('get one page') 33 | get_danmu_lst=dicdicdic['comments'] 34 | for danmu in get_danmu_lst: 35 | 
commentid=danmu['commentid'] 36 | content=danmu['content'] 37 | timepoint=danmu['timepoint'] 38 | upcount=danmu['upcount'] 39 | opername=danmu['opername'] 40 | D0={'commentid':commentid,'content':content,'timepoint':timepoint,'upcount':upcount,'opername':opername} 41 | danmu_lst.append(D0) 42 | except SyntaxError: 43 | print(str(pagenum)+'there is sth wrong') 44 | return danmu_lst 45 | 46 | if __name__=='__main__': 47 | x=int((1*3600+31*60+28)/15) 48 | jyid='19103025125001255282_1522399545358' 49 | targetid='2434347230' 50 | video_data777=danmu(x,jyid,targetid) 51 | #one 52 | #1 33 13 53 | #https://mfm.video.qq.com/danmu?otype=json&callback=jQuery19104770781474841782_1522044823510×tamp=15&target_id=2431410170&count=80&second_count=6&session_key=0%2C0%2C0&_=1522044823541 54 | 55 | #two 56 | #1 31 28 57 | #https://mfm.video.qq.com/danmu?otype=json&callback=jQuery1910006379066561103097_1522048580301×tamp=15&target_id=2432862868&count=80&second_count=6&session_key=0%2C0%2C0&_=1522048580319 58 | 59 | #three 60 | #1 31 28 61 | #https://mfm.video.qq.com/danmu?otype=json&callback=jQuery19104977942731832877_1522048936005×tamp=45&target_id=2434347230&count=80&second_count=6&session_key=178328%2C326%2C1522048940&_=1522048936017 62 | 63 | #four 64 | #1 31 12 65 | #https://mfm.video.qq.com/danmu?otype=json&callback=jQuery191007914957214696439_1522048988303×tamp=15&target_id=2464055709&count=80&second_count=6&session_key=0%2C0%2C0&_=1522048988318 66 | 67 | #five 68 | #1 34 47 69 | #https://mfm.video.qq.com/danmu?otype=json&callback=jQuery19109570751887462264_1522049153524×tamp=45&target_id=2479936974&count=80&second_count=6&session_key=95212%2C150%2C1522049156&_=1522049153539 70 | 71 | #six 72 | #1 31 27 73 | #https://mfm.video.qq.com/danmu?otype=json&callback=jQuery1910025632186610303198_1522050281547×tamp=135&target_id=2497027899&count=80&second_count=6&session_key=135654%2C180%2C1522050285&_=1522050281568 74 | 75 | #seven 76 | #1 31 22 77 | #https://mfm.video.qq.com/danmu?otype=json&callback=jQuery191022889623011170301_1522050238126×tamp=15&target_id=2515637880&count=80&second_count=6&session_key=0%2C0%2C0&_=1522050238141 78 | 79 | 80 | #https://mfm.video.qq.com/danmu?otype=json&callback=jQuery19105157512767429676_1522391911686×tamp=15&target_id=2515637880&count=80&second_count=6&session_key=0%2C0%2C0&_=1522391911700 81 | 82 | #https://mfm.video.qq.com/danmu?otype=json&callback=jQuery19103025125001255282_1522399545358×tamp=105&target_id=2434347230&count=80&second_count=6&session_key=186386%2C332%2C1522399582&_=1522399545371 83 | -------------------------------------------------------------------------------- /crawler_sys/special_propose/eastnews.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Mar 13 15:21:47 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | import requests 9 | from bs4 import BeautifulSoup 10 | import re 11 | import datetime 12 | import pickle 13 | import pandas as pd 14 | 15 | class Crawler_v_qq_eastnews: 16 | 17 | def video_page(self, url): 18 | get_page=requests.get(url) 19 | get_page.encoding='utf-8' 20 | page = get_page.text 21 | soup = BeautifulSoup(page,'html.parser') 22 | try: 23 | title=soup.find('h1',{'class':'video_title _video_title'}).text 24 | title=title.replace('\n','') 25 | title=title.replace('\t','') 26 | except AttributeError: 27 | title=None 28 | try: 29 | releaser=soup.find('span',{'class':'user_name'}).text 30 | except: 31 | releaser=None 32 | try: 33 | 
releaserUrl=soup.find('a',{'class':'user_info'})['href'] 34 | except TypeError: 35 | releaserUrl=None 36 | try: 37 | video_intro=soup.find('meta',{'itemprop':'description'})['content'] 38 | except TypeError: 39 | video_intro=None 40 | try: 41 | midstep = soup.find("script",{"r-notemplate":"true"}).text 42 | try: 43 | duration = re.findall(r'"duration":[0-9]{1,10}', ','.join(re.findall(r'VIDEO_INFO.*"duration":[0-9]{1,10}', midstep)))[0].split(':')[1] 44 | except IndexError: 45 | duration = re.findall(r'"duration":"[0-9]{1,10}"', ','.join(re.findall(r'VIDEO_INFO.*"duration":"[0-9]{1,10}"', midstep)))[0].split(':')[1] 46 | duration=duration.replace('"','') 47 | duration=int(duration) 48 | except: 49 | print('Catched exception, didn\'t find duartion in var VIDEO_INFO') 50 | duration=0 51 | try: 52 | playcount = re.findall(r'"view_all_count":[0-9]{1,10}', ','.join(re.findall(r'VIDEO_INFO.*"view_all_count":[0-9]{1,10}', midstep)))[0].split(':')[1] 53 | except: 54 | print('Catched exception, didn\'t find view_all_count in var VIDEO_INFO') 55 | playcount=0 56 | retime=re.findall(r'"video_checkup_time":"\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d"', ','.join(re.findall(r'VIDEO_INFO.*"video_checkup_time":"\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d"', midstep)))[0].split('":"')[1].split(' ')[0] 57 | try: 58 | release_time=int(datetime.datetime.strptime(retime,'%Y-%m-%d').timestamp()*1e3) 59 | except ValueError: 60 | release_time=0 61 | except: 62 | duration=None 63 | playcount=None 64 | release_time=None 65 | D0={'title':title,'playcount':playcount,'releaser':releaser,'video_intro':video_intro,'release_time':release_time,'duration':duration,'releaserUrl':releaserUrl} 66 | return D0 67 | 68 | def search_page(self,totalpage): 69 | video_Lst=[] 70 | url_Lst=[] 71 | page_Lst=['https://v.qq.com/x/search/?ses=qid%3D_5hveCy5oWKS_b5d4GuLquXTO29F8LJnLcmNDpNkXFkeEr8UDB0g9g%26last_query%3D%E4%B8%9C%E6%96%B9%E6%96%B0%E9%97%BB%26tabid_list%3D0%7C11%7C8%7C7%26tabname_list%3D%E5%85%A8%E9%83%A8%7C%E6%96%B0%E9%97%BB%7C%E5%8E%9F%E5%88%9B%7C%E5%85%B6%E4%BB%96&q=%E4%B8%9C%E6%96%B9%E6%96%B0%E9%97%BB&stag=3&cur={}&cxt=tabid%3D0%26sort%3D1%26pubfilter%3D0%26duration%3D3'.format(str(i)) for i in range(1,totalpage)] 72 | for page_url in page_Lst: 73 | get_page=requests.get(page_url) 74 | print (page_url) 75 | get_page.encoding='utf-8' 76 | page = get_page.text 77 | soup = BeautifulSoup(page,'html.parser') 78 | tencent = soup.find_all("div", { "class" : "result_item result_item_h _quickopen" }) 79 | for data_line in tencent: 80 | try: 81 | ttt=data_line.find('span',{'title':'东方新闻'}).text 82 | except AttributeError: 83 | ttt=None 84 | if ttt==None: 85 | urls=None 86 | else: 87 | urls=data_line.h2.a['href'] 88 | get_page=requests.get(urls) 89 | print (urls) 90 | get_page.encoding='utf-8' 91 | page = get_page.text 92 | soup = BeautifulSoup(page,'html.parser') 93 | fff=soup.find_all('a',{'class':'figure_detail'}) 94 | for zzz in fff: 95 | urls1=zzz['href'] 96 | urls2='https://v.qq.com'+urls1 97 | url_Lst.append(urls2) 98 | for url in url_Lst: 99 | dicdic = self.video_page(url) 100 | dicdic['url']=url 101 | print(url) 102 | video_Lst.append(dicdic) 103 | return video_Lst 104 | 105 | 106 | if __name__=='__main__': 107 | v_qq_crawler = Crawler_v_qq_eastnews() 108 | search_page2=v_qq_crawler.search_page(totalpage=20) -------------------------------------------------------------------------------- /crawler_sys/special_propose/get_weibo_user_info.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 
-*- 2 | """ 3 | Created on Thu May 31 09:20:25 2018 4 | 5 | 龟速爬取 需要提速 6 | 7 | @author: fangyucheng 8 | """ 9 | 10 | 11 | from selenium import webdriver 12 | import re 13 | import json 14 | import time 15 | 16 | 17 | 18 | class Crawler_Get_Weibo_User_Info(): 19 | 20 | 21 | def from_file_to_list(self,filename): 22 | openfile = open(filename) 23 | task = [] 24 | for line in openfile: 25 | line_dic = line.replace('\n','') 26 | task.append(line_dic) 27 | return task 28 | 29 | 30 | 31 | def get_user_info(self,filename,resultname): 32 | result = open(resultname,'a') 33 | result_lst = [] 34 | task_lst = self.from_file_to_list(filename) 35 | browser = webdriver.Chrome() 36 | for url in task_lst: 37 | try: 38 | browser.get(url) 39 | browser.maximize_window() 40 | time.sleep(8) 41 | user_name = browser.find_element_by_class_name('username').text 42 | print('get user_name') 43 | user_info = browser.find_element_by_class_name('info').text 44 | print('get user_info') 45 | try: 46 | industry_detail_intro = browser.find_element_by_class_name('ul_detail').text 47 | pattern1 = '行业类别' 48 | pattern2 = '简介' 49 | pattern3 = '毕业于' 50 | try: 51 | industry = ' '.join(re.findall('行业类别.*',industry_detail_intro)).replace(pattern1,'').replace(' ','') 52 | print('get industry') 53 | except: 54 | industry = None 55 | try: 56 | detail_intro = ' '.join(re.findall('简介.*',industry_detail_intro)).replace(pattern2,'').replace(' ','').replace(':','') 57 | print('get detail_intro') 58 | except: 59 | detail_intro = None 60 | try: 61 | graduated_from = ' '.join(re.findall('毕业于.*',industry_detail_intro)).replace(pattern3,'').replace(' ','') 62 | print('get graduated_from') 63 | except: 64 | graduated_from = None 65 | except: 66 | pass 67 | followers_fans = browser.find_element_by_class_name('tb_counter').text.split('\n') 68 | followers = followers_fans[0] 69 | print('get followers') 70 | fans = followers_fans[2] 71 | print('get fans') 72 | weibo_num = followers_fans[4] 73 | print('get weibo_num') 74 | D0 = {'user_name':user_name, 75 | 'user_info':user_info, 76 | 'industry':industry, 77 | 'detail_intro':detail_intro, 78 | 'followers':followers, 79 | 'fans':fans, 80 | 'weibo_num':weibo_num, 81 | 'url':url, 82 | 'graduated_from':graduated_from} 83 | print('get one user') 84 | result_lst.append(D0) 85 | json_D0 = json.dumps(D0) 86 | result.write(json_D0) 87 | result.write('\n') 88 | result.flush() 89 | except: 90 | pass 91 | return result_lst 92 | 93 | 94 | 95 | if __name__=='__main__': 96 | test = Crawler_Get_Weibo_User_Info() 97 | filename='D:\CSM3.0\爬虫结果\weibo_user/weibo_user_list_try.txt' 98 | resultname='D:\CSM3.0\爬虫结果\weibo_user/weibo_user_info_20180531_2' 99 | firstV3 = test.get_user_info(filename,resultname) -------------------------------------------------------------------------------- /crawler_sys/special_propose/tocsv.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Mar 28 01:12:09 2018 4 | 5 | @author: Administrator 6 | """ 7 | 8 | 9 | import pandas as pd 10 | ttt=pd.DataFrame(comment) 11 | ttt['heiheihei']="'" 12 | ttt['id']=ttt['heiheihei']+ttt['contentid'] 13 | ttt['real_time']=pd.to_datetime(ttt['publishdate'],unit='s') 14 | try: 15 | ttt.to_csv('wuwuwu.csv',encoding='utf-8',index=False) 16 | except UnicodeEncodeError: 17 | pass 18 | -------------------------------------------------------------------------------- /crawler_sys/special_propose/whether_user_exists.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Jun 15 09:49:15 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | from selenium import webdriver 9 | from crawler_sys.utils.Metaorphosis import Meta 10 | import time 11 | import json 12 | 13 | 14 | def open_url(filename,resultname): 15 | result_file = open(resultname,'a') 16 | result_lst = [] 17 | browser = webdriver.Chrome() 18 | browser.maximize_window() 19 | time.sleep(60) 20 | meta = Meta() 21 | url_lst = meta.str_file_to_lst(filename) 22 | for url in url_lst: 23 | browser.get(url) 24 | time.sleep(6) 25 | try: 26 | user_name = browser.find_element_by_class_name('username').text 27 | except: 28 | user_name = None 29 | D0 = {'url':url,'user_name':user_name} 30 | json_D0 = json.dumps(D0) 31 | result_file.write(json_D0) 32 | result_file.write('\n') 33 | result_file.flush() 34 | result_lst.append(D0) 35 | return result_lst 36 | 37 | 38 | if __name__=='__main__': 39 | filename = 'D:/CSM3.0/打杂/whether_exists.txt' 40 | resultname = 'D:/CSM3.0/打杂/whether_exists' 41 | result=open_url(filename,resultname) 42 | -------------------------------------------------------------------------------- /crawler_sys/tools/add_releasers_video_data.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Sep 6 09:22:24 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | import time 9 | from crawler.crawler_sys.framework.platform_crawler_register import get_crawler 10 | from crawler.crawler_sys.utils import trans_format 11 | 12 | def get_target_releaser_video_info(file_name, 13 | output_to_es_raw=True, 14 | es_index=None, 15 | doc_type=None, 16 | releaser_page_num_max=10000): 17 | start_time = int(time.time()*1e3) 18 | task_lst = trans_format.csv_to_lst_with_headline(file_name) 19 | for line in task_lst: 20 | releaserUrl = line['releaserUrl'] 21 | platform = line['platform'] 22 | crawler = get_crawler(platform=platform) 23 | crawler_initialization = crawler() 24 | if platform == 'haokan': 25 | try: 26 | crawler_initialization.releaser_page(releaserUrl=releaserUrl, 27 | releaser_page_num_max=releaser_page_num_max, 28 | output_to_es_raw=True, 29 | es_index=es_index, 30 | doc_type=doc_type, 31 | fetchFavoriteCommnt=False) 32 | except: 33 | print(releaserUrl) 34 | else: 35 | try: 36 | crawler_initialization.releaser_page(releaserUrl=releaserUrl, 37 | releaser_page_num_max=releaser_page_num_max, 38 | output_to_es_raw=True, 39 | es_index=es_index, 40 | doc_type=doc_type) 41 | except: 42 | print(releaserUrl) 43 | end_time = int(time.time()*1e3) 44 | time_info = [start_time, end_time] 45 | return time_info 46 | 47 | if __name__ =='__main__': 48 | get_time = get_target_releaser_video_info(file_name=r'/home/zhouyujiang/cuowu3.csv', 49 | releaser_page_num_max=1000, 50 | es_index='crawler-data-raw', 51 | doc_type='doc' 52 | ) -------------------------------------------------------------------------------- /crawler_sys/tools/add_releasers_video_data_multi_process.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Sep 6 09:22:24 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | import time 9 | from multiprocessing import Pool 10 | from crawler.crawler_sys.framework.platform_crawler_register import get_crawler 11 | from crawler.crawler_sys.utils import trans_format 12 | from elasticsearch import Elasticsearch 13 | from 
elasticsearch.helpers import scan 14 | 15 | hosts = '192.168.17.11' 16 | port = 80 17 | user = 'zhouyujiang' 18 | passwd = '8tM9JDN2LVxM' 19 | http_auth = (user, passwd) 20 | 21 | es = Elasticsearch(hosts=hosts, port=port, http_auth=http_auth) 22 | 23 | def func_search_reUrl_from_target_index(platform, releaser): 24 | search_body = { 25 | "query": { 26 | "bool": { 27 | "filter": [ 28 | {"term": {"platform.keyword": platform}}, 29 | {"term": {"releaser.keyword": releaser}} 30 | ] 31 | } 32 | } 33 | } 34 | search_re = es.search(index='target_releasers', doc_type='doc', body=search_body) 35 | if search_re['hits']['total'] > 0: 36 | return search_re['hits']['hits'][0]['_source']['releaserUrl'] 37 | else: 38 | print('Can not found:', platform, releaser) 39 | return None 40 | 41 | def get_target_releaser_video_info(file_name, 42 | output_to_es_raw=True, 43 | es_index=None, 44 | doc_type=None, 45 | releaser_page_num_max=10000): 46 | start_time = int(time.time()*1e3) 47 | task_lst = trans_format.csv_to_lst_with_headline(file_name) 48 | pool = Pool(10) 49 | arg_dict = {"releaser_page_num_max": releaser_page_num_max, 50 | "output_to_es_raw": True, 51 | "es_index": es_index, 52 | "doc_type": doc_type} 53 | for line in task_lst: 54 | platform = line['platform'] 55 | releaser = line['releaser'] 56 | try: 57 | releaserUrl = line["releaserUrl"] 58 | except: 59 | releaserUrl = func_search_reUrl_from_target_index(platform, releaser) 60 | print("releaserUrl",releaserUrl) 61 | crawler_initialization = get_crawler(platform=platform) 62 | try: 63 | crawler = crawler_initialization().search_page 64 | pool.apply_async(crawler, args=(releaserUrl, ), kwds=arg_dict) 65 | except: 66 | continue 67 | pool.close() 68 | pool.join() 69 | end_time = int(time.time()*1e3) 70 | time_info = [start_time, end_time] 71 | return time_info 72 | 73 | if __name__ =='__main__': 74 | get_time = get_target_releaser_video_info(file_name=r'C:\Users\litao\Desktop\target_releasers - key_custom.csv', 75 | releaser_page_num_max=300, 76 | es_index='crawler-data-raw', 77 | doc_type='doc') -------------------------------------------------------------------------------- /crawler_sys/tools/add_search_keywords_to_es.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # @Time : 2019/7/19 11:29 3 | # @Author : litao 4 | 5 | import json 6 | import datetime 7 | import elasticsearch 8 | import hashlib 9 | import csv 10 | hosts = '192.168.17.11' 11 | port = 80 12 | user = 'zhouyujiang' 13 | passwd = '8tM9JDN2LVxM' 14 | http_auth = (user, passwd) 15 | es = elasticsearch.Elasticsearch(hosts=hosts, port=port, http_auth=http_auth) 16 | 17 | 18 | def write_es(Lst): 19 | count = 0 20 | bulk_all_body = "" 21 | doc_id_type = "all-time-url" 22 | header_Lst = Lst[0] 23 | linec = 1 24 | sha1 = hashlib.sha1() 25 | for line in Lst: 26 | if linec == 1: 27 | linec += 1 28 | continue 29 | linec += 1 30 | print(linec) 31 | line_dict = dict(zip(header_Lst, line)) 32 | dic = { 33 | "title": line_dict["title"], 34 | "timestamp": int(datetime.datetime.now().timestamp() * 1e3), 35 | "platform": line_dict["platform"], 36 | "page": line_dict["page"], 37 | } 38 | 39 | sha1.update((line_dict["title"]+line_dict["platform"]).encode("utf8")) 40 | bulk_head = '{"index": {"_id":"%s"}}' % sha1.hexdigest() 41 | data_str = json.dumps(dic, ensure_ascii=False) 42 | bulk_one_body = bulk_head + '\n' + data_str + '\n' 43 | bulk_all_body += bulk_one_body 44 | count += 1 45 | if count % 500 == 0: 46 | eror_dic = 
es.bulk(index=target_index, doc_type=target_type, 47 | body=bulk_all_body, request_timeout=500) 48 | bulk_all_body = '' 49 | if eror_dic['errors'] is True: 50 | print(eror_dic['items']) 51 | print(bulk_all_body) 52 | print(count) 53 | 54 | if bulk_all_body != '': 55 | eror_dic = es.bulk(body=bulk_all_body, 56 | index=target_index, 57 | doc_type=target_type, 58 | request_timeout=500) 59 | if eror_dic['errors'] is True: 60 | print(eror_dic) 61 | bulk_all_body = '' 62 | # print(platform, releaser, 'end_have:', len(wirte_set), 'add:', len(set_url)) 63 | 64 | 65 | if __name__ == '__main__': 66 | target_index = 'search_keywords' 67 | target_type = 'doc' 68 | 69 | m3 = open(r"D:\work_file\发布者账号\一次性需求附件\keywords.csv", "r", encoding="gb18030") 70 | file = csv.reader(m3) 71 | data = list(file) 72 | write_es(data) 73 | -------------------------------------------------------------------------------- /crawler_sys/tools/add_target_releasers_by_file.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Jun 13 11:57:40 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | import elasticsearch 9 | import json 10 | import time 11 | from crawler_sys.utils.releaser_url_check import test_releaserUrl 12 | from crawler_sys.utils import trans_format 13 | 14 | 15 | hosts = '192.168.17.11' 16 | port = 80 17 | user_id = 'fangyucheng' 18 | password = 'VK0FkWf1fV8f' 19 | http_auth = (user_id, password) 20 | lose_re_url = [] 21 | es = elasticsearch.Elasticsearch(hosts=hosts, port=port, http_auth=http_auth) 22 | 23 | test_lst = trans_format.csv_to_lst_with_headline('F:/add_target_releaser/album_playcnt/album_playcnt_002.csv') 24 | task_lst = [] 25 | 26 | for line in test_lst: 27 | if line['releaserUrl'] is not None: 28 | task_lst.append(line) 29 | 30 | bulk_all_body = '' 31 | 32 | poster = 'fangyucheng' 33 | test_re = test_releaserUrl(task_lst) 34 | 35 | for one_re in test_re: 36 | if one_re['True_or_False'] == 1: 37 | line_dic = {} 38 | post_by = poster 39 | post_time = int(time.time() * 1000) 40 | timestamp = int(time.time() * 1000) 41 | releaserUrl = one_re['releaserUrl'] 42 | platform = one_re['platform'] 43 | releaser = one_re['releaser'] 44 | try: 45 | album_play_count = one_re['album_play_count'] 46 | except: 47 | album_play_count = None 48 | _id = platform + '_' + releaser 49 | 50 | bulk_head = '{"index": {"_id":"%s"}}' % _id 51 | line_dic['is_valid'] = True 52 | line_dic['platform'] = platform 53 | line_dic['post_by'] = post_by 54 | if album_play_count is not None: 55 | line_dic['album_play_count'] = album_play_count 56 | line_dic['post_time'] = post_time 57 | line_dic['releaser'] = releaser 58 | line_dic['releaserUrl'] = releaserUrl 59 | line_dic['timestamp'] = timestamp 60 | data_str=json.dumps(line_dic, ensure_ascii=False) 61 | bulk_one_body = bulk_head + '\n' + data_str + '\n' 62 | bulk_all_body += bulk_one_body 63 | es.bulk(index='target_releasers', doc_type='doc', 64 | body=bulk_all_body, request_timeout=200) 65 | bulk_all_body = '' 66 | print('success') 67 | 68 | -------------------------------------------------------------------------------- /crawler_sys/tools/count_releaser_publish_num_and_refresh_crawler_frequency.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Dec 11 11:59:55 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | import json 9 | import time 10 | import elasticsearch 11 | import elasticsearch.helpers 12 | 13 | hosts = 
'192.168.17.11'
port = 80
user_id = 'fangyucheng'
password = 'VK0FkWf1fV8f'
http_auth = (user_id, password)

es = elasticsearch.Elasticsearch(hosts=hosts, port=port, http_auth=http_auth)

task_list = []
result_list = []

es_scan = elasticsearch.helpers.scan(es, index='target_releasers')

for line in es_scan:
    task_list.append(line)
print('the number of releasers is %s' % len(task_list))

bulk_all_body = ''
count = 0
# iterate over a copy so that task_list.remove() below cannot skip elements
for line in list(task_list):
    releaser_info = line['_source']
    platform = releaser_info['platform']
    releaser = releaser_info['releaser']
    search_body = {"query":{"bool":{"filter":[{"term":{"platform.keyword":platform}},
                                              {"term":{"releaser.keyword":releaser}},
                                              {"term":{"data_month":11}},
                                              {"term":{"data_year":2018}},
                                              {"term":{"stats_type.keyword":"new_released"}}]}}}

    es_search = es.search(index='releaser', doc_type='releasers',
                          body=search_body)
    if es_search['hits']['total'] != 0:
        hits = es_search['hits']['hits'][0]['_source']['video_num']
        releaser_info['Nov_2018'] = int(hits)
        print("releaser %s hit %s video in es" % (releaser, hits))
    else:
        releaser_info['Nov_2018'] = 0
        task_list.remove(line)
    total = releaser_info['Nov_2018']
    if total >= 900:
        releaser_info['frequency'] = 9
        print("%s frequency is 9" % releaser_info['releaser'])
    elif total >= 300:
        releaser_info['frequency'] = 3
        print("%s frequency is 3" % releaser_info['releaser'])
        count += 1
    else:
        releaser_info['frequency'] = 1
    _id = platform + '_' + releaser
    bulk_head = '{"index": {"_id":"%s"}}' % _id
    releaser_info['timestamp'] = int(time.time() * 1e3)
    data_str = json.dumps(releaser_info, ensure_ascii=False)
    bulk_one_body = bulk_head+'\n'+data_str+'\n'
    bulk_all_body += bulk_one_body
    es.bulk(index='target_releasers', doc_type='doc',
            body=bulk_all_body)
    bulk_all_body = ''
    print('write %s into es' % releaser)
--------------------------------------------------------------------------------
/crawler_sys/tools/crawler_url_video_info.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Fetch the play count and other stats of a video page by its url

@author: zhouyujiang
"""

import time
from crawler.crawler_sys.framework.platform_crawler_register import get_crawler
from crawler.crawler_sys.utils import trans_format


def get_target_video_info(platform, url):
    crawler = get_crawler(platform=platform)
    crawler_initialization = crawler()
    new_playcount = crawler_initialization.check_play_count_by_video_page(url=url)
    return new_playcount


#if __name__ =='__main__':
#    get_time = get_target_releaser_video_info(file_name=r'/home/zhouyujiang/cuowu3.csv',
#                                              releaser_page_num_max=1000,
#                                              es_index='crawler-data-raw',
#                                              doc_type='doc'
#                                              )
--------------------------------------------------------------------------------
/crawler_sys/tools/error.log:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_sys/tools/error.log
--------------------------------------------------------------------------------
/crawler_sys/tools/find_high_fre_releaser.py:
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Oct 23 10:03:53 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | import configparser 9 | from elasticsearch import Elasticsearch 10 | 11 | hosts = '192.168.17.11' 12 | port = 80 13 | user_id = 'fangyucheng' 14 | password = 'VK0FkWf1fV8f' 15 | http_auth = (user_id, password) 16 | es_connection = Elasticsearch(hosts=hosts, port=port, http_auth=http_auth) 17 | 18 | search_body = {"query":{"bool":{"filter":[{"term":{"platform.keyword":"haokan"}}]}}, 19 | "sort":[{"Nov_2018":{"order":"desc"}}]} 20 | 21 | es_search = es_connection.search(index='target_releasers', 22 | doc_type='doc', 23 | body=search_body, size=1000) 24 | 25 | es_data_lst = es_search['hits']['hits'] 26 | 27 | result_list = [] 28 | 29 | for line in es_data_lst: 30 | data_dic = line['_source'] 31 | result_list.append(data_dic) 32 | 33 | new_list = result_list[:40] 34 | 35 | result_list = [] 36 | 37 | releaser_dic = {} 38 | for line in new_list: 39 | releaser_dic[line['releaser']] = line['releaserUrl'] 40 | 41 | 42 | config = configparser.ConfigParser() 43 | config['haokan'] = releaser_dic 44 | 45 | with open ('high_fre.ini', 'w', encoding='utf-8') as ini: 46 | config.write(ini) 47 | 48 | 49 | 50 | #special task 51 | #for line in source_lst: 52 | # detail_lst = line['detail'] 53 | # csm_mdu = detail_lst[0]['csm_mdu'] 54 | # for detail_dic in detail_lst: 55 | # detail_dic.pop('csm_mdu') 56 | # line['csm_mdu'] = csm_mdu -------------------------------------------------------------------------------- /crawler_sys/tools/match_play_count_by_video_title.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Nov 23 13:48:33 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | from elasticsearch import Elasticsearch 9 | from crawler.crawler_sys.utils.trans_format import lst_to_csv 10 | 11 | hosts = '192.168.17.11' 12 | port = 80 13 | user_id = 'fangyucheng' 14 | password = 'VK0FkWf1fV8f' 15 | http_auth = (user_id, password) 16 | es_connection = Elasticsearch(hosts=hosts, port=port, http_auth=http_auth) 17 | 18 | unsolve_lst = [] 19 | result_lst2 =[] 20 | 21 | for line in task_list: 22 | url = line['url'] 23 | title = line['title'] 24 | search_body = {"query": {"bool": {"filter": [{"term": {"title.keyword": title}}]}}} 25 | search = es_connection.search(index="test2", doc_type="fyc1210", body=search_body) 26 | if search["hits"]["total"] == 0: 27 | unsolve_lst.append(url) 28 | print("can not get video data at %s" % url) 29 | else: 30 | video_data = search["hits"]["hits"][0]["_source"] 31 | result_lst2.append(video_data) 32 | print("get playcount at %s" % url) 33 | 34 | lst_to_csv(listname=result_lst2, 35 | csvname="F:/add_target_releaser/Nov/Sep2.csv") -------------------------------------------------------------------------------- /crawler_sys/tools/match_play_count_by_video_url.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Nov 23 14:44:07 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | import elasticsearch.helpers 9 | from elasticsearch import Elasticsearch 10 | from crawler.crawler_sys.utils.trans_format import lst_to_csv 11 | from crawler.crawler_sys.utils.trans_format import str_file_to_lst 12 | from crawler.crawler_sys.utils.trans_format import str_lst_to_file 13 | #from crawler.crawler_sys.utils.trans_format import 
csv_to_lst_with_headline 14 | 15 | hosts = '192.168.17.11' 16 | port = 80 17 | user_id = 'fangyucheng' 18 | password = 'VK0FkWf1fV8f' 19 | http_auth = (user_id, password) 20 | es_connection = Elasticsearch(hosts=hosts, port=port, http_auth=http_auth) 21 | 22 | 23 | def init_task_list(file_path=None): 24 | task_list = [] 25 | if file_path is None: 26 | es_scan = elasticsearch.helpers.scan(es_connection, index='album-play-count') 27 | for line in es_scan: 28 | video_dict = line['_source'] 29 | task_list.append(video_dict) 30 | return task_list 31 | else: 32 | task_list = str_file_to_lst(file_path) 33 | return task_list 34 | 35 | unsolve_lst = [] 36 | result_lst2 =[] 37 | 38 | 39 | task_list = str_file_to_lst('F:/add_target_releaser/album_play_count/dec') 40 | #task_list = init_task_list() 41 | 42 | 43 | for line in task_list: 44 | try: 45 | if type(line) == dict: 46 | url = line['url'] 47 | elif type(line) == str: 48 | url = line 49 | search_body = {"query": {"bool": {"filter": [{"term": {"url.keyword": url}}]}}} 50 | search = es_connection.search(index="test2", doc_type="dec", body=search_body) 51 | if search["hits"]["total"] == 0: 52 | unsolve_lst.append(url) 53 | print("can not get video data at %s" % url) 54 | else: 55 | video_data = search["hits"]["hits"][0]["_source"] 56 | result_lst2.append(video_data) 57 | print("get playcount at %s" % url) 58 | except: 59 | pass 60 | 61 | lst_to_csv(listname=result_lst2, 62 | csvname="F:/add_target_releaser/last_month/fix_play_count12242.csv") 63 | str_lst_to_file(unsolve_lst, 64 | filename="F:/add_target_releaser/last_month/unsolved") 65 | -------------------------------------------------------------------------------- /crawler_sys/tools/ocr_by_aliyun.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # @Time : 2020/5/29 15:52 3 | # @Author : litao 4 | 5 | 6 | import os,re 7 | from aliyunsdkcore.client import AcsClient 8 | from aliyunsdkcore.acs_exception.exceptions import ClientException 9 | from aliyunsdkcore.acs_exception.exceptions import ServerException 10 | from aliyunsdkocr.request.v20191230.RecognizeCharacterRequest import RecognizeCharacterRequest 11 | from crawler.crawler_sys.framework.config.oss_keyword import AccessKeyId,AccessKeySecret 12 | import oss2 13 | from viapi.fileutils import FileUtils 14 | 15 | 16 | endpoint = "oss-cn-beijing.aliyuncs.com" 17 | # region = "v-plus-scope.oss-cn-beijing.aliyuncs.com" 18 | region = "v-plus-scope" 19 | oss_url_expries = 3600 20 | access_key_id = AccessKeyId 21 | access_key_secret = AccessKeySecret 22 | bucket_name = "v-plus-scope" 23 | auth = oss2.Auth(access_key_id, access_key_secret) 24 | bucket = oss2.Bucket(auth, endpoint, bucket_name) 25 | client = AcsClient(AccessKeyId, AccessKeySecret, 'cn-shanghai') 26 | 27 | 28 | def put_obj(access_key_id, access_key_secret, region, bucket_name, object_name): 29 | fileobj_content = open("ocr_img/") 30 | # oss2.set_file_logger(log_file_path, 'oss2', logging.ERROR) 31 | auth = oss2.Auth(access_key_id, access_key_secret) 32 | bucket = oss2.Bucket(auth, region, bucket_name) 33 | bucket.put_object(object_name, fileobj_content) 34 | file_utils = FileUtils(AccessKeyId, AccessKeySecret) 35 | oss_url = file_utils.get_oss_url("http://xxx.jpeg", "jpg", False) 36 | 37 | def ocr_from_aliyun(file_name=""): 38 | request = RecognizeCharacterRequest() 39 | request.set_accept_format('json') 40 | path_name = "ocr_img/%s"%file_name 41 | put_obj(AccessKeyId, AccessKeySecret, endpoint, region, path_name,) 42 | 
request.set_ImageURL("http://explorer-image.oss-cn-shanghai.aliyuncs.com/270450672578492833/2020-05-01+200210.png?OSSAccessKeyId=LTAI4Fk9FstqSEYnqKJ5Dpeo&Expires=1590740750&Signature=ZggX6U2%2F3WvpSUpR9P8EYrD0vbQ%3D") 43 | request.set_MinHeight(15) 44 | request.set_OutputProbability(True) 45 | 46 | response = client.do_action_with_exception(request) 47 | # python2: print(response) 48 | print(str(response, encoding='utf-8')) 49 | 50 | 51 | def file_path_scan(file_path): 52 | for filename in os.listdir(file_path): 53 | path = os.path.join(file_path, filename) 54 | if not os.path.isfile(path): 55 | continue 56 | title = img_to_str(path, lang=Languages.CHS) 57 | print(title) 58 | try: 59 | play_count = re.findall("\d+",title)[0] 60 | #print(play_count) 61 | except: 62 | #print(title) 63 | play_count= 0 64 | yield filename,play_count 65 | 66 | 67 | file_path = r'D:\work_file\word_file_new\litao\num' 68 | for filename,play_count in file_path_scan(file_path): 69 | time_str = filename.replace(".png","") 70 | time_str = time_str[0:13] +":"+ time_str[13:15]+":"+ time_str[15:] 71 | # print(time_str) 72 | print(time_str,play_count) -------------------------------------------------------------------------------- /crawler_sys/tools/ocr_by_img.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # @Time : 2020/5/29 10:11 3 | # @Author : litao 4 | 5 | from PIL import Image 6 | import pytesseract,os,re 7 | import cv2 8 | import argparse 9 | import cv2 10 | import os 11 | # construct the argument parse and parse the arguments 12 | ap = argparse.ArgumentParser() 13 | ap.add_argument("-p", "--preprocess", type=str, default="thresh", 14 | help="type of preprocessing to be done") 15 | args = vars(ap.parse_args()) 16 | class Languages: 17 | CHS = 'chi_sim' 18 | ENG = 'eng' 19 | 20 | def img_to_str(image_path, lang=Languages.CHS): 21 | # img = Image.open(image_path) 22 | # width, height = img.size 23 | # img.show() 24 | # mode = img.mode 25 | 26 | # print(img.size) 27 | # thumb = img.crop((10,42,160,150)) 28 | # img.grab(0,0,250,200) 29 | # thumb.save("thumb.jpg") 30 | # image = cv2.imread(image_path) 31 | # gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) 32 | # # check to see if we should apply thresholding to preprocess the 33 | # # image 34 | # if args["preprocess"] == "thresh": 35 | # gray = cv2.threshold(gray, 0, 255, 36 | # cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1] 37 | # # make a check to see if median blurring should be done to remove 38 | # # noise 39 | # elif args["preprocess"] == "blur": 40 | # gray = cv2.medianBlur(gray, 3) 41 | # # write the grayscale image to disk as a temporary file so we can 42 | # # apply OCR to it 43 | # filename = "thumb.png" 44 | # cv2.imwrite(filename, gray) 45 | # thumb = img.crop((40, 30, 100, 70)) 46 | #img.grab((30, 30, 150, 80)) 47 | # thumb.save("thumb.jpg") 48 | # ,config="-psm 7 digits" 49 | img = Image.open(image_path) 50 | # thumb = img.crop((10,42,160,150)) 51 | # thumb = img.crop((40, 30, 100, 70)) 52 | thumb = img.crop((490, 0, 560, 60)) 53 | thumb.save("thumb.jpg") 54 | return pytesseract.image_to_string(thumb, lang,config="-psm 7 digits") 55 | 56 | def file_path_scan(file_path): 57 | for filename in os.listdir(file_path): 58 | path = os.path.join(file_path, filename) 59 | if not os.path.isfile(path): 60 | continue 61 | title = img_to_str(path, lang=Languages.CHS) 62 | print(title) 63 | try: 64 | play_count = re.findall("\d+",title)[0] 65 | #print(play_count) 66 | except: 67 | #print(title) 68 | 
play_count= 0 69 | yield filename,play_count 70 | 71 | 72 | file_path = r'D:\work_file\word_file_new\litao\num' 73 | for filename,play_count in file_path_scan(file_path): 74 | time_str = filename.replace(".png","") 75 | time_str = time_str[0:13] +":"+ time_str[13:15]+":"+ time_str[15:] 76 | # print(time_str) 77 | print(time_str,play_count) 78 | 79 | # print(img_to_str(r'D:\work_file\word_file_new\litao\screen\2020-04-16 202632.png', lang=Languages.CHS)) -------------------------------------------------------------------------------- /crawler_sys/tools/tudou_selenium.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | import datetime 3 | import re 4 | import pandas as pd 5 | 6 | 7 | class Craler_tudou(object): 8 | def __init__(self): 9 | chrome_options = webdriver.ChromeOptions() 10 | prefs = {"profile.managed_default_content_settings.images": 2} 11 | chrome_options.add_experimental_option("prefs", prefs) 12 | self.driver = webdriver.Chrome(chrome_options=chrome_options) 13 | 14 | 15 | @staticmethod 16 | def video_time(time_str): 17 | now = datetime.datetime.now() 18 | if "分钟前" in time_str: 19 | min_str = re.findall(r"(\d+)分钟前",time_str)[0] 20 | videotime = now - datetime.timedelta(minutes=int(min_str)) 21 | elif "小时前" in time_str: 22 | hour_str = re.findall(r"(\d+)小时前", time_str)[0] 23 | videotime = now - datetime.timedelta(hours=int(hour_str)) 24 | elif "昨天" in time_str: 25 | date_lis = time_str.split(" ") 26 | hours, mins = date_lis[1].split(":") 27 | last_day = now - datetime.timedelta(days=1) 28 | videotime = datetime.datetime(year=int(last_day.year), month=int(last_day.month), day=int(last_day.day), hour=int(hours), minute=int(mins)) 29 | elif "前天" in time_str: 30 | date_lis = time_str.split(" ") 31 | hours, mins = date_lis[1].split(":") 32 | last_day = now - datetime.timedelta(days=2) 33 | videotime = datetime.datetime(year=int(last_day.year), month=int(last_day.month), day=int(last_day.day), hour=int(hours), minute=int(mins)) 34 | elif "天前" in time_str: 35 | day_str = re.findall(r"(\d+)天前", time_str)[0] 36 | videotime = now - datetime.timedelta(days=int(day_str)) 37 | elif "刚刚" in time_str: 38 | videotime = now 39 | else: 40 | if str(now.year) in time_str: 41 | pass 42 | else: 43 | date_lis = time_str.split(" ") 44 | month,days = date_lis[0].split("-") 45 | hours,mins = date_lis[1].split(":") 46 | videotime = datetime.datetime(year=int(now.year),month=int(month),day=int(days),hour=int(hours),minute=int(mins)) 47 | 48 | # print(videotime.strftime("%Y-%m-%d %H:%M:%S")) 49 | return videotime 50 | 51 | def time_range_video_num(self,start_time,end_time,url_list): 52 | data_lis = [] 53 | info_lis = [] 54 | columns = [""] 55 | for dic in url_list: 56 | for res in self.get_page(dic["url"]): 57 | title,link,video_time = res 58 | print(res) 59 | if start_time < video_time < end_time: 60 | data_lis.append((title,link,video_time,dic["url"])) 61 | else: 62 | break 63 | csv_save = pd.DataFrame(data_lis) 64 | csv_save.to_csv("%s.csv" % (dic["platform"] + "_" + dic["releaser"]),encoding="GBK") 65 | info_lis.append([dic["platform"],dic["releaser"],len(data_lis)]) 66 | data_lis = [] 67 | csv_save = pd.DataFrame(info_lis) 68 | csv_save.to_csv("%s.csv" % (datetime.datetime.now().strftime("%Y-%m-%d %H-%M-%S")), encoding="GBK") 69 | self.driver.quit() 70 | 71 | def get_page(self,url): 72 | #video_page = self.driver.get(url) 73 | # js = 'window.open("%s");' % url 74 | # self.driver.execute_script(js) 75 | page_num = -1 76 | try: 77 | 
video_page = self.driver.get(url) 78 | while True: 79 | page_num += 1 80 | if page_num != 0: 81 | self.driver.find_element_by_class_name("next").click() 82 | video_lis = self.driver.find_elements_by_xpath("/html/body/div[2]/div/div[3]/div/div/div/div[2]/div/div/div/div[1]/div") 83 | for v in video_lis: 84 | v_a = v.find_element_by_xpath("./div[2]/a") 85 | title = v_a.get_attribute("title") 86 | link = v_a.get_attribute("href") 87 | video_time = self.video_time(v.find_element_by_class_name("v-publishtime").text) 88 | 89 | yield (title,link,video_time) 90 | 91 | except Exception as e: 92 | raise e 93 | print(e) 94 | print("page %s has no more data" % page_num) 95 | 96 | if __name__ == "__main__": 97 | test = Craler_tudou() 98 | url_lis = [ 99 | {"platform":"new_tudou", 100 | "url":"https://id.tudou.com/i/UNTk2NjE0MDM4NA==/videos?", 101 | "releaser":"酷娱文化先锋" 102 | }, 103 | {"platform": "new_tudou", 104 | "url": "https://id.tudou.com/i/UMTQ3MDM0MjAw/videos?", 105 | "releaser": "酷娱文化先锋" 106 | }] 107 | start_time = datetime.datetime(year=2019,month=6,day=6) 108 | end = datetime.datetime.now() 109 | test.time_range_video_num(start_time,end,url_lis) -------------------------------------------------------------------------------- /crawler_sys/tools/video_num_count.rar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_sys/tools/video_num_count.rar -------------------------------------------------------------------------------- /crawler_sys/tools/video_num_count/.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /workspace.xml -------------------------------------------------------------------------------- /crawler_sys/tools/video_num_count/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /crawler_sys/tools/video_num_count/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /crawler_sys/tools/video_num_count/.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /crawler_sys/tools/video_num_count/.idea/video_num_count.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 14 | -------------------------------------------------------------------------------- /crawler_sys/tools/video_num_count/2019-09-10.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_sys/tools/video_num_count/2019-09-10.csv -------------------------------------------------------------------------------- /crawler_sys/tools/video_num_count/count.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_sys/tools/video_num_count/count.csv -------------------------------------------------------------------------------- 
/crawler_sys/tools/video_num_count/count_main.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # @Time : 2019/6/11 15:26 3 | # @Author : litao 4 | import datetime 5 | 6 | from crawler.haokan_count import * 7 | from crawler.qq_video_count import * 8 | from crawler.toutiao_count import * 9 | from crawler.tudou_count import * 10 | from crawler.crawler_wangyi_news import * 11 | from crawler.crawler_tencent_news import * 12 | from concurrent.futures import ProcessPoolExecutor 13 | 14 | craler_site = { 15 | "haokan": Craler_haokan(), 16 | "toutiao": Craler_toutiao(), 17 | "腾讯视频": Craler_qq(), 18 | "new_tudou": Craler_tudou(), 19 | "网易新闻": Crawler_wangyi_news(), 20 | "腾讯新闻": Crawler_Tencent_News() 21 | } 22 | 23 | 24 | def start_count(releaser, platform, releaserUrl, re_s_t, re_e_t): 25 | craler = craler_site.get(platform) 26 | if craler: 27 | url_lis = [ 28 | { 29 | "platform": platform, 30 | "url": releaserUrl, 31 | "releaser": releaser 32 | } 33 | ] 34 | craler.time_range_video_num(re_s_t, re_e_t, url_lis) 35 | 36 | 37 | if __name__ == "__main__": 38 | miaopai_list = [] 39 | platform_dic = { 40 | "haokan": [], 41 | "toutiao": [], 42 | "腾讯视频": [], 43 | "new_tudou": [], 44 | "网易新闻": [], 45 | "腾讯新闻":[] 46 | } 47 | file = r'count.csv' 48 | #file = r'D:\wxfile\WeChat Files\litaolemo\FileStorage\File\2019-07\count(3).csv' 49 | now = int(datetime.datetime.now().timestamp() * 1e3) 50 | executor = ProcessPoolExecutor(max_workers=6) 51 | futures = [] 52 | with open(file, 'r', encoding="gb18030")as f: 53 | header_Lst = f.readline().strip().split(',') 54 | for line in f: 55 | line_Lst = line.strip().split(',') 56 | line_dict = dict(zip(header_Lst, line_Lst)) 57 | releaser = line_dict['releaser'] 58 | platform = line_dict['platform'] 59 | releaserUrl = line_dict['releaserUrl'] 60 | re_s_t = line_dict['开始时间'] 61 | re_e_t = line_dict['结束时间'] 62 | start_time_lis = re_s_t.split("/") 63 | end_time_lis = re_e_t.split("/") 64 | start_time_stamp = int(datetime.datetime(year=int(start_time_lis[0]), month=int(start_time_lis[1]), 65 | day=int(start_time_lis[2])).timestamp() * 1e3) 66 | end_time__stamp = int(datetime.datetime(year=int(end_time_lis[0]), month=int(end_time_lis[1]), 67 | day=int(end_time_lis[2])).timestamp() * 1e3) 68 | # future = executor.submit(start_count, releaser, platform, releaserUrl, start_time_stamp, end_time__stamp) 69 | # futures.append(future) 70 | start_count(releaser, platform, releaserUrl, start_time_stamp, end_time__stamp) 71 | executor.shutdown(True) 72 | print('+++>完成') 73 | # for future in futures: 74 | # print(future.result()) 75 | -------------------------------------------------------------------------------- /crawler_sys/tools/video_num_count/crawler/2019-09-10.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_sys/tools/video_num_count/crawler/2019-09-10.csv -------------------------------------------------------------------------------- /crawler_sys/tools/video_num_count/crawler/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # @Time : 2019/6/11 15:59 3 | # @Author : litao 4 | -------------------------------------------------------------------------------- /crawler_sys/tools/video_num_count/crawler/__pycache__/__init__.cpython-37.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_sys/tools/video_num_count/crawler/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /crawler_sys/tools/video_num_count/crawler/__pycache__/crawler_tencent_news.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_sys/tools/video_num_count/crawler/__pycache__/crawler_tencent_news.cpython-37.pyc -------------------------------------------------------------------------------- /crawler_sys/tools/video_num_count/crawler/__pycache__/crawler_wangyi_news.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_sys/tools/video_num_count/crawler/__pycache__/crawler_wangyi_news.cpython-37.pyc -------------------------------------------------------------------------------- /crawler_sys/tools/video_num_count/crawler/__pycache__/func_get_releaser_id.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_sys/tools/video_num_count/crawler/__pycache__/func_get_releaser_id.cpython-37.pyc -------------------------------------------------------------------------------- /crawler_sys/tools/video_num_count/crawler/__pycache__/haokan_count.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_sys/tools/video_num_count/crawler/__pycache__/haokan_count.cpython-37.pyc -------------------------------------------------------------------------------- /crawler_sys/tools/video_num_count/crawler/__pycache__/qq_video_count.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_sys/tools/video_num_count/crawler/__pycache__/qq_video_count.cpython-37.pyc -------------------------------------------------------------------------------- /crawler_sys/tools/video_num_count/crawler/__pycache__/toutiao_count.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_sys/tools/video_num_count/crawler/__pycache__/toutiao_count.cpython-37.pyc -------------------------------------------------------------------------------- /crawler_sys/tools/video_num_count/crawler/__pycache__/trans_duration_str_to_second.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_sys/tools/video_num_count/crawler/__pycache__/trans_duration_str_to_second.cpython-37.pyc -------------------------------------------------------------------------------- /crawler_sys/tools/video_num_count/crawler/__pycache__/trans_strtime_to_timestamp.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_sys/tools/video_num_count/crawler/__pycache__/trans_strtime_to_timestamp.cpython-37.pyc -------------------------------------------------------------------------------- /crawler_sys/tools/video_num_count/crawler/__pycache__/tudou_count.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_sys/tools/video_num_count/crawler/__pycache__/tudou_count.cpython-37.pyc -------------------------------------------------------------------------------- /crawler_sys/tools/video_num_count/crawler/haokan_青春旅社.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_sys/tools/video_num_count/crawler/haokan_青春旅社.csv -------------------------------------------------------------------------------- /crawler_sys/tools/video_num_count/crawler/trans_duration_str_to_second.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Sep 10 15:04:04 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | def trans_duration(duration_str): 9 | """suitable for 20:20, 20:20:10""" 10 | duration_lst = duration_str.split(':') 11 | if len(duration_lst) == 3: 12 | duration = int(int(duration_lst[0]) * 3600 + int(duration_lst[1]) * 60 + int(duration_lst[2])) 13 | return duration 14 | elif len(duration_lst) == 2: 15 | duration = int(int(duration_lst[0]) * 60 + int(duration_lst[1])) 16 | return duration 17 | -------------------------------------------------------------------------------- /crawler_sys/tools/video_num_count/crawler/首次运行.bat: -------------------------------------------------------------------------------- 1 | python -m pip install --upgrade pip 2 | python -m pip install xlutils 3 | python -m pip install requests 4 | python -m pip install bs4 5 | python -m pip install pandas -------------------------------------------------------------------------------- /crawler_sys/tools/video_num_count/haokan_看看新闻Knews.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_sys/tools/video_num_count/haokan_看看新闻Knews.csv -------------------------------------------------------------------------------- /crawler_sys/tools/write_feihua_json_to_es.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import datetime 4 | import elasticsearch 5 | from write_data_into_es.func_cal_doc_id import * 6 | hosts = '192.168.17.11' 7 | port = 80 8 | user = 'zhouyujiang' 9 | passwd = '8tM9JDN2LVxM' 10 | http_auth = (user, passwd) 11 | es = elasticsearch.Elasticsearch(hosts=hosts, port=port, http_auth=http_auth) 12 | from write_data_into_es.func_get_releaser_id import get_releaser_id 13 | 14 | def dic(): 15 | url_data = [] 16 | format_data = {} 17 | f = open("E:\M_2019-05-22_U_3", "r") 18 | for i in f.readlines(): 19 | url_data.append(json.loads(i)) 20 | for i in url_data: 21 | format_data[i["url"]] = "" 22 | print(len(format_data)) 23 | 24 | def get_data(): 25 | url_data = [] 26 | format_data = [] 27 | # url = 
"https://enlightent-backup.oss-cn-beijing.aliyuncs.com/csm/20190318-20190324/csm_douyin_20190318_20190324.json?Expires=1554243367&OSSAccessKeyId=TMP.AQG2JUd3g4Gv66npoCNJPVnH-r9yRqhMGwqJtilxiBCDsbRJJ4kTuiE_T17CMC4CFQC8gXq7WHE73SSE9s2DjpWzF7Y2TwIVAIeJz9r0QHkaPi8FGyzN1TXmsjvn&Signature=XsHnMu%2B4agHS6Z6tq%2B55WWaZjDk%3D" 28 | # res = requests.get(url) 29 | # with open("./url_json.json","w") as f: 30 | # f.write(res.text) 31 | 32 | # f = open("E:\M_2019-05-22_U_3", "r") 33 | # for i in f.readlines(): 34 | # url_data.append(json.loads(i)) 35 | for i in url_data: 36 | print(i) 37 | format_data.append( 38 | { 39 | "platform": i["platform"], 40 | "duration": i["duration"], 41 | "favorite_count": i["favorite"], 42 | "fetch_time": int(i["crawledtime"])*1000, 43 | "play_count": i["playtimes"], 44 | "release_time": i["releasetime"], 45 | "releaser": i["releaser"], 46 | "title": i["title"], 47 | "url": i["url"], 48 | "comment_count": i["commentnum"], 49 | "dislike_count": 0, 50 | "isOriginal": False, 51 | "releaserUrl": i["releaserurl"], 52 | "repost_count": 0, 53 | "timestamp": int(datetime.datetime.timestamp(datetime.datetime.now()))*1000, 54 | "data_provider": "fhtech", 55 | "channel": i["channel"], 56 | "releaser_id_str":"miaopai_" + get_releaser_id(platform="miaopai",releaserUrl=i["releaserurl"]) 57 | } 58 | ) 59 | return format_data 60 | # target_date_list = target_type.split('-') 61 | # target_date_start = datetime.datetime(int(target_date_list[-3]), int(target_date_list[-2]), 1) 62 | # target_date_end = datetime.datetime(int(target_date_list[-3]), int(target_date_list[-2]) + 1, 1) 63 | # target_ts_start = int(target_date_start.timestamp()) * 1000 64 | # target_ts_end = int(target_date_end.timestamp()) * 1000 65 | # print(target_ts_start) 66 | # print(target_ts_end) 67 | 68 | 69 | def write_es(file): 70 | count = 0 71 | bulk_all_body = "" 72 | doc_id_type = "all-time-url" 73 | for i in file: 74 | #print(i) 75 | # format_i = {} 76 | # a = "format_i = %s" % i 77 | # exec(a,format_i) 78 | format_i = json.loads(i) 79 | # format_i = format_i["format_i"] 80 | # print(format_i) 81 | try: 82 | _id = cal_doc_id(platform=format_i["platform"], url=format_i["url"], doc_id_type=doc_id_type,data_dict=format_i) 83 | format_i["timestamp"] = int(datetime.datetime.now().timestamp()*1e3) 84 | if len(str(format_i["release_time"])) != 13: 85 | print(format_i["release_time"]) 86 | format_i["release_time"] = int(format_i["release_time"] / 1000) 87 | format_i["releaser_id_str"] = "miaopai_" + get_releaser_id(platform="miaopai",releaserUrl=format_i["releaserUrl"]) 88 | bulk_head = '{"index": {"_id":"%s"}}' % _id 89 | except Exception as e: 90 | print(e) 91 | continue 92 | # find_exist = { 93 | # "query": { 94 | # "bool": { 95 | # "filter": [ 96 | # {"term": {"_id":_id }} 97 | # ] 98 | # } 99 | # } 100 | # } 101 | # search_re = es.search(index=target_index, doc_type=target_type, 102 | # body=find_exist) 103 | # if search_re['hits']['total'] == 1: 104 | # if counti % 1000 == 0: 105 | # print("done ", counti,"\n") 106 | # return None 107 | # else: 108 | # pass 109 | 110 | data_str = json.dumps(format_i, ensure_ascii=False) 111 | bulk_one_body = bulk_head + '\n' + data_str + '\n' 112 | bulk_all_body += bulk_one_body 113 | count += 1 114 | print("find",count) 115 | if count % 1000 == 0: 116 | eror_dic = es.bulk(index=target_index, doc_type=target_type, 117 | body=bulk_all_body, request_timeout=200) 118 | bulk_all_body = '' 119 | if eror_dic['errors'] is True: 120 | print(eror_dic['items']) 121 | print(bulk_all_body) 122 | 
print(count) 123 | 124 | if bulk_all_body != '': 125 | eror_dic = es.bulk(body=bulk_all_body, 126 | index=target_index, 127 | doc_type=target_type, 128 | request_timeout=200) 129 | if eror_dic['errors'] is True: 130 | print(eror_dic) 131 | bulk_all_body = '' 132 | #print(platform, releaser, 'end_have:', len(wirte_set), 'add:', len(set_url)) 133 | 134 | 135 | if __name__ == '__main__': 136 | target_index = 'short-video-all-time-url' 137 | target_type = 'all-time-url' 138 | m3 = open(r"C:\Users\litao\Desktop\csv\202002\M_2020-02-04_U_3", "r", encoding="utf-8") 139 | # f = open("exists", "a+") 140 | write_es(m3) 141 | -------------------------------------------------------------------------------- /crawler_sys/tools/头条重复数据导出.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Feb 14 16:12:57 2019 4 | 5 | @author: zhouyujiang 6 | 7 | 查找切片中头条发布者+发布时间+duration相同的数据 8 | """ 9 | 10 | import pandas as pd 11 | import datetime 12 | import elasticsearch 13 | from elasticsearch.helpers import scan 14 | from crawler_url_video_info import get_target_video_info 15 | hosts='192.168.17.11' 16 | port=80 17 | user='zhouyujiang' 18 | passwd='8tM9JDN2LVxM' 19 | http_auth=(user, passwd) 20 | es=elasticsearch.Elasticsearch(hosts=hosts, port=port, http_auth=http_auth) 21 | 22 | 23 | 24 | zyj_set = set() 25 | zyj_dict = {} 26 | index = 'short-video-weekly' 27 | doc_type = 'daily-url-2019_w07_s1' 28 | re_s_t = 1549728000000 29 | re_e_t = 1550332800000 30 | count = 0 31 | sacn_body = { 32 | "query": { 33 | "bool": { 34 | "filter": [ 35 | {"term": {"platform.keyword": 'toutiao'}}, 36 | {"range": {"release_time": {"gte": re_s_t,"lt":re_e_t}}} 37 | ] 38 | } 39 | } 40 | } 41 | scan_re = scan(client=es, index=index, doc_type=doc_type, 42 | query=sacn_body, scroll='3m') 43 | for one in scan_re: 44 | count = count +1 45 | if count %1000 == 0: 46 | print(count) 47 | line = one['_source'] 48 | releaser = line['releaser'] 49 | release_time = line['release_time'] 50 | duration = line['duration'] 51 | zyj_id = releaser + str(release_time) + str(duration) 52 | if zyj_id not in zyj_dict: 53 | zyj_dict[zyj_id] = [] 54 | zyj_dict[zyj_id].append(line) 55 | else: 56 | zyj_set.add(zyj_id) 57 | zyj_dict[zyj_id].append(line) 58 | re_list = [] 59 | for one_key in zyj_set: 60 | for one_value in zyj_dict[one_key]: 61 | # url = one_value['url'] 62 | # new_playcount = get_target_video_info(url=url, platform='toutiao') 63 | # one_value['new_playcount'] = new_playcount 64 | re_list.append(one_value) 65 | 66 | 67 | 68 | 69 | 70 | data = pd.DataFrame(re_list) 71 | data.to_csv('头条7zhou重复数据重新抓取播放量.csv') 72 | 73 | -------------------------------------------------------------------------------- /crawler_sys/utils/__init__ .py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue June 6 18:02:43 2018 4 | 5 | @author: hanye 6 | """ 7 | 8 | -------------------------------------------------------------------------------- /crawler_sys/utils/connect_with_es.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Jun 6 18:18:09 2018 4 | 5 | @author: hanye 6 | """ 7 | #import redis 8 | #from crawler_sys.framework.platform_redis_register import get_redis_list_name 9 | from crawler.crawler_sys.framework.es_crawler import scan_crawler_url_register 10 | 11 | #rds = redis.StrictRedis(host='192.168.17.60', 
port=6379, db=0) 12 | 13 | def pull_url_from_es(platform, release_time_lower_bdr=None): 14 | """ 15 | Just pull urls from es index crawler-url-register. 16 | Url reforming things will be done in the method who 17 | is responsible for pushing urls into redis. 18 | Just return url and its platform 19 | """ 20 | if release_time_lower_bdr is None: 21 | release_time_lower_bdr = 0 22 | else: 23 | pass 24 | search_body = {"query": {"bool": {"filter": [{"range": {"release_time": 25 | {"gte": release_time_lower_bdr}}}, 26 | {"term": {"platform.keyword": platform}}]}}} 27 | total_hit, scan_resp = scan_crawler_url_register(search_body) 28 | batch_url_Lst = [] 29 | if total_hit > 0: 30 | line_counter = 0 31 | for line in scan_resp: 32 | line_counter += 1 33 | line_d = line['_source'] 34 | url = line_d['url'] 35 | batch_url_Lst.append(url) 36 | else: 37 | pass 38 | return batch_url_Lst 39 | 40 | 41 | #def url_reformer(platform, url): 42 | # """ 43 | # to reform url according to platform, in the future. 44 | # Say, a url of http://www.toutiao.com/group/1234567890123456789 45 | # as a string is different from http://www.365yg.com/u/1234567890123456789, 46 | # but they point to the same resource. They should be reformed 47 | # to one unique url before pushing into redis for futher crawling. 48 | # """ 49 | # reformed_url = url 50 | # return reformed_url 51 | # 52 | #def feed_url_into_redis(dict_Lst, platform, 53 | # release_time_lower_bdr=None, 54 | # batch_str=None): 55 | # """ 56 | # release_time_lower_bdr must be an int value represent 57 | # timestamp in milliseconds if given. 58 | # All url that is released before release_time_lower_bdr 59 | # will not be pushed into redis. If argument release_time_lower_bdr 60 | # is not given when call this function, all urls will be 61 | # pushed into redis. 62 | # """ 63 | # redis_list_name = get_redis_list_name(platform, batch_str) 64 | # if redis_list_name is None: 65 | # print('Failed to get correct redis list name ' 66 | # 'in platform_redis_register for platform: ' 67 | # % platform) 68 | # return (None, None) 69 | # else: 70 | # print('Feeding url into redis list %s ...' % redis_list_name) 71 | # url_counter = 0 72 | # for data_dict in dict_Lst: 73 | # try: 74 | # url = data_dict['url'] 75 | # url_reformed = url_reformer(platform, url) 76 | # if release_time_lower_bdr is None: 77 | # sadd_c = rds.sadd(redis_list_name, url_reformed) 78 | # url_counter += sadd_c 79 | # else: 80 | # url_release_time = data_dict['release_time'] 81 | # if url_release_time >= release_time_lower_bdr: 82 | # sadd_c = rds.sadd(redis_list_name, url_reformed) 83 | # url_counter += sadd_c 84 | # except: 85 | # print('Failed to push url into redis, ' 86 | # 'might because of lack of url field ' 87 | # 'or lack of release_time field, or ' 88 | # 'has wrong typed release_time value. 
' 89 | # 'The failed data dict is: \n %s' % data_dict) 90 | # print('Pushed %d urls into redis' % url_counter) 91 | # return (redis_list_name, url_counter) 92 | 93 | -------------------------------------------------------------------------------- /crawler_sys/utils/date_calculator.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Nov 21 09:08:29 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | import datetime 9 | 10 | def calculator(shifting_days=30, 11 | shifting_hours=0, 12 | shifting_minutes=0): 13 | now = datetime.datetime.now() 14 | if shifting_hours == 0 and shifting_minutes == 0 and shifting_days != 0: 15 | date_shift = now - datetime.timedelta(days=shifting_days) 16 | date_shift_str = str(date_shift)[:10] 17 | date_wanted = datetime.datetime.strptime(date_shift_str, 18 | "%Y-%m-%d").timestamp() * 1e3 19 | return int(date_wanted) -------------------------------------------------------------------------------- /crawler_sys/utils/delete_repeat_data.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Sep 30 09:36:55 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | 9 | import pymysql 10 | from crawler_sys.utils.write_into_database import write_lst_into_database 11 | 12 | connection = pymysql.connect(host='localhost', 13 | user='root', 14 | passwd='goalkeeper@1', 15 | db='proxy_pool', 16 | port=3306, 17 | cursorclass=pymysql.cursors.DictCursor) 18 | cursor = connection.cursor() 19 | 20 | search_sql = "select * from proxy_pool" 21 | cursor.execute(search_sql) 22 | 23 | test_lst = cursor.fetchall() 24 | 25 | new_lst = [] 26 | ip_lst = [] 27 | for line in test_lst: 28 | if line['ip_address'] not in ip_lst: 29 | new_lst.append(line) 30 | ip_lst.append(line['ip_address']) 31 | 32 | delect_sql = "delete from proxy_pool where id >= 1" 33 | cursor.execute(delect_sql) 34 | connection.commit() 35 | 36 | write_lst_into_database(data_lst=new_lst, 37 | table_name='proxy_pool', 38 | host='localhost', 39 | passwd='goalkeeper@1') -------------------------------------------------------------------------------- /crawler_sys/utils/extract_data_from_es.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 8 17:14:16 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | import elasticsearch 9 | 10 | 11 | hosts = '192.168.17.11' 12 | port = 80 13 | user_id = 'fangyucheng' 14 | password = 'VK0FkWf1fV8f' 15 | http_auth = (user_id, password) 16 | lose_re_url = [] 17 | es = elasticsearch.Elasticsearch(hosts=hosts, port=port, http_auth=http_auth) 18 | 19 | search_body = {"query": { 20 | "bool": { 21 | "filter": [ 22 | {"term": {"platform.keyword": "new_tudou"}}, 23 | {"term": {"post_by.keyword": "zhangqiongzi"}} 24 | ] 25 | } 26 | } 27 | } 28 | 29 | 30 | get_tr = es.search(index='target_releasers', body=search_body, size=200) 31 | 32 | result_lst = [] 33 | 34 | for line in get_tr['hits']['hits']: 35 | result_lst.append(line['_source']) -------------------------------------------------------------------------------- /crawler_sys/utils/fakechrome.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Aug 16 17:12:37 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | import random 9 | 10 | first_num = random.randint(55, 62) 11 | third_num = random.randint(0, 3200) 12 | 
fourth_num = random.randint(0, 140) 13 | 14 | class Fakechrome: 15 | os_type = ['(Windows NT 6.1; WOW64)', '(Windows NT 10.0; WOW64)', '(X11; Linux x86_64)', 16 | '(Macintosh; Intel Mac OS X 10_12_6)'] 17 | 18 | chrome_version = 'Chrome/{}.0.{}.{}'.format(first_num, third_num, fourth_num) 19 | 20 | @classmethod 21 | def get_ua(cls): 22 | return ' '.join(['Mozilla/5.0', random.choice(cls.os_type), 'AppleWebKit/537.36', 23 | '(KHTML, like Gecko)', cls.chrome_version, 'Safari/537.36']) 24 | 25 | headers = {'User-Agent': Fakechrome.get_ua(), 26 | 'Accept-Encoding': 'gzip, deflate, sdch', 27 | 'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3', 28 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 29 | 'Connection': 'keep-alive'} 30 | -------------------------------------------------------------------------------- /crawler_sys/utils/fix_album_play_count_by_releaser_page.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Nov 23 14:44:07 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | 9 | from elasticsearch import Elasticsearch 10 | from crawler.crawler_sys.utils.trans_format import lst_to_csv 11 | 12 | 13 | hosts = '192.168.17.11' 14 | port = 80 15 | user_id = 'fangyucheng' 16 | password = 'VK0FkWf1fV8f' 17 | http_auth = (user_id, password) 18 | es_connection = Elasticsearch(hosts=hosts, port=port, http_auth=http_auth) 19 | 20 | unsolve2_lst = [] 21 | result_lst2 =[] 22 | 23 | for url in unsolve_lst: 24 | search_body = {"query": {"bool": {"filter": [{"term": {"url.keyword": url}}]}}} 25 | search = es_connection.search(index="test2", doc_type="fyc1123", body=search_body) 26 | if search["hits"]["total"] == 0: 27 | unsolve2_lst.append(url) 28 | print("can not get video data at %s" % url) 29 | else: 30 | video_data = search["hits"]["hits"][0]["_source"] 31 | result_lst2.append(video_data) 32 | print("get playcount at %s" % url) 33 | 34 | lst_to_csv(listname=result_lst2, 35 | csvname="F:/add_target_releaser/Nov/get_playcount_by_releaser2.csv") -------------------------------------------------------------------------------- /crawler_sys/utils/get_toutiao_as_cp_signature.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed May 23 12:37:48 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | import requests 9 | # import js2py 10 | import hashlib 11 | 12 | 13 | def as_cp(user_id, max_behot_time): 14 | as_cp = [] 15 | t = js2py.eval_js('var t = Math.floor((new Date).getTime() / 1e3)') 16 | i = js2py.eval_js('var t = Math.floor((new Date).getTime() / 1e3),i = t.toString(16).toUpperCase()') 17 | e = hashlib.md5(str(t).encode('utf-8')).hexdigest() 18 | if len(i) != 8: 19 | var_as = "479BB4B7254C150" 20 | cp = "7E0AC8874BB0985" 21 | else: 22 | e = e.upper() 23 | s = e[0:5] 24 | o = e[-5:] 25 | 26 | n = '' 27 | a = 0 28 | while a < 5: 29 | n = n + s[a] + i[a] 30 | a += 1 31 | 32 | l = '' 33 | r = 0 34 | while r < 5: 35 | l = l + i[r+3] + o[r] 36 | r = r + 1 37 | var_as = 'A1' + n + i[-3:] 38 | cp = i[0:3] + l + 'E1' 39 | 40 | as_cp.append(var_as) 41 | as_cp.append(cp) 42 | return as_cp 43 | 44 | 45 | def signature(user_id, max_behot_time): 46 | jsurl = 'https://s3.pstatp.com/toutiao/resource/ntoutiao_web/page/profile/index_f62209a.js' 47 | get_page = requests.get(jsurl) 48 | get_page.encoding = 'utf-8' 49 | page = get_page.text 50 | effect_js = page.split('Function') 51 | js_1 = ('var navigator = 
{"userAgent":"Mozilla/5.0 (Windows NT 6.1; WOW64) ' 52 | 'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36"};') 53 | js_2 = 'Function' + effect_js[3] 54 | js_3 = 'Function' + effect_js[4] 55 | js_4 = ';function result(){ return TAC.sign('+user_id+''+max_behot_time+');} result();' 56 | js_total = js_1+js_2+js_3+js_4 57 | signature = js2py.eval_js(js_total) 58 | return signature 59 | -------------------------------------------------------------------------------- /crawler_sys/utils/output_log.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Dec 19 12:54:20 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | import logging 9 | formatter = logging.Formatter('%(asctime)s %(name)s %(filename)s ' 10 | '%(funcName)s %(levelname)s %(message)s') 11 | 12 | def init_logger(name, log_file, level=logging.INFO): 13 | """initialize logger""" 14 | #output log to file 15 | handler = logging.FileHandler(log_file) 16 | handler.setFormatter(formatter) 17 | #output log to screen 18 | console = logging.StreamHandler() 19 | console.setFormatter(formatter) 20 | #initialize logger 21 | logger = logging.getLogger(name) 22 | logger.setLevel(level) 23 | #add handler and console to logger 24 | logger.addHandler(handler) 25 | logger.addHandler(console) 26 | return logger -------------------------------------------------------------------------------- /crawler_sys/utils/parse_bool_for_args.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Jun 29 12:46:19 2018 4 | 5 | @author: hanye 6 | """ 7 | 8 | def parse_bool_for_args(arg_str): 9 | if arg_str.lower() in ('true', 'yes', 'y', '1'): 10 | return True 11 | elif arg_str.lower() in ('false', 'no', 'n', '0'): 12 | return False 13 | else: 14 | print('Illegal input! Bool like string values are needed.') 15 | return None 16 | -------------------------------------------------------------------------------- /crawler_sys/utils/releaser_url_check.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Aug 6 09:42:41 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | 9 | from crawler_sys.framework import platform_crawler_register 10 | 11 | 12 | def test_releaserUrl(test_lst): 13 | for line in test_lst: 14 | try: 15 | platform = line['platform'] 16 | platform_crawler = platform_crawler_register.get_crawler(platform) 17 | releaserUrl = line['releaserUrl'] 18 | try: 19 | platform_crawler().releaser_page(releaserUrl=releaserUrl, 20 | releaser_page_num_max=1) 21 | line['True_or_False'] = 1 22 | line['add_mess'] = 'correct' 23 | print('get releaser page') 24 | print(line) 25 | yield line 26 | except: 27 | line['True_or_False'] = 0 28 | line['add_mess'] = 'wrong_url' 29 | print('%s can not get vaild info' % releaserUrl) 30 | except: 31 | pass 32 | -------------------------------------------------------------------------------- /crawler_sys/utils/remove_unrelated_parameter_tudou.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Nov 26 13:14:03 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | import time 9 | import urllib 10 | import requests 11 | 12 | domain = 'http://www.tudou.com/api/getfeeds?' 
13 | #domain_simple = 'http://r.inews.qq.com/searchMore' 14 | #headers = {"Host": "r.inews.qq.com", 15 | # "Accept-Encoding": "gzip,deflate", 16 | # "Referer": "http://inews.qq.com/inews/android/", 17 | # "User-Agent": "%E8%85%BE%E8%AE%AF%E6%96%B0%E9%97%BB5410(android)", 18 | # "Cookie": "lskey=;luin=;skey=;uin=; logintype=0; main_login=qq;", 19 | # "Connection": "Keep-Alive"} 20 | 21 | url_dic = {'apptype': '27', 22 | 'pg': '8', 23 | 'module': '10', 24 | 'feedtype': '2', 25 | 'guid': '15010560241384qU', 26 | 'uid': '-1', 27 | 'secCateId': '10016', 28 | 'max_offset': '-1', 29 | 'min_offset': '-1', 30 | 'utdid': 'T8v9EQPOimUCAXL/Az0YrDOB', 31 | 'page_size': '24'} 32 | 33 | #sig = 'c022edce8ae72f053304412f13a9bb88' 34 | 35 | url = 'http://www.tudou.com/api/getfeeds?%s' % urllib.parse.urlencode(url_dic) 36 | 37 | """ 38 | raw-url 39 | 'http://r.inews.qq.com/searchMore?isoem=0&mid=74b9305504a047ab0a1901e2dfbf71f87f799819&dpi=270&devid=008796749793280&is_chinamobile_oem=0&mac=mac%2520unknown&real_device_width=5.06&store=17&screen_height=1440&real_device_height=9.0&apptype=android&origin_imei=008796749793280&orig_store=17&hw=etease_MuMu&appver=23_android_5.4.10&uid=54767d8bf41ac9a4&screen_width=810&sceneid=&omgid=818b2ebf4abcec4bc1c8bf737a1c131dede60010213210&timeline=1540802766&query=espn&activefrom=icon&qqnetwork=wifi&rom_type=&secId=2&Cookie=lskey%3D%3Bluin%3D%3Bskey%3D%3Buin%3D%3B+logintype%3D0%3B+main_login%3Dqq%3B&network_type=wifi&id=20181029A18Y3H00&global_info=1%7C0%7C0%7C0%7C1%7C1%7C1%7C1%7C0%7C6%7C1%7C1%7C1%7C1%7C0%7CJ060P000000000%3AB054P000011803%7C1402%7C0%7C0%7C-1%7C-1%7C0%7C0%7C0%7C%7C-1%7C-1%7C0%7C0%7C1%7C1%7C0%7C0%7C-1%7C0%7C2%7C0%7C2%7C0%7C0%7C0%7C0%7C0%7C0%7C0%7C0%7C2%7C0%7C0%7C0%7C0&imsi_history=0%2C460013199570862&omgbizid=a520b26ce7880445ab488481e3dd4949c74f0050213210&qn-rid=40ff49fc-e6ee-4384-8a7c-9ee507d57e47&qn-sig=c022edce8ae72f053304412f13a9bb88&page=2&type=0&imsi=460013199570862' 40 | """ 41 | 42 | get_page = requests.get(url) 43 | page = get_page.text 44 | 45 | key_lst = [] 46 | para_lst = [] 47 | 48 | for key, value in url_dic.items(): 49 | key_lst.append(key) 50 | 51 | for key in key_lst: 52 | value = url_dic[key] 53 | url_dic.pop(key) 54 | url = 'http://www.tudou.com/api/getfeeds?%s' % urllib.parse.urlencode(url_dic) 55 | get_page = requests.get(url) 56 | page = get_page.text 57 | time.sleep(5) 58 | if len(page) > 15000: 59 | print("%s can be moved from url, length of page is %s" % (key, len(page))) 60 | continue 61 | else: 62 | url_dic[key] = value 63 | print("key %s, value %s can't be moved from url" % (key, value)) 64 | 65 | #this the result 66 | final_url_dic = {'devid': '008796749793280', 67 | 'appver': '23_android_5.4.10', 68 | 'query': 'espn', 69 | 'qn-rid': '40ff49fc-e6ee-4384-8a7c-9ee507d57e47', 70 | 'qn-sig': 'c022edce8ae72f053304412f13a9bb88'} 71 | final_url = 'http://r.inews.qq.com/searchMore?%s' % urllib.parse.urlencode(final_url_dic) 72 | 73 | """ 74 | final_url 75 | 'http://r.inews.qq.com/searchMore?devid=008796749793280&appver=23_android_5.4.10&query=espn&qn-rid=40ff49fc-e6ee-4384-8a7c-9ee507d57e47&qn-sig=c022edce8ae72f053304412f13a9bb88' 76 | """ 77 | 78 | -------------------------------------------------------------------------------- /crawler_sys/utils/trans_duration_str_to_second.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Sep 10 15:04:04 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | def trans_duration(duration_str): 9 | """suitable for 20:20, 
20:20:10""" 10 | if type(duration_str) == int: 11 | return duration_str 12 | duration_lst = duration_str.split(':') 13 | if len(duration_lst) == 3: 14 | duration = int(int(duration_lst[0]) * 3600 + int(duration_lst[1]) * 60 + int(duration_lst[2])) 15 | return duration 16 | elif len(duration_lst) == 2: 17 | duration = int(int(duration_lst[0]) * 60 + int(duration_lst[1])) 18 | return duration 19 | else: 20 | return duration_lst[0] 21 | -------------------------------------------------------------------------------- /crawler_sys/utils/trans_format.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon May 14 13:35:26 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | 9 | import json 10 | import pandas as pd 11 | 12 | 13 | def dic_file_to_lst(filename): 14 | openfile = open(filename) 15 | task = [] 16 | for line in openfile: 17 | line_dic = json.loads(line) 18 | task.append(line_dic) 19 | return task 20 | 21 | 22 | def str_file_to_lst(filename): 23 | openfile = open(filename, 'r', encoding='utf-8') 24 | task = [] 25 | for line in openfile: 26 | line_str = line.replace('\n','') 27 | task.append(line_str) 28 | return task 29 | 30 | 31 | def csv_to_lst(csvname): 32 | openfile = open(csvname, 'r') 33 | task = [] 34 | for line in openfile: 35 | task.append(line) 36 | task_lst = [] 37 | for line in task: 38 | line_lst = line.split(',') 39 | task_lst.append(line_lst) 40 | return task_lst 41 | 42 | 43 | def csv_to_lst_with_headline(csvname): 44 | result_lst = [] 45 | openfile = open(csvname, 'r', encoding='gb18030') 46 | head = openfile.readline() 47 | head = head.replace('\n', '') 48 | head_lst = head.strip().split(',') 49 | for line in openfile: 50 | line = line.replace('\n', '') 51 | line_lst = line.strip().split(',') 52 | test_dict = dict(zip(head_lst,line_lst)) 53 | result_lst.append(test_dict) 54 | return result_lst 55 | 56 | 57 | def dic_lst_to_file(listname, filename): 58 | file = open(filename, 'a') 59 | for line in listname: 60 | json_line = json.dumps(line) 61 | file.write(json_line) 62 | file.write('\n') 63 | file.flush() 64 | file.close() 65 | 66 | 67 | def str_lst_to_file(listname, filename): 68 | file = open(filename, 'a', encoding='gb18030') 69 | for line in listname: 70 | file.write(line) 71 | file.write('\n') 72 | file.flush() 73 | file.close() 74 | 75 | 76 | def lst_to_csv_practical(lst_name, csv_name): 77 | dataframe=pd.DataFrame(lst_name) 78 | if len(str(lst_name[0]['release_time'])) == 10: 79 | dataframe['midstep'] = dataframe['release_time']+8*3600 80 | dataframe['realtime'] = pd.to_datetime(dataframe['midstep'],unit='s') 81 | else: 82 | dataframe['midstep'] = dataframe['release_time']+8*3600*1e3 83 | dataframe['realtime'] = pd.to_datetime(dataframe['midstep'],unit='ms') 84 | del dataframe['midstep'] 85 | del dataframe['release_time'] 86 | del dataframe['fetch_time'] 87 | dataframe.to_csv(csv_name, encoding='gb18030', index=False) 88 | 89 | 90 | def lst_to_csv(listname, csvname): 91 | dataframe = pd.DataFrame(listname) 92 | dataframe.to_csv(csvname, encoding='gb18030', index=False) 93 | 94 | 95 | def csv_to_file(filename, csvname): 96 | file = open(filename) 97 | task = [] 98 | for line in task: 99 | json_line = json.dumps(line) 100 | file.write(json_line) 101 | file.write('\n') 102 | file.flush() 103 | dataframe = pd.DataFrame(task) 104 | dataframe.to_csv(csvname, encoding='gb18030', index=False) 105 | file.close() 106 | 
-------------------------------------------------------------------------------- /crawler_sys/utils/trans_str_play_count_to_int.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Sep 10 14:41:54 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | def trans_play_count(play_count_str): 9 | """suitable for the format 22万, 22万次播放, 22.2万, 2,222万, 2,222.2万, 2,222, 222""" 10 | if isinstance(play_count_str,int): 11 | return play_count_str 12 | 13 | play_count_str = play_count_str.replace('次播放', '') 14 | play_count_str = play_count_str.replace('播放', '') 15 | try: 16 | if '万' in play_count_str: 17 | play_count_str = play_count_str.split('万')[0] 18 | if ',' in play_count_str: 19 | play_count_str = play_count_str.replace(',', '') 20 | play_count = int(float(play_count_str) * 1e4) 21 | return play_count 22 | elif "w" in play_count_str: 23 | play_count_str = play_count_str.split('w')[0] 24 | if ',' in play_count_str: 25 | play_count_str = play_count_str.replace(',', '') 26 | play_count = int(float(play_count_str) * 1e4) 27 | return play_count 28 | else: 29 | try: 30 | play_count = int(play_count_str) 31 | except: 32 | play_count = int(play_count_str.replace(',', '')) 33 | return play_count 34 | except: 35 | return None -------------------------------------------------------------------------------- /crawler_sys/utils/util_logging.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed Mar 20 10:07:00 2019 5 | 6 | @author: hanye 7 | """ 8 | import datetime 9 | import logging 10 | import logging.handlers 11 | from functools import wraps 12 | 13 | 14 | def logged(func): 15 | """ 16 | Decorator to log crawler task. 
17 | """ 18 | @wraps(func) 19 | def with_logging(*args, **kwargs): 20 | today_str = datetime.datetime.now().isoformat()[:10] 21 | log_folder_name = 'crawler_log' 22 | crawler_pth = '/home/hanye/crawlersNew/crawler' 23 | LOG_FN = ('crawler_task_%s_%s_log' % (func.__name__, today_str)) 24 | log_fn_abs_path = '/'.join([crawler_pth, log_folder_name, LOG_FN]) 25 | FORMAT = '[%(asctime)s][runningFunction:%(name)s][logModule:%(module)s][pid:%(process)d] %(message)s' 26 | hy_logger = logging.getLogger(func.__name__) 27 | hy_logger.setLevel(logging.INFO) 28 | formatter = logging.Formatter(fmt=FORMAT) 29 | file_handler = logging.FileHandler(filename=log_fn_abs_path) 30 | file_handler.setFormatter(formatter) 31 | file_handler.setLevel(logging.INFO) 32 | hy_logger.addHandler(file_handler) 33 | 34 | hy_logger.info('task starts') 35 | argstr = '' 36 | if args: 37 | argstr += args.__str__() 38 | if kwargs: 39 | argstr += kwargs.__str__() 40 | if argstr: 41 | hy_logger.info('args:%s' % argstr) 42 | 43 | return func(*args, **kwargs) 44 | return with_logging 45 | 46 | 47 | -------------------------------------------------------------------------------- /crawler_sys/utils/write_into_database.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Sep 30 10:45:11 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | 9 | 10 | import pymysql 11 | from crawler_sys.proxy_pool import make_up_sql 12 | 13 | 14 | def write_dic_into_database(data_dic, 15 | host='192.168.18.11', 16 | user='root', 17 | passwd='csm@1234', 18 | database_name='proxy_pool', 19 | table_name='proxy_pool'): 20 | """ 21 | write dict into mysql database 22 | """ 23 | 24 | connection = pymysql.connect(host=host, user=user, passwd=passwd, 25 | db=database_name, port=3306, 26 | cursorclass=pymysql.cursors.DictCursor) 27 | cursor = connection.cursor() 28 | 29 | write_into_sql = make_up_sql.make_up_replace_sql(table_name=table_name, input_dic=data_dic) 30 | cursor.execute(write_into_sql) 31 | connection.commit() 32 | print('write %s into database' % data_dic['ip_address']) 33 | 34 | 35 | def write_lst_into_database(data_lst, 36 | host='192.168.18.11', 37 | user='root', 38 | passwd='csm@1234', 39 | database_name='proxy_pool', 40 | table_name='proxy_pool'): 41 | """ 42 | write list into mysql database 43 | """ 44 | connection = pymysql.connect(host=host, user=user, passwd=passwd, 45 | db=database_name, port=3306, 46 | cursorclass=pymysql.cursors.DictCursor) 47 | cursor = connection.cursor() 48 | length = len(data_lst) 49 | 50 | for line in data_lst: 51 | write_into_sql = make_up_sql.make_up_replace_sql(table_name=table_name, input_dic=line) 52 | cursor.execute(write_into_sql) 53 | 54 | connection.commit() 55 | print('write %s pieces of data into database' % length) -------------------------------------------------------------------------------- /crawler_sys/utils/write_into_file.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Oct 11 09:04:54 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | 9 | import datetime 10 | import json 11 | 12 | current_date = datetime.datetime.now().isoformat()[:10] 13 | 14 | def write_str_into_file(file_path, 15 | file_name, 16 | var): 17 | with open(file_path+file_name+current_date, 'a', encoding='utf-8') as file: 18 | file.write(var) 19 | file.write('\n') 20 | 21 | def write_dic_into_file(file_path, 22 | file_name, 23 | var): 24 | with 
open(file_path+file_name+current_date, 'a', encoding='utf-8') as file: 25 | var_json = json.dumps(var) 26 | file.write(var_json) 27 | file.write('\n') 28 | -------------------------------------------------------------------------------- /crawler_sys/utils/write_lsturl_info_conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Oct 10 17:06:49 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | import os 9 | import configparser 10 | 11 | conf_file_path = os.getcwd() 12 | tencent_dic = {"platform": "腾讯视频", 13 | "channel": {"音乐": "http://v.qq.com/x/list/music", 14 | "新闻": "http://v.qq.com/x/list/news", 15 | "军事": "http://v.qq.com/x/list/military", 16 | "娱乐": "http://v.qq.com/x/list/ent", 17 | "体育": "http://v.qq.com/x/list/sports", 18 | "游戏": "http://v.qq.com/x/list/games", 19 | "搞笑": "http://v.qq.com/x/list/fun", 20 | "时尚": "http://v.qq.com/x/list/fashion", 21 | "生活": "http://v.qq.com/x/list/life", 22 | "母婴": "http://v.qq.com/x/list/baby", 23 | "汽车": "http://v.qq.com/x/list/auto", 24 | "科技": "http://v.qq.com/x/list/tech", 25 | "教育": "http://v.qq.com/x/list/education", 26 | "财经": "http://v.qq.com/x/list/finance", 27 | "房产": "http://v.qq.com/x/list/house", 28 | "旅游": "http://v.qq.com/x/list/travel", 29 | "王者荣耀": "http://v.qq.com/x/list/kings"}} 30 | 31 | target_lst_page_lst = [tencent_dic,] 32 | 33 | #initialize conf file 34 | config = configparser.ConfigParser() 35 | for platform_dic in target_lst_page_lst: 36 | config[platform_dic['platform']] = platform_dic['channel'] 37 | with open(conf_file_path + '/lst_page_conf.ini', 38 | 'w', encoding='utf-8') as configfile: 39 | config.write(configfile) 40 | -------------------------------------------------------------------------------- /crawler_sys/utils/write_releasers_into_ini_from_csv.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Oct 23 10:03:53 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | import configparser 9 | from crawler.crawler_sys.utils.trans_format import csv_to_lst_with_headline 10 | 11 | task_list = csv_to_lst_with_headline('F:/add_target_releaser/last_month/zhangminghui2.csv') 12 | 13 | releaser_dic = {} 14 | for line in task_list: 15 | releaser_dic[line['releaser']] = line['releaserUrl'] 16 | 17 | 18 | config = configparser.ConfigParser() 19 | config['haokan'] = releaser_dic 20 | 21 | with open ('key_customer.ini', 'w', encoding='utf-8') as ini: 22 | config.write(ini) 23 | 24 | 25 | 26 | #special task 27 | #for line in source_lst: 28 | # detail_lst = line['detail'] 29 | # csm_mdu = detail_lst[0]['csm_mdu'] 30 | # for detail_dic in detail_lst: 31 | # detail_dic.pop('csm_mdu') 32 | # line['csm_mdu'] = csm_mdu -------------------------------------------------------------------------------- /crawler_sys/utils/write_search_keyword_info_conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Oct 10 17:06:49 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | import os 9 | import configparser 10 | 11 | conf_file_path = os.getcwd() 12 | #initialize conf file 13 | config = configparser.ConfigParser() 14 | config['腾讯新闻'] = {'keyword': '看看新闻,看看新闻Knews,Knews'} 15 | with open('D:/python_code/crawler/crawler_sys/framework/config/search_keywords.ini', 16 | 'w', encoding='utf-8') as configfile: 17 | config.write(configfile) 18 | print(os.getcwd()) 19 | 20 | config = 
configparser.ConfigParser() 21 | config.read('D:/python_code/crawler/crawler_sys/framework/config/search_keywords.ini') -------------------------------------------------------------------------------- /dev/CCTV_live/bilibili.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # @Time : 2020/5/28 16:18 3 | # @Author : litao 4 | # -*- coding:utf-8 -*- 5 | # @Time : 2020/5/28 15:19 6 | # @Author : litao 7 | from selenium import webdriver 8 | import redis, datetime,time,json 9 | from crawler.crawler_sys.utils.trans_str_play_count_to_int import trans_play_count 10 | rds = redis.StrictRedis(host='192.168.17.60', port=6379, db=15, decode_responses=True) 11 | 12 | class CrawlerMain(object): 13 | def __init__(self): 14 | self.chrome_options = webdriver.ChromeOptions() 15 | # self.chrome_options.add_argument('--disable-gpu') 16 | self.chrome_options.add_argument("--start-maximized") 17 | self.chrome_options.add_argument("--no-sandbox") 18 | self.chrome_options.add_argument('disable-infobars') 19 | # 请根据chrome版本 下载chrome driver http://npm.taobao.org/mirrors/chromedriver/ 20 | self.driver = webdriver.Chrome(options=self.chrome_options) 21 | 22 | def login(self): 23 | self.driver.get("https://live.bilibili.com/21686237?from=search&seid=1739181021049557638") 24 | while True: 25 | now = datetime.datetime.now() 26 | res = self.driver.find_elements_by_xpath("//span[@class='action-text v-middle live-skin-normal-text dp-i-block']") 27 | if res: 28 | play_count = trans_play_count(res[0].text) 29 | print(play_count) 30 | dic = { 31 | "menber":play_count, 32 | "fetch_time":int(now.timestamp()*1e3) 33 | } 34 | rds.rpush("bilibili", json.dumps(dic)) 35 | time.sleep(200) 36 | 37 | def __exit__(self): 38 | self.driver.close() 39 | 40 | if __name__ == "__main__": 41 | test = CrawlerMain() 42 | test.login() 43 | -------------------------------------------------------------------------------- /dev/CCTV_live/toutiao.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # @Time : 2020/5/28 15:19 3 | # @Author : litao 4 | from selenium import webdriver 5 | import redis, datetime,time,json 6 | from crawler.crawler_sys.utils.trans_str_play_count_to_int import trans_play_count 7 | rds = redis.StrictRedis(host='192.168.17.60', port=6379, db=15, decode_responses=True) 8 | 9 | class CrawlerMain(object): 10 | def __init__(self): 11 | self.chrome_options = webdriver.ChromeOptions() 12 | # self.chrome_options.add_argument('--disable-gpu') 13 | self.chrome_options.add_argument("--start-maximized") 14 | self.chrome_options.add_argument("--no-sandbox") 15 | self.chrome_options.add_argument('disable-infobars') 16 | # 请根据chrome版本 下载chrome driver http://npm.taobao.org/mirrors/chromedriver/ 17 | self.driver = webdriver.Chrome(options=self.chrome_options) 18 | 19 | def login(self): 20 | self.driver.get("https://live.ixigua.com/room/6831736034540456716/") 21 | while True: 22 | now = datetime.datetime.now() 23 | res = self.driver.find_elements_by_xpath("//span[@class='action-text v-middle live-skin-normal-text dp-i-block']") 24 | if res: 25 | play_count = trans_play_count(res[0].text) 26 | print(play_count) 27 | dic = { 28 | "menber":play_count, 29 | "fetch_time":int(now.timestamp()*1e3) 30 | } 31 | rds.rpush("toutiao", json.dumps(dic)) 32 | time.sleep(200) 33 | 34 | def __exit__(self): 35 | self.driver.close() 36 | 37 | if __name__ == "__main__": 38 | test = CrawlerMain() 39 | test.login() 40 | 
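The CCTV_live watcher scripts in this folder poll a live page with Selenium and push JSON snapshots of the form {"menber": <viewer count>, "fetch_time": <epoch ms>} onto Redis lists in db 15. Below is a minimal reader sketch, not part of the original repo: it assumes the same Redis host/db and the list keys these scripts use ("bilibili", "toutiao"), and it keeps the field name "menber" exactly as the producers spell it.

# -*- coding:utf-8 -*-
# Reader sketch for the CCTV_live snapshots (assumption: same Redis host/db
# and key names as the watcher scripts; "menber" is kept as the producers write it).
import datetime
import json

import redis

rds = redis.StrictRedis(host='192.168.17.60', port=6379, db=15, decode_responses=True)


def dump_snapshots(key):
    """Print every snapshot stored under one Redis list key."""
    for raw in rds.lrange(key, 0, -1):
        snap = json.loads(raw)
        fetch_time = datetime.datetime.fromtimestamp(snap["fetch_time"] / 1e3)
        print(key, fetch_time.isoformat(), snap["menber"])


if __name__ == "__main__":
    for key in ("bilibili", "toutiao"):
        dump_snapshots(key)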
-------------------------------------------------------------------------------- /dev/CCTV_live/yangshipin.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # @Time : 2020/5/28 16:21 3 | # @Author : litao 4 | # -*- coding:utf-8 -*- 5 | # @Time : 2020/5/28 15:19 6 | # @Author : litao 7 | from selenium import webdriver 8 | import redis, datetime,time,json 9 | from crawler.crawler_sys.utils.trans_str_play_count_to_int import trans_play_count 10 | rds = redis.StrictRedis(host='192.168.17.60', port=6379, db=15, decode_responses=True) 11 | 12 | class CrawlerMain(object): 13 | def __init__(self): 14 | self.chrome_options = webdriver.ChromeOptions() 15 | # self.chrome_options.add_argument('--disable-gpu') 16 | self.chrome_options.add_argument("--start-maximized") 17 | self.chrome_options.add_argument("--no-sandbox") 18 | self.chrome_options.add_argument('disable-infobars') 19 | # 请根据chrome版本 下载chrome driver http://npm.taobao.org/mirrors/chromedriver/ 20 | self.driver = webdriver.Chrome(options=self.chrome_options) 21 | 22 | def login(self): 23 | self.driver.get("https://m.yangshipin.cn/video?type=2&vid=2004011401&pid=600036243&ptag=4_1.4.2.20898_wxf") 24 | while True: 25 | now = datetime.datetime.now() 26 | res = self.driver.find_elements_by_xpath("//span[@class='p-video-intro-person']") 27 | if res: 28 | play_count = trans_play_count(res[0].text) 29 | print(play_count) 30 | dic = { 31 | "menber":play_count, 32 | "fetch_time":int(now.timestamp()*1e3) 33 | } 34 | rds.rpush("toutiao", json.dumps(dic)) 35 | time.sleep(200) 36 | 37 | def __exit__(self): 38 | self.driver.close() 39 | 40 | if __name__ == "__main__": 41 | test = CrawlerMain() 42 | test.login() 43 | -------------------------------------------------------------------------------- /dev/iqiyi_debug/test_aiohttp.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Dec 7 11:02:17 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | import aiohttp 9 | import asyncio 10 | 11 | task_list = ['http://list.iqiyi.com/www/10/1007-------------4-8-2--1-.html', 12 | 'http://list.iqiyi.com/www/24/-------------4-27-2-iqiyi--.html', 13 | 'http://list.iqiyi.com/www/28/-------------4-12-2-iqiyi-1-.html', 14 | 'http://list.iqiyi.com/www/17/-------------4-11-2-iqiyi--.html',] 15 | 16 | 17 | 18 | 19 | async def download_page(session, url): 20 | get_page = await session.get(url) 21 | page = await get_page.text("utf-8", errors="ignore") 22 | return page 23 | 24 | async def get_list_page(loop): 25 | async with aiohttp.ClientSession() as list_page_sess: 26 | task = [loop.create_task(download_page(list_page_sess, url)) for url in task_list] 27 | done, pending = await asyncio.wait(task) 28 | result_lst = [d.result() for d in done] 29 | print(result_lst) 30 | 31 | loop = asyncio.get_event_loop() 32 | loop.run_until_complete(get_list_page(loop)) -------------------------------------------------------------------------------- /dev/iqiyi_debug/test_iqiyi_list_url.py: -------------------------------------------------------------------------------- 1 | task_list = ['http://list.iqiyi.com/www/10/1007-------------4-8-2--1-.html', 2 | 'http://list.iqiyi.com/www/24/-------------4-27-2-iqiyi--.html', 3 | 'http://list.iqiyi.com/www/28/-------------4-12-2-iqiyi-1-.html', 4 | 'http://list.iqiyi.com/www/17/-------------4-11-2-iqiyi--.html', 5 | 'http://list.iqiyi.com/www/10/1007-------------4-1-2--1-.html', 6 | 
'http://list.iqiyi.com/www/28/-------------4-13-2-iqiyi-1-.html', 7 | 'http://list.iqiyi.com/www/24/-------------4-23-2-iqiyi--.html', 8 | 'http://list.iqiyi.com/www/10/1006-------------4-27-2--1-.html', 9 | 'http://list.iqiyi.com/www/10/1007-------------4-6-2--1-.html', 10 | 'http://list.iqiyi.com/www/10/1006-------------4-4-2--1-.html', 11 | 'http://list.iqiyi.com/www/28/-------------4-14-2-iqiyi-1-.html', 12 | 'http://list.iqiyi.com/www/24/-------------4-5-2-iqiyi--.html', 13 | 'http://list.iqiyi.com/www/8/-------------4-13-2-iqiyi--.html', 14 | 'http://list.iqiyi.com/www/10/1007-------------4-30-2--1-.html', 15 | 'http://list.iqiyi.com/www/8/-------------4-8-2-iqiyi--.html', 16 | 'http://list.iqiyi.com/www/8/-------------4-16-2-iqiyi--.html', 17 | 'http://list.iqiyi.com/www/28/-------------4-9-2-iqiyi-1-.html', 18 | 'http://list.iqiyi.com/www/7/-------------4-23-2-iqiyi-1-.html', 19 | 'http://list.iqiyi.com/www/5/-------------4-10-2-iqiyi--.html', 20 | 'http://list.iqiyi.com/www/17/-------------4-4-2-iqiyi--.html', 21 | 'http://list.iqiyi.com/www/8/-------------4-4-2-iqiyi--.html', 22 | 'http://list.iqiyi.com/www/25/-------------4-3-2-iqiyi-1-.html', 23 | 'http://list.iqiyi.com/www/5/-------------4-18-2-iqiyi--.html', 24 | 'http://list.iqiyi.com/www/3/-------------4-24-2-iqiyi--.html', 25 | 'http://list.iqiyi.com/www/3/-------------4-19-2-iqiyi--.html', 26 | 'http://list.iqiyi.com/www/8/-------------4-21-2-iqiyi--.html', 27 | 'http://list.iqiyi.com/www/5/-------------4-9-2-iqiyi--.html', 28 | 'http://list.iqiyi.com/www/10/1006-------------4-28-2--1-.html', 29 | 'http://list.iqiyi.com/www/5/-------------4-27-2-iqiyi--.html', 30 | 'http://list.iqiyi.com/www/7/-------------4-26-2-iqiyi-1-.html'] 31 | 32 | -------------------------------------------------------------------------------- /dev/post_dict.txt: -------------------------------------------------------------------------------- 1 | {"param":" 2 | vPCnptKLNQAoHKbY9AUySUt3US0zNbhxzqZyO4IlZhCUZu0jMdEoQOSfkaDq 3 | bHw6e3k2kHR1VZB/FL+mJqHkZKYM812xpBR5ZUErEl0if5UxXIj51Ena6OAC 4 | LYnl+BHzISJEi0Ne7+2xXr/lUHZM4qDuJz8fd3xYXN2Xtwf32yAprr8c24GO 5 | IrLeM4e3fi9phnrfke34CMw8uYUX9wbujIFFG0Ln3pG+V2iPG3iBf654QVeF 6 | 3obzc2sOSvD1uCuP6J7jVQbLDphXxeQaXSKHqL5kdScU3QYDCnbYTMiJXq+A 7 | EMJ8/V4s+eKwRiWHvmtqcHLiTi81gQyJTQWY4D7v9RMe/RcUJDgvoFMNLBHt 8 | 4GQ+ow8QY5t6JLiVH7LsXIjhjF/gYRwn0unLBz0g4J+mGdAdh4+Go8EaJBKV 9 | 7fkFn/QaIGsyhyqfNn/fJc/btvCqgvdDeLphjU2WeFaA74DgD4ElDfNflh0T 10 | 50y0/nF8Vc5PQB1zA6R6ipuM5XkEuPLLZ4362p6T+2pm1p8XUF3GTGWteW+/ 11 | gwLoPxCOUQ6rZ4m+WIF8MQXJWht3JJRY+J5sy15017kKC5HQDxSemBbPokBD 12 | vX6UGHfqvVgZSRdC83fVp/WtcDIC36y5YPQc01XZ5dfrytfaOkjd/PWCsUnX 13 | pRkytYh+1OwPnD0j3/JsBlomf3V3STlnjdXY4qC/6isIFXya3D1fn/vrtbVF 14 | kXga30Jw4nPEnHuQCdwtwxm09ZOxy0/8RNTwov2oErg19Jcxhn6OUs0Ccm5z 15 | DxVmZtTJpmHTIeLdaYL5r2ONoYciHikj3z6NQ2DrQ3s4AVQTMWayA8V0ENto 16 | aSI="} -------------------------------------------------------------------------------- /tasks/check_high_play_count_data_source_v_qq.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Sep 5 17:52:53 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | from crawler_sys.site_crawler.crawler_v_qq import Crawler_v_qq 9 | from crawler_sys.utils.output_results import output_result 10 | from crawler_sys.utils import Metaorphosis as meta 11 | from crawler_sys.utils.output_log import output_log 12 | 13 | logging = output_log(page_category='video_page', 14 | program_info='tencent') 15 | 16 | def 
tran_input_data_to_lst(file_name, file_category='csv'): 17 | if file_category == 'csv': 18 | video_info_lst = meta.csv_to_lst_whth_headline(file_name) 19 | url_lst = [] 20 | for line in video_info_lst: 21 | try: 22 | if line['data_provider'] == 'CCR': 23 | url_lst.append(line['url']) 24 | except: 25 | pass 26 | return url_lst 27 | elif file_category == 'file': 28 | url_lst = meta.str_file_to_lst(file_name) 29 | return url_lst 30 | 31 | url_lst = tran_input_data_to_lst(file_name='R:/CCR/数据需求/短期临时需求/TX', file_category='file') 32 | 33 | crawler = Crawler_v_qq() 34 | get_video_page = crawler.video_page 35 | 36 | def get_data_source(url_lst=url_lst, 37 | output_to_file=False, 38 | filepath=None, 39 | output_to_es_raw=False, 40 | output_to_es_register=False, 41 | push_to_redis=False, 42 | output_es_index=None, 43 | output_doc_type=None): 44 | result_lst = [] 45 | for url in url_lst: 46 | video_info = get_video_page(url=url) 47 | result_lst.append(video_info) 48 | logging.info('get_data at page %s' % url) 49 | if len(result_lst) >= 100: 50 | if output_es_index is not None and output_doc_type is not None: 51 | output_result(result_lst, 52 | platform='腾讯视频', 53 | output_to_file=output_to_file, 54 | output_to_es_raw=output_to_es_raw, 55 | output_to_es_register=output_to_es_register, 56 | push_to_redis=push_to_redis, 57 | es_index=output_es_index, 58 | doc_type=output_doc_type) 59 | result_lst.clear() 60 | else: 61 | output_result(result_lst, 62 | platform='腾讯视频', 63 | output_to_file=output_to_file, 64 | output_to_es_raw=output_to_es_raw, 65 | output_to_es_register=output_to_es_register, 66 | push_to_redis=push_to_redis) 67 | result_lst.clear() 68 | if len(result_lst) != []: 69 | if output_es_index is not None and output_doc_type is not None: 70 | output_result(result_lst, 71 | platform='腾讯视频', 72 | output_to_file=output_to_file, 73 | output_to_es_raw=output_to_es_raw, 74 | output_to_es_register=output_to_es_register, 75 | push_to_redis=push_to_redis, 76 | es_index=output_es_index, 77 | doc_type=output_doc_type) 78 | result_lst.clear() 79 | else: 80 | output_result(result_lst, 81 | platform='腾讯视频', 82 | output_to_file=output_to_file, 83 | output_to_es_raw=output_to_es_raw, 84 | output_to_es_register=output_to_es_register, 85 | push_to_redis=push_to_redis) 86 | result_lst.clear() 87 | 88 | if __name__ == '__main__': 89 | get_data_source(output_to_es_raw=True, 90 | output_es_index='test2', 91 | output_doc_type='fyc') -------------------------------------------------------------------------------- /tasks/crontab/192.168.18.11/etc/crontab: -------------------------------------------------------------------------------- 1 | SHELL=/bin/bash 2 | PATH=/sbin:/bin:/usr/sbin:/usr/bin 3 | MAILTO=hanye 4 | HOME=/ 5 | 6 | # For details see man 4 crontabs 7 | 8 | # Example of job definition: 9 | # .---------------- minute (0 - 59) 10 | # | .------------- hour (0 - 23) 11 | # | | .---------- day of month (1 - 31) 12 | # | | | .------- month (1 - 12) OR jan,feb,mar,apr ... 
13 | # | | | | .---- day of week (0 - 6) (Sunday=0 or 7) OR sun,mon,tue,wed,thu,fri,sat 14 | # | | | | | 15 | # * * * * * user-name command to be executed 16 | 17 | # 1 update video data in target release index daily 18 | 0 0 * * * hanye python3 /home/hanye/crawlers/crawler_sys/framework/update_data_in_target_releasers_multi_process.py -p toutiao -n 10 -s 10 19 | 0 0 * * * hanye python3 /home/hanye/crawlers/crawler_sys/framework/update_data_in_target_releasers_multi_process.py -p 腾讯视频 -n 30 -s 10 20 | 0 0 * * * hanye python3 /home/hanye/crawlers/crawler_sys/framework/update_data_in_target_releasers_multi_process.py -p iqiyi -n 30 -s 10 21 | 0 0 * * * hanye python3 /home/hanye/crawlers/crawler_sys/framework/update_data_in_target_releasers_multi_process.py -p new_tudou -n 10 -s 10 22 | 23 | # 2 write crawled data into short-video-prodtion index daily 24 | 0 05 * * * hanye python3 /home/hanye/crawlers/tasks/update_DU_ATU_from_crawler_raw.py 25 | 26 | # 3 create redis url batch 27 | 0 18 * * * hanye python3 /home/hanye/crawlers/crawler_sys/scheduler/generate_redis_url_batch.py -p iqiyi -p 腾讯视频 -b 02 -d 30 28 | 0 10 * * * hanye python3 /home/hanye/crawlers/crawler_sys/scheduler/generate_redis_url_batch.py -p iqiyi -p 腾讯视频 -b 02 -d 30 29 | 30 | # 4 scrap redis url list 31 | 0 03 * * * hanye python3 /home/hanye/crawlers/crawler_sys/framework/scrap_redis_urls.py -p 腾讯视频 -b 02 32 | 0 03 * * * hanye python3 /home/hanye/crawlers/crawler_sys/framework/scrap_redis_urls.py -p iqiyi -b 02 33 | 0 03 * * * hanye python3 /home/hanye/crawlers/crawler_sys/framework/scrap_list_pages.py -p youku -n 30 34 | 35 | # 5 scrap list pages 36 | 0 10 * * * hanye python3 /home/hanye/crawlers/crawler_sys/framework/scrap_list_pages_multi_process.py -p iqiyi -n 30 37 | 0 10 * * * hanye python3 /home/hanye/crawlers/crawler_sys/framework/scrap_list_pages_multi_process.py -p youku -n 30 38 | 0 10 * * * hanye python3 /home/hanye/crawlers/crawler_sys/framework/scrap_list_pages_multi_process.py -p 腾讯视频 -n 30 39 | 0 7,10,15 * * * hanye python3 /home/hanye/crawlers/crawler_sys/framework/scrap_list_pages_multi_process.py -p toutiao -n 160 -s 10 40 | 41 | # update haokan target releaser 42 | 0 0,8,12,18 * * * hanye python3 /home/hanye/crawlers/crawler_sys/framework/update_data_in_target_releasers_single_thread.py -p haokan -n 20 43 | 44 | # get tencent news search page 45 | 0 0,4,8,10,14,16,20 * * * hanye python3 /home/hanye/crawlersNew/crawler/crawler_sys/framework/search_page_single_process.py -p 腾讯新闻 46 | 47 | #high frequency releasers ,crawler executes every hour, only for haokan currently 48 | 0 0,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22 * * * hanye python3 /home/hanye/crawlersNew/crawler/crawler_sys/framework/high_fre_releasers.py -p haokan 49 | 50 | #high frequency releasers. 
execute on 1 pm and 6pm 51 | 0 0,13,18 * * * hanye python3 /home/hanye/crawlersNew/crawler/crawler_sys/framework/update_data_in_target_releasers_multi_process.py -p toutiao -fre 3 -n 20 -s 15 52 | 0 0,13,18 * * * hanye python3 /home/hanye/crawlersNew/crawler/crawler_sys/framework/update_data_in_target_releasers_multi_process.py -p 腾讯视频 -fre 3 -n 20 -s 15 53 | 0 0,13,18 * * * hanye python3 /home/hanye/crawlersNew/crawler/crawler_sys/framework/update_data_in_target_releasers_multi_process.py -p new_tudou -fre 3 -n 20 -s 15 54 | -------------------------------------------------------------------------------- /tasks/download_video_page.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Dec 14 15:03:05 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | import time 9 | from multiprocessing import Process 10 | from crawler.crawler_sys.utils import connect_with_redis 11 | from crawler.crawler_sys.framework.platform_crawler_register import get_crawler 12 | 13 | 14 | """ 15 | only for platform v_qq, iqiyi, and new_tudou 16 | """ 17 | 18 | step = 'download_video_page' 19 | data_cate = 'video_url' 20 | 21 | def download_video_page(platform): 22 | crawler_initialization = get_crawler(platform) 23 | crawler = crawler_initialization() 24 | if platform == '腾讯视频': 25 | key = 'v_qq_url_dict' 26 | else: 27 | key= "%s_%s" % (platform, data_cate) 28 | while True: 29 | if connect_with_redis.length_of_set(key) > 0: 30 | # pid_num = connect_with_redis.length_of_set(key=key_set) 31 | # if pid_num < 20: 32 | # process_num = int(20-pid_num) 33 | crawler.download_video_page_async_multi_process() 34 | # else: 35 | # print("%s processes is working on %s" % (pid_num, platform)) 36 | # time.sleep(20) 37 | else: 38 | print("no %s url [dict] in redis" % platform) 39 | time.sleep(300) 40 | 41 | v_qq = Process(target=download_video_page, args=('腾讯视频',)) 42 | iqiyi = Process(target=download_video_page, args=('iqiyi',)) 43 | new_tudou = Process(target=download_video_page, args=('new_tudou',)) 44 | 45 | v_qq.start() 46 | iqiyi.start() 47 | new_tudou.start() -------------------------------------------------------------------------------- /tasks/fix_album_play_count_by_getting_releaser_page.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Nov 23 13:48:33 2018 4 | 5 | input video url, output video play_count gotten from releaser page 6 | 7 | @author: fangyucheng 8 | """ 9 | 10 | from elasticsearch import Elasticsearch 11 | from crawler.crawler_sys.utils.trans_format import lst_to_csv 12 | from crawler.crawler_sys.utils import trans_format 13 | from crawler.crawler_sys.site_crawler import crawler_v_qq 14 | 15 | absolute_file_path = r"C:\Users\zhouyujiang\安徽第一周数据情况.csv" 16 | task_list = trans_format.str_file_to_lst(absolute_file_path) 17 | 18 | result_lst = [] 19 | crawler = crawler_v_qq.Crawler_v_qq() 20 | for url in task_list: 21 | get_data = crawler.video_page(url) 22 | result_lst.append(get_data) 23 | print("get data at %s" % url) 24 | 25 | bug_releaser_list = [] 26 | releaserUrl_lst = [] 27 | revised_lst = [] 28 | for line in result_lst: 29 | try: 30 | if line['releaserUrl'] is not None: 31 | releaserUrl = line['releaserUrl'] 32 | if releaserUrl not in releaserUrl_lst: 33 | releaserUrl_lst.append(releaserUrl) 34 | try: 35 | crawler.releaser_page(releaserUrl, output_to_es_raw=True, 36 | es_index='test2', doc_type='12zjbfl', 37 | 
releaser_page_num_max=1000) 38 | print ("get releaser data at %s" % releaserUrl) 39 | except: 40 | bug_releaser_list.append(releaserUrl) 41 | else: 42 | pass 43 | else: 44 | print("this video %s can't find releaser" % line['url']) 45 | except: 46 | print("can't get releaser at %s" % url) 47 | 48 | hosts = '192.168.17.11' 49 | port = 80 50 | user_id = 'fangyucheng' 51 | password = 'VK0FkWf1fV8f' 52 | http_auth = (user_id, password) 53 | es_connection = Elasticsearch(hosts=hosts, port=port, http_auth=http_auth) 54 | 55 | unsolve_lst = [] 56 | result_lst2 =[] 57 | 58 | for line in task_list: 59 | try: 60 | if type(line) == dict: 61 | url = line['url'] 62 | elif type(line) == str: 63 | url = line 64 | search_body = {"query": {"bool": {"filter": [{"term": {"url.keyword": url}}]}}} 65 | search = es_connection.search(index="test2", doc_type="12zjbfl", body=search_body) 66 | if search["hits"]["total"] == 0: 67 | unsolve_lst.append(url) 68 | print("can not get video data at %s" % url) 69 | else: 70 | video_data = search["hits"]["hits"][0]["_source"] 71 | result_lst2.append(video_data) 72 | print("get playcount at %s" % url) 73 | except: 74 | pass 75 | 76 | lst_to_csv(listname=result_lst2, 77 | csvname=r"C:\Users\zhouyujiang\12121212121.csv") -------------------------------------------------------------------------------- /tasks/follower_num.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/tasks/follower_num.csv -------------------------------------------------------------------------------- /tasks/from_es_find_crawler_task.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # @Time : 2019/6/21 14:33 3 | # @Author : litao 4 | -------------------------------------------------------------------------------- /tasks/parse_list_page.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Dec 14 10:32:28 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | import time 9 | from multiprocessing import Process 10 | from crawler.crawler_sys.utils import connect_with_redis 11 | from crawler.crawler_sys.framework.platform_crawler_register import get_crawler 12 | 13 | 14 | """ 15 | only for platform v_qq, iqiyi and youku 16 | """ 17 | 18 | step = 'parse_list_page' 19 | data_cate = 'list_page_html' 20 | 21 | def parse_list_page(platform): 22 | crawler_initialization = get_crawler(platform) 23 | crawler = crawler_initialization() 24 | if platform == '腾讯视频': 25 | platform = 'v_qq' 26 | key_lst = "%s_%s" % (platform, data_cate) 27 | # key_set = "%s_%s" % (platform, step) 28 | while True: 29 | if connect_with_redis.length_of_lst(key=key_lst) > 0: 30 | crawler.parse_list_page_multi_process() 31 | else: 32 | print("no %s list page html in redis" % platform) 33 | time.sleep(300) 34 | 35 | v_qq = Process(target=parse_list_page, args=('腾讯视频',)) 36 | iqiyi = Process(target=parse_list_page, args=('iqiyi',)) 37 | youku = Process(target=parse_list_page, args=('youku',)) 38 | 39 | v_qq.start() 40 | iqiyi.start() 41 | youku.start() 42 | -------------------------------------------------------------------------------- /tasks/parse_video_page.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Dec 14 15:23:29 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | import time 9 | 
from multiprocessing import Process 10 | from crawler.crawler_sys.utils import connect_with_redis 11 | from crawler.crawler_sys.framework.platform_crawler_register import get_crawler 12 | 13 | """ 14 | for platform v_qq, iqiyi, and new_tudou 15 | """ 16 | 17 | kwarg_dict = {'output_to_es_raw': True, 18 | 'es_index': 'crawler-data-raw', 19 | 'doc_type': 'doc', 20 | 'output_to_es_register': True} 21 | 22 | step = 'parse_video_page' 23 | data_cate = 'video_page_html' 24 | 25 | def parse_video_page(platform, para_dic): 26 | crawler_initialization = get_crawler(platform) 27 | crawler = crawler_initialization() 28 | if platform == '腾讯视频': 29 | platform = 'v_qq' 30 | key= "%s_%s" % (platform, data_cate) 31 | while True: 32 | if connect_with_redis.length_of_lst(key) > 0: 33 | crawler.parse_video_page_multi_process(para_dic) 34 | else: 35 | print("no %s video page html in redis" % platform) 36 | time.sleep(300) 37 | 38 | v_qq = Process(target=parse_video_page, args=('腾讯视频', kwarg_dict)) 39 | iqiyi = Process(target=parse_video_page, args=('iqiyi', kwarg_dict)) 40 | new_tudou = Process(target=parse_video_page, args=('new_tudou', kwarg_dict)) 41 | 42 | v_qq.start() 43 | iqiyi.start() 44 | new_tudou.start() -------------------------------------------------------------------------------- /tasks/produce_releaser_report_from_es.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # @Time : 2019/4/24 17:51 3 | # @Author : litao 4 | # 提供账号和平台,生成数据报告 5 | # 数据维度:发布量、播放量、粉丝量、评论量、点赞量,视频url、时长、发布时间、发布账号 6 | 7 | # import time 8 | import json 9 | # import argparse 10 | import datetime 11 | from elasticsearch import Elasticsearch 12 | import pandas as pd 13 | from elasticsearch.helpers import scan 14 | from func_find_week_num import find_week_belongs_to 15 | from crawler.crawler_sys.framework.platform_crawler_register import get_crawler 16 | from crawler.crawler_sys.utils import trans_format 17 | from func_cal_doc_id import cal_doc_id 18 | 19 | hosts = '192.168.17.11' 20 | port = 80 21 | user = 'zhouyujiang' 22 | passwd = '8tM9JDN2LVxM' 23 | http_auth = (user, passwd) 24 | 25 | es = Elasticsearch(hosts=hosts, port=port, http_auth=http_auth) 26 | 27 | 28 | def func_search_reUrl_from_target_index(platform, releaser): 29 | search_body = { 30 | "query": { 31 | "bool": { 32 | "filter": [ 33 | {"term": {"platform.keyword": platform}}, 34 | {"term": {"releaser.keyword": releaser}} 35 | ] 36 | } 37 | } 38 | } 39 | search_re = es.search(index='target_releasers', doc_type='doc', body=search_body) 40 | if search_re['hits']['total'] > 0: 41 | return search_re['hits']['hits'][0]['_source']['releaserUrl'] 42 | else: 43 | print('Can not found:', platform, releaser) 44 | return None 45 | 46 | data_dic = {} 47 | miaopai_list = [] 48 | file = r'D:\work_file\无锡台内容数据需求.csv' 49 | with open(file, 'r')as f: 50 | header_Lst = f.readline().strip().split(',') 51 | for line in f: 52 | line_Lst = line.strip().split(',') 53 | line_dict = dict(zip(header_Lst, line_Lst)) 54 | releaser = line_dict['releaser'] 55 | platform = line_dict['platform'] 56 | # releaserUrl = func_search_reUrl_from_target_index(platform, releaser) 57 | releaserUrl = 1 58 | if releaserUrl != None: 59 | re_list = [] 60 | search_body = { 61 | "query": { 62 | "bool": { 63 | "filter": [ 64 | {"term": {"platform.keyword": platform}}, {"term": {"releaser.keyword": releaser}}, 65 | {"range": {"release_time": {"gte": 1546272000000, "lt": 1554048000000}}}, 66 | {"range": {"fetch_time": {"gte": 1556150400000}}} 67 | ] 68 
| } 69 | } 70 | } 71 | 72 | scan_re = scan(client=es, index='crawler-data-raw', doc_type='doc', 73 | query=search_body, scroll='3m') 74 | for one_scan in scan_re: 75 | "发布者,平台,标题,url,播放量,点赞量,评论量,时长,发布时间" 76 | data_dic[cal_doc_id(platform, url=one_scan["_source"]["url"], doc_id_type='all-time-url')]=[one_scan["_source"]["releaser"],one_scan["_source"]["platform"],one_scan["_source"]["title"],one_scan["_source"]["url"],one_scan["_source"]["play_count"],one_scan["_source"]["favorite_count"],one_scan["_source"]["comment_count"],one_scan["_source"]["duration"],datetime.datetime.fromtimestamp(one_scan["_source"]["release_time"]/1000).strftime('%Y-%m-%d %H:%M:%S')] 77 | data_lis = [] 78 | print(len(data_dic)) 79 | for d in data_dic: 80 | data_lis.append(data_dic[d]) 81 | 82 | data = pd.DataFrame(data_lis) 83 | data.to_csv('./%s.csv' % "无锡台内容数据需求2", encoding="ansi") 84 | 85 | 86 | -------------------------------------------------------------------------------- /tasks/sync_mayun_code_to_100.97.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # @Time : 2019/9/29 11:53 3 | # @Author : litao 4 | import subprocess,time 5 | 6 | f = open("/home/hanye/crawlersNew/crawler/tasks/log", "w", encoding="utf-8") 7 | while True: 8 | try: 9 | cmd0 = r'git pull origin master' 10 | cmd1 = r'git push gitLab master' 11 | cmdpath = r'/home/hanye/crawlersNew/v-scope2' 12 | process = subprocess.Popen(cmd0, shell=True, cwd=cmdpath) 13 | process.wait() 14 | result = process.returncode 15 | print(result,file=f) 16 | process = subprocess.Popen(cmd1, shell=True, cwd=cmdpath) 17 | process.wait() 18 | result = process.returncode 19 | print(result,file=f) 20 | #time.sleep(43200) 21 | time.sleep(70) 22 | except Exception as e: 23 | f.write("eroor",e) 24 | f.flush() 25 | 26 | f.close() 27 | 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /tasks/the_key_of_pid_in_redis.txt: -------------------------------------------------------------------------------- 1 | [parse_list_page] 2 | v_qq = v_qq_parse_list_page 3 | youku = youku_parse_list_page -------------------------------------------------------------------------------- /tasks/yangjingshu.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/tasks/yangjingshu.csv -------------------------------------------------------------------------------- /tasks/zhangminghui_dec.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/tasks/zhangminghui_dec.csv -------------------------------------------------------------------------------- /tasks/zhangminghui_dec2.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/tasks/zhangminghui_dec2.csv -------------------------------------------------------------------------------- /test/qq_news/remove_unrelated_parameter.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Nov 26 13:14:03 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | import time 9 | import urllib 10 | import requests 11 | 12 | domain = 'http://r.inews.qq.com/searchMore?' 
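# What follows strips one query parameter at a time and re-requests the search
# endpoint: a parameter whose removal still returns a full result page
# (len(page) > 5000) is treated as unrelated and stays removed, otherwise it is
# put back. final_url_dic at the bottom of this file is the minimal parameter
# set that survived.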
13 | #domain_simple = 'http://r.inews.qq.com/searchMore' 14 | headers = {"Host": "r.inews.qq.com", 15 | "Accept-Encoding": "gzip,deflate", 16 | "Referer": "http://inews.qq.com/inews/android/", 17 | "User-Agent": "%E8%85%BE%E8%AE%AF%E6%96%B0%E9%97%BB5410(android)", 18 | "Cookie": "lskey=;luin=;skey=;uin=; logintype=0; main_login=qq;", 19 | "Connection": "Keep-Alive"} 20 | 21 | url_dic = {'isoem':'0', 22 | 'mid': '74b9305504a047ab0a1901e2dfbf71f87f799819', 23 | 'dpi': '270', 24 | 'devid': '008796749793280', 25 | 'is_chinamobile_oem': '0', 26 | 'mac': 'mac%20unknown', 27 | 'real_device_width':'5.06', 28 | 'store': '17', 29 | 'screen_height': '1440', 30 | 'real_device_height': '9.0', 31 | 'apptype': 'android', 32 | 'origin_imei': '008796749793280', 33 | 'orig_store': '17', 34 | 'hw': 'etease_MuMu', 35 | 'appver': '23_android_5.4.10', 36 | 'uid': '54767d8bf41ac9a4', 37 | 'screen_width':'810', 38 | 'sceneid':'', 39 | 'omgid': '818b2ebf4abcec4bc1c8bf737a1c131dede60010213210', 40 | 'timeline':'1540802766', 41 | 'query':'espn', 42 | 'activefrom': 'icon', 43 | 'qqnetwork': 'wifi', 44 | 'rom_type': '', 45 | 'secId': '2', 46 | 'Cookie':'lskey=;luin=;skey=;uin=; logintype=0; main_login=qq;', 47 | 'network_type': 'wifi', 48 | 'id': '20181029A18Y3H00', 49 | 'global_info': '1|0|0|0|1|1|1|1|0|6|1|1|1|1|0|J060P000000000:B054P000011803|1402|0|0|-1|-1|0|0|0||-1|-1|0|0|1|1|0|0|-1|0|2|0|2|0|0|0|0|0|0|0|0|2|0|0|0|0', 50 | 'imsi_history':'0,460013199570862', 51 | 'omgbizid': 'a520b26ce7880445ab488481e3dd4949c74f0050213210', 52 | 'qn-rid': '40ff49fc-e6ee-4384-8a7c-9ee507d57e47', 53 | 'qn-sig': 'c022edce8ae72f053304412f13a9bb88', 54 | 'page': "2", 55 | 'type': "0", 56 | 'imsi': '460013199570862'} 57 | 58 | #sig = 'c022edce8ae72f053304412f13a9bb88' 59 | 60 | url = 'http://r.inews.qq.com/searchMore?%s' % urllib.parse.urlencode(url_dic) 61 | 62 | """ 63 | raw-url 64 | 'http://r.inews.qq.com/searchMore?isoem=0&mid=74b9305504a047ab0a1901e2dfbf71f87f799819&dpi=270&devid=008796749793280&is_chinamobile_oem=0&mac=mac%2520unknown&real_device_width=5.06&store=17&screen_height=1440&real_device_height=9.0&apptype=android&origin_imei=008796749793280&orig_store=17&hw=etease_MuMu&appver=23_android_5.4.10&uid=54767d8bf41ac9a4&screen_width=810&sceneid=&omgid=818b2ebf4abcec4bc1c8bf737a1c131dede60010213210&timeline=1540802766&query=espn&activefrom=icon&qqnetwork=wifi&rom_type=&secId=2&Cookie=lskey%3D%3Bluin%3D%3Bskey%3D%3Buin%3D%3B+logintype%3D0%3B+main_login%3Dqq%3B&network_type=wifi&id=20181029A18Y3H00&global_info=1%7C0%7C0%7C0%7C1%7C1%7C1%7C1%7C0%7C6%7C1%7C1%7C1%7C1%7C0%7CJ060P000000000%3AB054P000011803%7C1402%7C0%7C0%7C-1%7C-1%7C0%7C0%7C0%7C%7C-1%7C-1%7C0%7C0%7C1%7C1%7C0%7C0%7C-1%7C0%7C2%7C0%7C2%7C0%7C0%7C0%7C0%7C0%7C0%7C0%7C0%7C2%7C0%7C0%7C0%7C0&imsi_history=0%2C460013199570862&omgbizid=a520b26ce7880445ab488481e3dd4949c74f0050213210&qn-rid=40ff49fc-e6ee-4384-8a7c-9ee507d57e47&qn-sig=c022edce8ae72f053304412f13a9bb88&page=2&type=0&imsi=460013199570862' 65 | """ 66 | 67 | get_page = requests.get(url, headers=headers) 68 | page = get_page.text 69 | 70 | key_lst = [] 71 | para_lst = [] 72 | 73 | for key, value in url_dic.items(): 74 | key_lst.append(key) 75 | 76 | for key in key_lst: 77 | value = url_dic[key] 78 | url_dic.pop(key) 79 | url = 'http://r.inews.qq.com/searchMore?%s' % urllib.parse.urlencode(url_dic) 80 | get_page = requests.get(url, headers=headers) 81 | page = get_page.text 82 | time.sleep(10) 83 | if len(page) > 5000: 84 | print("%s can be moved from url, length of page is %s" % (key, len(page))) 85 | continue 86 | 
else: 87 | url_dic[key] = value 88 | print("key %s, value %s can't be moved from url" % (key, value)) 89 | 90 | #this is the result 91 | final_url_dic = {'devid': '008796749793280', 92 | 'appver': '23_android_5.4.10', 93 | 'query': 'espn', 94 | 'qn-rid': '3e3cb605-3a00-412d-8b2e-f81b32f5064c', 95 | 'qn-sig': '589a406e354aa3bfb6ddeaa778278ef1'} 96 | final_url = 'http://r.inews.qq.com/searchMore?%s' % urllib.parse.urlencode(final_url_dic) 97 | 98 | """ 99 | final_url 100 | 'http://r.inews.qq.com/searchMore?devid=008796749793280&appver=23_android_5.4.10&query=espn&qn-rid=40ff49fc-e6ee-4384-8a7c-9ee507d57e47&qn-sig=c022edce8ae72f053304412f13a9bb88' 101 | """ 102 | 103 | qn_sig = "01552dc75351b12d01b5310441a562e2" 104 | qn_rid = "8f14a809-db96-4762-905a-7c063dc84ac7" 105 | -------------------------------------------------------------------------------- /test/test_read_config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Dec 4 16:23:48 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | import configparser 9 | 10 | config = configparser.ConfigParser() 11 | config.read('/home/hanye/crawlerNEW/crawler/crawler_sys/framework/config/search_keywords.ini') 12 | --------------------------------------------------------------------------------
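test_read_config.py above stops right after config.read(). A short sketch, not part of the repo, of how the loaded keyword config could be walked, assuming the layout written by write_search_keyword_info_conf.py (one section per platform with a comma-separated 'keyword' entry) and the same hard-coded path the test uses:

# -*- coding: utf-8 -*-
# Sketch only: iterates the sections of search_keywords.ini. Path and layout
# are assumptions taken from the scripts in this repo, not verified here.
import configparser

config = configparser.ConfigParser()
config.read('/home/hanye/crawlerNEW/crawler/crawler_sys/framework/config/search_keywords.ini',
            encoding='utf-8')

for platform in config.sections():
    # e.g. platform '腾讯新闻' -> keywords ['看看新闻', '看看新闻Knews', 'Knews']
    keyword_str = config[platform].get('keyword', '')
    keywords = [kw for kw in keyword_str.split(',') if kw]
    print(platform, keywords)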