├── .gitignore
├── .idea
├── crawler.iml
├── misc.xml
├── modules.xml
├── vcs.xml
└── workspace.xml
├── README.md
├── crawler_log
└── log.txt
├── crawler_sys
├── __init__ .py
├── framework
│ ├── __init__.py
│ ├── config
│ │ ├── high_fre.ini
│ │ ├── key_customer.ini
│ │ ├── legal_platforms.ini
│ │ ├── list_page_urls.ini
│ │ ├── redis_key_register.ini
│ │ ├── search_keywords - 副本.ini
│ │ └── search_keywords.ini
│ ├── update_data_in_redis_multi_process_auto_task.py
│ ├── update_data_in_target_releasers_single_thread.py
│ ├── video_fields_std.py
│ ├── write_releaserUrl_to_crawler_redis.py
│ └── write_releasers_to_redis.py
├── hot_words_crawler
│ ├── __init__.py
│ ├── crawler_douyin.py
│ ├── crawler_haokan.py
│ ├── crawler_kwai.py
│ ├── crawler_main_task.py
│ ├── crawler_new_tudou.py
│ ├── crawler_qq_news.py
│ ├── crawler_toutiao.py
│ ├── crawler_v_qq.py
│ └── crawler_wangyi_news.py
├── proxy_pool
│ └── func_get_proxy_form_kuaidaili.py
├── site_crawler
│ ├── __init__.py
│ ├── chromedriver
│ ├── crawler.new_tudouAPP.py
│ ├── crawler.pepper.py
│ ├── crawler_Mango.py
│ ├── crawler_baidu.py
│ ├── crawler_baijiahao.py
│ ├── crawler_bilibili.py
│ ├── crawler_bing.py
│ ├── crawler_douyin.py
│ ├── crawler_haokan.py
│ ├── crawler_iqiyi.py
│ ├── crawler_kwai.py
│ ├── crawler_mango.py
│ ├── crawler_miaopai.py
│ ├── crawler_pear.py
│ ├── crawler_peoplevideo.py
│ ├── crawler_shenma.py
│ ├── crawler_sogou.py
│ ├── crawler_tencent_news.py
│ ├── crawler_toutiao.py
│ ├── crawler_tudou.py
│ ├── crawler_v_qq.py
│ ├── crawler_v_qq_doc.py
│ ├── crawler_wangyi_news.py
│ ├── crawler_watermelon.py
│ ├── crawler_weibo
│ │ ├── cookie_pool
│ │ ├── crawler_weibo.py
│ │ ├── ghostdriver.log
│ │ └── login.py
│ ├── crawler_ximalaya.py
│ ├── crawler_xinhua.py
│ ├── crawler_youku.py
│ ├── func_get_releaser_id.py
│ └── toutiao_get_signature.py
├── site_crawler_by_redis
│ ├── __init__.py
│ ├── chromedriver
│ ├── crawler_baijiahao.py
│ ├── crawler_douyin.py
│ ├── crawler_haokan.py
│ ├── crawler_kwai.py
│ ├── crawler_tencent_news.py
│ ├── crawler_toutiao.py
│ ├── crawler_tudou.py
│ ├── crawler_v_qq.py
│ ├── crawler_wangyi_news.py
│ └── toutiao_article.py
├── site_crawler_test
│ ├── __init__.py
│ ├── crawler_baijiahao.py
│ ├── crawler_douyin.py
│ ├── crawler_haokan.py
│ ├── crawler_kwai.py
│ ├── crawler_tencent_news.py
│ ├── crawler_toutiao.py
│ ├── crawler_tudou.py
│ ├── crawler_v_qq.py
│ └── crawler_wangyi_news.py
├── special_propose
│ ├── __init__.py
│ ├── crawler.v_qq_comment.py
│ ├── crawler.v_qq_danmu.py
│ ├── eastnews.py
│ ├── get_weibo_user_info.py
│ ├── tocsv.py
│ ├── watcheast.py
│ └── whether_user_exists.py
├── tools
│ ├── add_releasers_video_data.py
│ ├── add_releasers_video_data_multi_process.py
│ ├── add_search_keywords_to_es.py
│ ├── add_target_releasers_by_file.py
│ ├── count_releaser_publish_num_and_refresh_crawler_frequency.py
│ ├── craw_data_and_write_into_alltime_index.py
│ ├── craw_data_and_write_into_alltime_index_by_date.py
│ ├── craw_data_and_write_into_monthly_index.py
│ ├── craw_data_and_write_into_monthly_index_remove_duplication.py
│ ├── craw_data_and_write_into_weekly_index.py
│ ├── craw_data_and_write_into_weekly_index_remove_duplication.py
│ ├── crawler_url_video_info.py
│ ├── error.log
│ ├── find_high_fre_releaser.py
│ ├── match_play_count_by_video_title.py
│ ├── match_play_count_by_video_url.py
│ ├── ocr_by_aliyun.py
│ ├── ocr_by_img.py
│ ├── tudou_selenium.py
│ ├── video_num_count.rar
│ ├── video_num_count
│ │ ├── .idea
│ │ │ ├── .gitignore
│ │ │ ├── misc.xml
│ │ │ ├── modules.xml
│ │ │ ├── vcs.xml
│ │ │ └── video_num_count.iml
│ │ ├── 2019-09-10.csv
│ │ ├── count.csv
│ │ ├── count_main.py
│ │ ├── crawler
│ │ │ ├── 2019-09-10.csv
│ │ │ ├── __init__.py
│ │ │ ├── __pycache__
│ │ │ │ ├── __init__.cpython-37.pyc
│ │ │ │ ├── crawler_tencent_news.cpython-37.pyc
│ │ │ │ ├── crawler_wangyi_news.cpython-37.pyc
│ │ │ │ ├── func_get_releaser_id.cpython-37.pyc
│ │ │ │ ├── haokan_count.cpython-37.pyc
│ │ │ │ ├── qq_video_count.cpython-37.pyc
│ │ │ │ ├── toutiao_count.cpython-37.pyc
│ │ │ │ ├── trans_duration_str_to_second.cpython-37.pyc
│ │ │ │ ├── trans_strtime_to_timestamp.cpython-37.pyc
│ │ │ │ └── tudou_count.cpython-37.pyc
│ │ │ ├── crawler_tencent_news.py
│ │ │ ├── crawler_wangyi_news.py
│ │ │ ├── func_get_releaser_id.py
│ │ │ ├── haokan_count.py
│ │ │ ├── haokan_青春旅社.csv
│ │ │ ├── qq_video_count.py
│ │ │ ├── toutiao_count.py
│ │ │ ├── trans_duration_str_to_second.py
│ │ │ ├── trans_strtime_to_timestamp.py
│ │ │ ├── tudou_count.py
│ │ │ └── 首次运行.bat
│ │ └── haokan_看看新闻Knews.csv
│ ├── write_data_to_targert_doc.py
│ ├── write_feihua_json_to_es.py
│ └── 头条重复数据导出.py
└── utils
│ ├── __init__ .py
│ ├── connect_with_es.py
│ ├── connect_with_redis.py
│ ├── date_calculator.py
│ ├── delete_repeat_data.py
│ ├── extract_data_from_es.py
│ ├── fakechrome.py
│ ├── fix_album_play_count_by_releaser_page.py
│ ├── func_verification_code.py
│ ├── get_toutiao_as_cp_signature.py
│ ├── output_log.py
│ ├── output_results.py
│ ├── parse_bool_for_args.py
│ ├── releaser_url_check.py
│ ├── remove_unrelated_parameter_tudou.py
│ ├── trans_duration_str_to_second.py
│ ├── trans_format.py
│ ├── trans_str_play_count_to_int.py
│ ├── trans_strtime_to_timestamp.py
│ ├── util_logging.py
│ ├── write_into_database.py
│ ├── write_into_file.py
│ ├── write_lsturl_info_conf.py
│ ├── write_releasers_into_ini_from_csv.py
│ └── write_search_keyword_info_conf.py
├── dev
├── CCTV_live
│ ├── bilibili.py
│ ├── toutiao.py
│ └── yangshipin.py
├── Documentary_project
│ ├── bilibili.py
│ ├── iqiyi.py
│ ├── tengxun.py
│ └── youku.py
├── Movies_rank_project
│ ├── crawler_douban.py
│ └── crawler_maoyan.py
├── iqiyi_debug
│ ├── test_aiohttp.py
│ └── test_iqiyi_list_url.py
└── post_dict.txt
├── tasks
├── check_high_play_count_data_source_v_qq.py
├── crontab
│ └── 192.168.18.11
│ │ └── etc
│ │ └── crontab
├── download_video_page.py
├── fix_album_play_count_by_getting_releaser_page.py
├── follower_num.csv
├── from_es_find_crawler_task.py
├── parse_list_page.py
├── parse_video_page.py
├── produce_releaser_report_from_es.py
├── sync_mayun_code_to_100.97.py
├── the_key_of_pid_in_redis.txt
├── update_DU_ATU_from_crawler_raw.py
├── write_key_releaser_to_week_doc_weekly.py
├── yangjingshu.csv
├── zhangminghui_dec.csv
└── zhangminghui_dec2.csv
└── test
├── qq_news
└── remove_unrelated_parameter.py
└── test_read_config.py
/.gitignore:
--------------------------------------------------------------------------------
1 | /crawler_sys/framework/check_wrong_url_daily.py
2 | *.pyc
3 |
--------------------------------------------------------------------------------
/.idea/crawler.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # crawler
2 | Crawler project:
3 | It mainly crawls short-video data from Douyin, Haokan, Kuaishou, Toutiao, Tudou, NetEase News, QQ Video (Tencent Video), and similar platforms.
4 |
5 | The main crawler logic lives in
6 | ~~~
7 | crawler_sys/site_crawler_by_redis
8 | ~~~
9 |
10 | The scheduling tools are in
11 | ~~~
12 | crawler_sys/framework
13 | ~~~
14 |
--------------------------------------------------------------------------------
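For orientation: the scripts under crawler_sys/framework act as schedulers that enqueue one task per releaser page into Redis, and the crawlers under crawler_sys/site_crawler_by_redis consume those tasks. Below is a minimal sketch of what gets queued, assuming the field names and per-platform list keys used by crawler_sys/framework/write_releasers_to_redis.py later in this dump; the Redis connection here is a plain localhost placeholder, while the real project resolves a Sentinel master named 'ida_redis_master'.

~~~
import json

import redis

# Placeholder connection; the project itself goes through redis Sentinel.
rds = redis.StrictRedis(host="127.0.0.1", port=6379, db=1, decode_responses=True)

# One task per releaser page, pushed onto a list named after the platform.
task = {
    "releaserUrl": "https://haokan.baidu.com/haokan/wiseauthor?app_id=1565285080839434",
    "releaser": "看看新闻Knews",
    "platform": "haokan",
    "proxies_num": 0,   # how many proxies the crawler may use
    "date": 3,          # how far back the crawler should backtrack
}
rds.rpush(task["platform"], json.dumps(task))
~~~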
/crawler_log/log.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_log/log.txt
--------------------------------------------------------------------------------
/crawler_sys/__init__ .py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Tue June 6 18:02:43 2018
4 |
5 | @author: hanye
6 | """
7 |
8 |
--------------------------------------------------------------------------------
/crawler_sys/framework/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Tue May 15 13:59:44 2018
4 |
5 |
6 | @author: hanye
7 | """
8 |
--------------------------------------------------------------------------------
/crawler_sys/framework/config/high_fre.ini:
--------------------------------------------------------------------------------
1 | [haokan]
2 | 看看新闻knews = https://haokan.baidu.com/haokan/wiseauthor?app_id=1565285080839434
3 | 一手video = https://haokan.baidu.com/haokan/wiseauthor?app_id=1546617034936582
4 | 生活欢乐汇 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1567462926329612
5 | 消息直升机 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1589022360094300
6 | 万物杂谈 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1589022359827373
7 | 横漂一姐 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1611821585605765
8 | 吃瓜少女萌 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1611822155649253
9 | 人人视频 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1563996947927117
10 | 追剧小师妹 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1611822302181315
11 | 新娱乐萌主 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1611553722549281
12 | 探剧全能王 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1610928916930156
13 | 青春影剧场 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1611822943891552
14 | 肥仔电影圈 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1601813907799169
15 | 芒果tv = https://haokan.baidu.com/haokan/wiseauthor?app_id=1549963812551792
16 | 科技观察猿 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1567462927568986
17 | 撩剧大师 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1607132398047134
18 | 欧阳华北 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1547961620896856
19 | 澎湃新闻 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1574072008111128
20 | 娱乐不晚点 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1589022346878545
21 | 看剧小资姐 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1594361771699213
22 | 热剧宅急送 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1594353698233619
23 | 毒舌影视街 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1611195426989529
24 | 江西网络广播电视台 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1577139229517527
25 | 热剧乐翻天 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1594362729953997
26 | 全球视频大魔王 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1597149386529756
27 | 精彩剧集大放送 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1593533634618523
28 | 影视水煮鱼 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1594257498968349
29 | 财经新鲜事 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1555591483551002
30 | 酷哥撩大剧 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1593525911983865
31 | 咸鱼说片 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1594356024003023
32 | 安徽海豚播报 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1611201539330357
33 | 看剧大球球 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1593542564661281
34 | 长沙政法频道 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1552122607183011
35 | 体坛先锋报 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1589022347820284
36 | 综艺杂货铺 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1611195671591267
37 | 视频展览馆 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1611195775370668
38 | 钱江视频 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1602058994708441
39 | 动漫铲屎官 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1594353045050780
40 | 荔枝新闻 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1558731555412280
41 | 武侠超新世界 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1613833957799396
--------------------------------------------------------------------------------
/crawler_sys/framework/config/key_customer.ini:
--------------------------------------------------------------------------------
1 | [haokan]
2 | 经视大直播 = https://haokan.hao123.com/haokan/wiseauthor?app_id=1610033451422491
3 | 湖北卫视资讯站 = https://haokan.hao123.com/haokan/wiseauthor?app_id=3513
4 | 湖北经视 = https://haokan.hao123.com/haokan/wiseauthor?app_id=1573243419235544
5 | 湖北卫视长江新闻号 = https://haokan.hao123.com/haokan/wiseauthor?app_id=1588754674509461
6 | 湖北卫视非正式会谈 = https://haokan.hao123.com/haokan/wiseauthor?app_id=1609570087894225
7 | 非正式课堂 = https://haokan.hao123.com/haokan/wiseauthor?app_id=1596696077980564
8 | 大王小湖北卫视 = https://haokan.hao123.com/haokan/wiseauthor?app_id=1609563211193403
9 | 长江云 = https://haokan.hao123.com/haokan/wiseauthor?app_id=1604613511192710
10 | 调解面对面life = https://haokan.hao123.com/haokan/wiseauthor?app_id=1578861535741379
11 | 我为喜剧疯狂 = https://haokan.hao123.com/haokan/wiseauthor?app_id=1610026230407548
12 | 湖北调解现场 = https://haokan.hao123.com/haokan/wiseauthor?app_id=1609575047119026
--------------------------------------------------------------------------------
/crawler_sys/framework/config/legal_platforms.ini:
--------------------------------------------------------------------------------
1 | [legal_platforms_to_update_production]
2 | legal_platforms = 腾讯新闻,kwai,miaopai,new_tudou,toutiao,haokan,腾讯视频,网易新闻,pearvideo,央视新闻+,人民日报,看了吗,youtube,facebook,新华社,youku,iqiyi,bilibili,抖音,toutiao_microheadlines,toutiao_article
3 |
--------------------------------------------------------------------------------
/crawler_sys/framework/config/list_page_urls.ini:
--------------------------------------------------------------------------------
1 | [new_tudou]
2 | 推荐 = http://www.tudou.com/api/getfeeds?secCateId=10016&utdid=T8v9EQPOimUCAXL%2FAz0YrDOB&page_size=24
3 | 乐活 = http://www.tudou.com/api/getfeeds?secCateId=10195&utdid=XA2EFIGslWoCAWp4y3KXcZh7&page_size=24
4 | 搞笑 = http://www.tudou.com/api/getfeeds?secCateId=622736331&utdid=XA2EFIGslWoCAWp4y3KXcZh7&page_size=24
5 | 影视 = http://www.tudou.com/api/getfeeds?secCateId=622769673&utdid=XA2EFIGslWoCAWp4y3KXcZh7&page_size=24
6 | 动漫 = http://www.tudou.com/api/getfeeds?secCateId=10116&utdid=XA2EFIGslWoCAWp4y3KXcZh7&page_size=24
7 | 身边 = http://www.tudou.com/api/getfeeds?secCateId=622621940&utdid=XA2EFIGslWoCAWp4y3KXcZh7&page_size=24
8 | 综娱 = http://www.tudou.com/api/getfeeds?secCateId=10198&utdid=XA2EFIGslWoCAWp4y3KXcZh7&page_size=24
9 | 音乐 = http://www.tudou.com/api/getfeeds?secCateId=622336449&utdid=XA2EFIGslWoCAWp4y3KXcZh7&page_size=24
10 | 游戏 = http://www.tudou.com/api/getfeeds?secCateId=10051&utdid=XA2EFIGslWoCAWp4y3KXcZh7&page_size=24
11 |
12 | [腾讯视频]
13 | 音乐 = http://v.qq.com/x/list/music
14 | 新闻 = http://v.qq.com/x/list/news
15 | 军事 = http://v.qq.com/x/list/military
16 | 娱乐 = http://v.qq.com/x/list/ent
17 | 体育 = http://v.qq.com/x/list/sports
18 | 游戏 = http://v.qq.com/x/list/games
19 | 搞笑 = http://v.qq.com/x/list/fun
20 | 时尚 = http://v.qq.com/x/list/fashion
21 | 生活 = http://v.qq.com/x/list/life
22 | 母婴 = http://v.qq.com/x/list/baby
23 | 汽车 = http://v.qq.com/x/list/auto
24 | 科技 = http://v.qq.com/x/list/tech
25 | 教育 = http://v.qq.com/x/list/education
26 | 财经 = http://v.qq.com/x/list/finance
27 | 房产 = http://v.qq.com/x/list/house
28 | 旅游 = http://v.qq.com/x/list/travel
29 | 王者荣耀 = http://v.qq.com/x/list/kings
30 |
31 | [toutiao]
32 | #only one list page and don't rely on list page url
33 | list_url = toutiao
34 |
35 | [iqiyi]
36 | 纪录片 = http://list.iqiyi.com/www/3/-------------4-fangyucheng-2-iqiyi--.html
37 | 游戏 = http://list.iqiyi.com/www/8/-------------4-fangyucheng-2-iqiyi--.html
38 | 资讯 = http://list.iqiyi.com/www/25/-------------4-fangyucheng-2-iqiyi-1-.html
39 | 娱乐 = http://list.iqiyi.com/www/7/-------------4-fangyucheng-2-iqiyi-1-.html
40 | 财经 = http://list.iqiyi.com/www/24/-------------4-fangyucheng-2-iqiyi--.html
41 | 片花-电影 = http://list.iqiyi.com/www/10/1007-------------4-fangyucheng-2--1-.html
42 | 片花-电视剧 = http://list.iqiyi.com/www/10/1006-------------4-fangyucheng-2--1-.html
43 | 音乐 = http://list.iqiyi.com/www/5/-------------4-fangyucheng-2-iqiyi--.html
44 | 军事 = http://list.iqiyi.com/www/28/-------------4-fangyucheng-2-iqiyi-1-.html
45 | 教育 = http://list.iqiyi.com/www/12/-------------4-fangyucheng-2-iqiyi-1-.html
46 | 体育 = http://list.iqiyi.com/www/17/-------------4-fangyucheng-2-iqiyi--.html
47 |
48 | [youku]
49 | 全部 = http://list.youku.com/category/video/c_0_d_1_s_2_p_fangyucheng.html
50 | 音乐 = http://list.youku.com/category/show/c_95_s_5_d_1_p_fangyucheng.html
51 | 资讯 = http://list.youku.com/category/video/c_91_d_1_s_2_p_fangyucheng.html
52 | 搞笑 = http://list.youku.com/category/video/c_94_d_1_s_2_p_fangyucheng.html
53 | 生活 = http://list.youku.com/category/video/c_103_d_1_s_2_p_fangyucheng.html
54 | 汽车 = http://list.youku.com/category/video/c_104_d_1_s_2_p_fangyucheng.html
55 | 科技 = http://list.youku.com/category/video/c_105_d_1_s_2_p_fangyucheng.html
56 | 时尚 = http://list.youku.com/category/video/c_89_d_1_s_2_p_fangyucheng.html
57 | 亲子 = http://list.youku.com/category/video/c_90_d_1_s_2_p_fangyucheng.html
58 | 旅游 = http://list.youku.com/category/video/c_88_d_1_s_2_p_fangyucheng.html
59 | 微电影 = http://list.youku.com/category/video/c_171_d_1_s_2_p_2_fangyucheng.html
60 | 网剧 = http://list.youku.com/category/video/c_172_d_1_s_2_p_2_fangyucheng.html
61 | 拍客 = http://list.youku.com/category/video/c_174_d_1_s_2_p_2_fangyucheng.html
62 | 创意视频 = http://list.youku.com/category/video/c_175_d_1_s_2_p_2_fangyucheng.html
63 | 自拍 = http://list.youku.com/category/video/c_176_d_1_s_2_p_2_fangyucheng.html
64 | 广告 = http://list.youku.com/category/video/c_102_d_1_s_2_p_2_fangyucheng.html
--------------------------------------------------------------------------------
/crawler_sys/framework/config/redis_key_register.ini:
--------------------------------------------------------------------------------
1 | [v_qq]
2 | list_page_html = v_qq_list_page_html
3 |
--------------------------------------------------------------------------------
/crawler_sys/framework/config/search_keywords - 副本.ini:
--------------------------------------------------------------------------------
1 | # For every keyword, there should be an search_pages value, in the same order
2 | # There should be NO SPACE around comma, because in some cases, there will be space within a keyword,
3 | # space cannot be just stripped out.
4 |
5 | [腾讯新闻]
6 | keywords = 致敬中国英雄
7 | search_pages = 20
8 |
9 | [腾讯视频]
10 | keywords = 致敬中国英雄
11 | search_pages = 20
12 |
13 | [new_tudou]
14 | keywords = 致敬中国英雄
15 | search_pages = 20
16 |
17 | [toutiao]
18 | keywords = 致敬中国英雄
19 | search_pages = 20
20 |
21 | [youku]
22 | keywords = 致敬中国英雄
23 | search_pages = 2
24 |
25 | [pearvideo]
26 | keywords = 任正非 BBC
27 | search_pages = 2
28 |
29 | [bilibili]
30 | keywords = 任正非 BBC
31 | search_pages = 2
32 |
--------------------------------------------------------------------------------
/crawler_sys/framework/config/search_keywords.ini:
--------------------------------------------------------------------------------
1 | # For every keyword, there should be an search_pages value, in the same order
2 | # There should be NO SPACE around comma, because in some cases, there will be space within a keyword,
3 | # space cannot be just stripped out.
4 |
5 | [腾讯新闻]
6 | keywords = 2019东方卫视春晚,2019东方卫视跨年演唱会,BesTV百视通,SMG摄界,SMG阳阳STUDIO,第一财经,第一财经《财经早班车》,东方卫视,东方卫视-东方新闻,东方卫视-看东方,东方午新闻,动感101TV,话匣子,酱紫娱乐,交叉点看,究竟视频,剧说有毒,看东方,看看新闻Knews,可凡倾听,青春旅社,上海电视台新闻综合频道,新闻坊,游戏风云gamefy,最美公路,北京时间,时间财经,时间测评,时间国际视频,时间新闻,时间新闻视频,时间直播,大揭秘,大王小王湖北卫视,非正式课堂,湖北电视台《钓鱼频道》,湖北广电纪录片部,湖北经视,湖北调解面对面,经视大直播,经视好吃佬,经视乐生活,经视人家,调解现场,问新闻,笑啦,长江新闻号,长江新闻号湖北卫视,长江云,《奇兵神犬》,《我是大侦探》,风影车都,湖南电视剧频道,湖南电视台《风影车都》,湖南电影频道,湖南都市频道,湖南广播电视台茶频道,湖南经视,湖南卫视,湖南卫视芒果云,湖南卫视天天向上,湖南娱乐,幻乐之城,金鹰卡通卫视,快乐垂钓频道,芒果V直播,芒果都市,平民英雄,亲爱的客栈,亲爱的客栈第2季,我家那小子,我是未来,我想和你唱,欲望都市show,中餐厅,中餐厅第二季,江苏卫视官方帐号,江苏卫视幸福剧场,江苏新闻,江苏新闻广播,金曲捞,荔枝新闻,南京零距离,无限歌谣季,新闻眼,缘来非诚勿扰,动历史,老板联播,梨北京,梨青岛,梨视频,梨视频ING直播,梨视频微视,梨视频游戏,一手Video,澎湃视频,澎湃新闻,第一现场,深圳卫视,深圳卫视《军情直播间》,深圳卫视《正午30分》,深圳卫视军情直播间,深圳卫视正午30分,深圳卫视直播港澳台,正午30分,直播港澳台,新京报,新京报动新闻,新京报经济新闻,新京报书评周刊,1818黄金眼,2019浙江卫视领跑演唱会,FM988浙江新闻广播,奔跑吧兄弟 第4季,大冰小将,范大姐帮忙,钱江视频,熟悉的味道 第2季,喜剧总动员,喜剧总动员 第2季,小强实验室,异口同声,浙江广播电视台,浙江经视新闻,浙江台车行天下,浙江卫视,浙江卫视《新闻深一度》,浙江新闻频道,浙江之声,中国蓝TV,中国蓝新闻,中国蓝新闻蓝媒视频,看看新闻Knews,任正非 BBC
7 | search_pages = 20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,30,30,2
8 |
9 | [腾讯视频]
10 | keywords = 任正非 BBC
11 | search_pages = 2
12 |
13 | [new_tudou]
14 | keywords = 任正非 BBC
15 | search_pages = 2
16 |
17 | [toutiao]
18 | keywords = 任正非 BBC
19 | search_pages = 2
20 |
21 | [youku]
22 | keywords = 任正非 BBC
23 | search_pages = 2
24 |
25 | [pearvideo]
26 | keywords = 任正非 BBC
27 | search_pages = 2
28 |
29 | [bilibili]
30 | keywords = 任正非 BBC
31 | search_pages = 2
32 |
--------------------------------------------------------------------------------
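The comment at the top of this file means that keywords and search_pages are parallel, comma-separated lists that have to stay index-aligned. A minimal reading sketch under that assumption, using the standard-library configparser (the path is the config location shown in this dump):

~~~
import configparser

config = configparser.ConfigParser()
config.read("crawler_sys/framework/config/search_keywords.ini", encoding="utf-8")

for platform in config.sections():
    # Split on bare commas only: a keyword such as "任正非 BBC" contains a space,
    # so whitespace must not be treated as a separator or stripped away.
    keywords = config[platform]["keywords"].split(",")
    pages = [int(n) for n in config[platform]["search_pages"].split(",")]
    for keyword, search_pages in zip(keywords, pages):
        print(platform, keyword, search_pages)
~~~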
/crawler_sys/framework/update_data_in_target_releasers_single_thread.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Mon May 14 17:52:02 2018
4 |
5 | Find urls on a given releaser page, and write the first batch of data into es.
6 | Every time this program runs, two things happen:
7 | 1 All video urls on the given releaser page are fetched and put into the redis url pool,
8 | 2 All data related to 1 is fetched and stored into es.
9 |
10 | Data in es is updated each time this program runs.
11 |
12 | @author: hanye
13 | """
14 |
15 | import argparse
16 | from crawler_sys.framework.platform_crawler_register import get_crawler
17 | from crawler_sys.framework.es_target_releasers import get_releaserUrls_from_es
18 | from crawler_sys.utils.parse_bool_for_args import parse_bool_for_args
19 |
20 | parser = argparse.ArgumentParser(description='Specify a platform name.')
21 | parser.add_argument('-p', '--platform', default=[], action='append',
22 |                     help=('Pass platform names, they will be assembled in a python list.'))
23 | parser.add_argument('-n', '--max_page', default=30, type=int,
24 |                     help=('The max number of pages to scroll for each releaser url, '
25 | 'must be an int value, default to 30.'))
26 | parser.add_argument('-f', '--output_file_path', default='', type=str,
27 | help=('Specify output file path, default None.'))
28 | parser.add_argument('-r', '--push_to_redis', default='False', type=str,
29 |                     help=('Write urls to redis or not, default to False'))
30 | parser.add_argument('-w', '--output_to_es_raw', default='True', type=str,
31 | help=('Write data into es or not, default to True'))
32 | parser.add_argument('-g', '--output_to_es_register', default='True', type=str,
33 |                     help=('Write data into the es register or not, default to True'))
34 | parser.add_argument('-l', '--releasers', default=[], action='append',
35 |                     help=('Pass releaser names, they will be assembled in a python list.'))
36 | parser.add_argument('-t', '--target_index', default="target_releasers", type=str,
37 | help=('target_releasers_org or target_releasers'))
38 | args = parser.parse_args()
39 |
40 | if args.platform != []:
41 | platforms = args.platform
42 | else:
43 | platforms = [
44 | 'toutiao',
45 | '腾讯视频',
46 | 'iqiyi',
47 | 'youku',
48 | '腾讯新闻',
49 |         'haokan',
50 | 'new_tudou',
51 | "kwai"
52 | ]
53 | releaser_page_num_max = args.max_page
54 | output_f_path = args.output_file_path
55 | if output_f_path == '':
56 | output_to_file = False
57 | else:
58 | output_to_file = True
59 |
60 | push_to_redis = parse_bool_for_args(args.push_to_redis)
61 | output_to_es_raw = parse_bool_for_args(args.output_to_es_raw)
62 | output_to_es_register = parse_bool_for_args(args.output_to_es_register)
63 |
64 | releaser_Lst = args.releasers
65 |
66 | for platform in platforms:
67 | # 2 get releaserUrl list on each platform from target-releasers index
68 | if not releaser_Lst:
69 | releaserUrl_Lst = get_releaserUrls_from_es(platform=platform,target_index=args.target_index)
70 | else:
71 | releaserUrl_Lst = []
72 | for releaser in releaser_Lst:
73 | releaserUrl_Lst.extend(get_releaserUrls_from_es(platform=platform,target_index=args.target_index))
74 | if releaserUrl_Lst == []:
75 | print('Get empty releaserUrl_Lst for platform %s' % platform)
76 | continue
77 | # 3 get crawler for this platform
78 | Platform_crawler = get_crawler(platform)
79 | if Platform_crawler != None:
80 | crawler_instant = Platform_crawler()
81 | else:
82 | print('Failed to get crawler for platform %s' % platform)
83 | continue
84 | # 4 for each releaserUrl, get data on the releaser page identified by this
85 | # releaser url
86 | for releaserUrl in releaserUrl_Lst:
87 | crawler_instant.releaser_page(releaserUrl[0],
88 | output_to_file=output_to_file,
89 | filepath=output_f_path,
90 | releaser_page_num_max=releaser_page_num_max,
91 | output_to_es_raw=output_to_es_raw,
92 | output_to_es_register=output_to_es_register,
93 | push_to_redis=push_to_redis
94 | )
95 |
--------------------------------------------------------------------------------
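The loop above resolves one crawler class per platform and then walks each releaser page. The same call can be made directly; here is a minimal sketch, assuming the get_crawler registry imported by the script and assuming releaser_page's keyword arguments (as passed above) are optional. The URL is an example taken from the config files in this dump.

~~~
from crawler_sys.framework.platform_crawler_register import get_crawler

# Resolve the crawler class registered for one platform and crawl a single releaser page.
Platform_crawler = get_crawler("haokan")
if Platform_crawler is not None:
    crawler_instant = Platform_crawler()
    crawler_instant.releaser_page(
        "https://haokan.baidu.com/haokan/wiseauthor?app_id=1565285080839434",
        releaser_page_num_max=30,     # scroll at most 30 pages, the script's -n default
        output_to_es_raw=True,        # store the fetched video data in es
        output_to_es_register=True,   # register the urls in es
        push_to_redis=False,          # do not feed the redis url pool
    )
~~~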
/crawler_sys/framework/video_fields_std.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Mon Feb 26 17:57:38 2018
4 |
5 | @author: fangyucheng
6 | """
7 |
8 | class Std_fields_video:
9 | def __init__(self, data_provider=None):
10 | if data_provider==None:
11 | data_provider='BDD'
12 | self.video_data={
13 | 'platform': None,
14 | 'channel': None,
15 | # 'channel_url': None,
16 | # 'channel_subdomain': None,
17 | 'describe': None,
18 | 'title': None,
19 | 'url': None,
20 | 'duration': 0,
21 | 'releaser': None,
22 | 'play_count': None,
23 | 'favorite_count': 0,
24 | 'comment_count': 0,
25 | # 'dislike_count': None,
26 | 'repost_count': None,
27 | 'isOriginal': None,
28 | 'data_provider': data_provider,
29 | 'video_id': None,
30 |
31 | 'releaserUrl': None,
32 | 'release_time': 0,
33 | 'fetch_time': 0,
34 | }
35 |
36 | def field_type_correct(self):
37 | def none_is_allowed(field, field_type):
38 | if field not in self.video_data:
39 | return 'field "%s" is absent' % field
40 | is_correct=(isinstance(self.video_data[field], field_type)
41 | or self.video_data[field]==None)
42 | if is_correct:
43 | return True
44 | else:
45 | return 'field "%s" should be of [%s] type or None' % (field, field_type.__name__)
46 |
47 | def none_is_not_allowed(field, field_type):
48 | if field not in self.video_data:
49 | return 'FATAL: field "%s" is NOT FOUND!' % field
50 | else:
51 | is_correct=isinstance(self.video_data[field], field_type)
52 | if is_correct:
53 | return True
54 | else:
55 | return 'field "%s" should be of [%s] type' % (field, field_type.__name__)
56 |
57 | platform_chk=none_is_not_allowed('platform', str)
58 | duration_chk=none_is_not_allowed('duration', int)
59 | play_count_chk=none_is_not_allowed('play_count', int)
60 | favorite_count_chk=none_is_allowed('favorite_count', int)
61 | comment_count_chk=none_is_allowed('comment_count', int)
62 | # dislike_count_chk=none_is_allowed('dislike_count', int)
63 | repost_count_chk=none_is_allowed('repost_count', int)
64 | isOriginal_chk=none_is_allowed('isOriginal', bool)
65 | release_time_chk=none_is_allowed('release_time', int)
66 | fetch_time_chk=none_is_not_allowed('fetch_time', int)
67 |
68 | type_chk={
69 | 'platform': platform_chk,
70 | 'duration': duration_chk,
71 | 'play_count': play_count_chk,
72 | 'favorite_count': favorite_count_chk,
73 | 'comment_count': comment_count_chk,
74 | # 'dislike_count': dislike_count_chk,
75 | 'repost_count': repost_count_chk,
76 | 'isOriginal': isOriginal_chk,
77 | 'release_time': release_time_chk,
78 | 'fetch_time': fetch_time_chk,
79 | }
80 |
81 | false_time_warning=[]
82 | for field in type_chk:
83 | if type_chk[field]!=True:
84 | false_time_warning.append(type_chk[field])
85 |
86 | if false_time_warning==[]:
87 | return True
88 | else:
89 | warn_msg='WARNING: ' + '\n'.join(false_time_warning)
90 | return warn_msg
91 |
92 |
93 |
94 | if __name__=='__main__':
95 | a=Std_fields_video()
96 | a.video_data['platform']='腾讯视频'
97 | a.video_data['duration']=62
98 | a.video_data['title']='This is test video'
99 |
100 | # a.video_data={'duration': 62}
101 | print(a.field_type_correct())
102 |
--------------------------------------------------------------------------------
/crawler_sys/framework/write_releaserUrl_to_crawler_redis.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # @Time : 2019/9/11 11:48
3 | # @Author : litao
4 | import redis
5 | import json
6 | import datetime
7 | from redis.sentinel import Sentinel
8 |
9 | sentinel = Sentinel([('192.168.17.65', 26379),
10 | ('192.168.17.66', 26379),
11 | ('192.168.17.67', 26379)
12 | ], socket_timeout=0.5)
13 | # look up the master node
14 | master = sentinel.discover_master('ida_redis_master')
15 | # look up the slave nodes
16 | slave = sentinel.discover_slaves('ida_redis_master')
17 | # connect to the database
18 | rds = sentinel.master_for('ida_redis_master', socket_timeout=0.5, db=2, decode_responses=True)
19 |
20 |
21 | def write_project_to_redis(project):
22 | rds.rpush("project", project)
23 |
24 |
25 | def write_releaserUrl_to_redis(project, data,email_dic):
26 | project_name, duration = project.split("/")
27 | data_dict_for_redis = {"duration": duration, "data": json.dumps(data),"email":json.dumps(email_dic)}
28 | rds.hmset(project, data_dict_for_redis)
29 | write_project_to_redis(project)
30 |
31 |
32 | def read_csv_write_into_redis(project_name, csv_file, crawler_days,email_dic=None):
33 | try:
34 | crawler_lis = []
35 | with open(csv_file, 'r', encoding="gb18030")as f:
36 | header_Lst = f.readline().strip().split(',')
37 | for line in f:
38 | line_Lst = line.strip().split(',')
39 | line_dict = dict(zip(header_Lst, line_Lst))
40 | platform = line_dict['platform']
41 | releaserUrl = line_dict['releaserUrl']
42 | crawler_lis.append(platform+"&"+releaserUrl)
43 | write_releaserUrl_to_redis("{0}/{1}".format(project_name, str(crawler_days)), crawler_lis,email_dic=email_dic)
44 | return True
45 | except:
46 | return False
47 |
48 |
49 | def down_task():
50 | res = rds.hgetall("task_down")
51 | if res:
52 | rds.delete("task_down")
53 | return res
54 | else:
55 | return None
56 |
57 |
58 | if __name__ == "__main__":
59 |     # the email_dic argument is expected to look like this:
60 | mapping_dic = {
61 | "taskname": "127869453",
62 | "file_path": None,
63 | "data_str": None,
64 | "email_group": ["litao@csm.com.cn"],
65 | "email_msg_body_str": "任务已完成",
66 | "title_str": "任务已完成",
67 | "cc_group": [],
68 | "sender": "litao@csm.com.cn"
69 | }
70 | crawler_lis = []
71 | file = r'D:\work_file\发布者账号\一次性需求附件\大客户总台垂类及主持人增加账号.csv'
72 | read_csv_write_into_redis("task15", file, "1577808000000",email_dic=mapping_dic)
73 | print(down_task())
74 | # with open(file, 'r')as f:
75 | # header_Lst = f.readline().strip().split(',')
76 | # for line in f:
77 | # line_Lst = line.strip().split(',')
78 | # line_dict = dict(zip(header_Lst,line_Lst))
79 | # platform = line_dict['platform']
80 | # releaserUrl = line_dict['releaserUrl']
81 | # crawler_lis.append(platform+"_"+releaserUrl)
82 | # write_releaserUrl_to_redis("project_name/1564588800000/3",crawler_lis)
83 |
--------------------------------------------------------------------------------
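This producer queues one-off crawl projects: the "project_name/duration" key is pushed onto the "project" list and the releaser URLs plus the e-mail settings are stored in a hash under that same key (db=2). A minimal consumer-side sketch, assuming the same Sentinel master and the key layout written above:

~~~
import json

from redis.sentinel import Sentinel

sentinel = Sentinel([('192.168.17.65', 26379),
                     ('192.168.17.66', 26379),
                     ('192.168.17.67', 26379)], socket_timeout=0.5)
rds = sentinel.master_for('ida_redis_master', socket_timeout=0.5, db=2, decode_responses=True)

# Pop the next project off the queue, then read its payload from the hash of the same name.
project = rds.lpop("project")
if project:
    project_name, duration = project.split("/")
    payload = rds.hgetall(project)               # {"duration": ..., "data": ..., "email": ...}
    crawler_lis = json.loads(payload["data"])    # entries look like "platform&releaserUrl"
    email_dic = json.loads(payload["email"])     # notification settings set by the producer
    for item in crawler_lis:
        platform, releaserUrl = item.split("&", 1)
        print(project_name, duration, platform, releaserUrl)
~~~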
/crawler_sys/framework/write_releasers_to_redis.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # @Time : 2020/4/24 14:15
3 | # @Author : litao
4 | # -*- coding: utf-8 -*-
5 | """
6 | Created on Mon May 14 17:52:02 2018
7 |
8 | Find urls on a given releaser page, and write the first batch of data into es.
9 | Every time this program runs, two things happen:
10 | 1 All video urls on the given releaser page are fetched and put into the redis url pool,
11 | 2 All data related to 1 is fetched and stored into es.
12 |
13 | Data in es is updated each time this program runs.
14 |
15 | @author: hanye
16 | """
17 |
18 | import sys
19 | import argparse,copy
20 | from crawler.crawler_sys.framework.es_target_releasers import get_releaserUrls_from_es
21 | from crawler.crawler_sys.framework.platform_crawler_register import platform_crawler_reg
22 | import redis,json
23 |
24 |
25 | from redis.sentinel import Sentinel
26 | sentinel = Sentinel([('192.168.17.65', 26379),
27 | ('192.168.17.66', 26379),
28 | ('192.168.17.67', 26379)
29 | ],socket_timeout=0.5)
30 | # look up the master node
31 | master = sentinel.discover_master('ida_redis_master')
32 | # look up the slave nodes
33 | slave = sentinel.discover_slaves('ida_redis_master')
34 | # connect to the database
35 | rds = sentinel.master_for('ida_redis_master', socket_timeout=0.5, db=1, decode_responses=True)
36 | # rds = redis.StrictRedis(host='192.168.17.60', port=6379, db=1, decode_responses=True)
37 |
38 | parser = argparse.ArgumentParser(description='Specify a platform name.')
39 | parser.add_argument('-p', '--platform', default=[], action='append',
40 |                     help=('Pass platform names, they will be assembled in a python list.'))
41 | parser.add_argument('-pj', '--project_tags', default=[], action='append',
42 |                     help=('Pass project tags, they will be assembled in a python list.'))
43 | parser.add_argument('-n', '--max_page', default=2, type=int,
44 |                     help=('The max number of pages to scroll for each releaser url, '
45 |                          'must be an int value, default to 2.'))
46 | parser.add_argument('-fre', '--frequency', default=1, type=int,
47 | help=('choose a frequency to retrieve releaserUrl,'
48 | '1, 3 or 9 is legal number, default 1'))
49 | parser.add_argument('-proxies', '--proxies', default=0, type=int,
50 | help=('Crawler proxies_num'))
51 | parser.add_argument('-d', '--date', default=3, type=int,
52 | help=('Crawler backtracking data time'))
53 | parser.add_argument('-s', '--processes_num', default=5, type=int,
54 | help=('Processes number to be used in multiprocessing'))
55 | parser.add_argument('-article', '--article', default=0, type=int,
56 | help=('is article page'))
57 | args = parser.parse_args()
58 |
59 |
60 | if args.platform != []:
61 | platforms = args.platform
62 | else:
63 | print('platform must be input')
64 | sys.exit(0)
65 |
66 |
67 | releaser_page_num_max = args.max_page
68 | frequency = args.frequency
69 | if frequency == '':
70 | frequency = None
71 |
72 | processes_num = args.processes_num
73 | frequency = args.frequency
74 | print(frequency)
75 | if frequency == 0:
76 | frequency = None
77 |
78 |
79 | kwargs_dict = {
80 | "proxies_num": 0,
81 | "date":args.date,
82 | }
83 | if frequency:
84 | if frequency >= 3:
85 | kwargs_dict["proxies_num"] = 3
86 | if args.proxies:
87 | kwargs_dict["proxies_num"] = args.proxies
88 | is_article = args.article
89 |
90 | def write_project_to_redis(platform, data):
91 | rds.rpush(platform, data)
92 |
93 |
94 | def write_releaserUrl_to_redis(data_dic):
95 | write_project_to_redis(data_dic["platform"], json.dumps(data_dic))
96 |
97 |
98 | for platform in platforms:
99 | # 2 get releaserUrl list on each platform from target-releasers index
100 | releaserUrl_Lst = get_releaserUrls_from_es(platform=platform, frequency=frequency,target_index="target_releasers",project_tags=args.project_tags)
101 | if is_article:
102 | platform = platform + "_article"
103 | rds.hset("process_num",platform,processes_num)
104 | if releaserUrl_Lst == []:
105 |
106 | print('Get empty releaserUrl_Lst for platform %s' % platform)
107 | continue
108 | # 3 get crawler for this platform
109 | for releaserUrl,releaser in releaserUrl_Lst:
110 | push_dic = {
111 | "releaserUrl":releaserUrl,
112 | "releaser":releaser,
113 | "platform":platform,
114 | }
115 | push_dic.update(kwargs_dict)
116 | write_releaserUrl_to_redis(push_dic)
117 |
--------------------------------------------------------------------------------
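On the other side of this queue, a per-platform crawler process is expected to pop tasks from the list named after its platform and crawl each releaser page. A minimal worker-side sketch, assuming the same db=1 Sentinel master and the push_dic keys assembled above (the actual consumers live in crawler_sys/site_crawler_by_redis):

~~~
import json

from redis.sentinel import Sentinel

sentinel = Sentinel([('192.168.17.65', 26379),
                     ('192.168.17.66', 26379),
                     ('192.168.17.67', 26379)], socket_timeout=0.5)
rds = sentinel.master_for('ida_redis_master', socket_timeout=0.5, db=1, decode_responses=True)

platform = "toutiao"
process_num = rds.hget("process_num", platform)   # worker count hint written by the scheduler

while True:
    raw_task = rds.lpop(platform)
    if raw_task is None:
        break                                     # queue drained
    task = json.loads(raw_task)
    # Each task carries releaserUrl, releaser, platform, proxies_num and date,
    # exactly as assembled in push_dic above; hand it to the platform's crawler here.
    print(task["releaser"], task["releaserUrl"], task["proxies_num"], task["date"])
~~~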
/crawler_sys/hot_words_crawler/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # @Time : 2020/2/25 15:20
3 | # @Author : litao
4 |
--------------------------------------------------------------------------------
/crawler_sys/hot_words_crawler/crawler_main_task.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # @Time : 2020/3/5 15:34
3 | # @Author : litao
4 | import datetime
5 | from crawler.crawler_sys.hot_words_crawler.crawler_wangyi_news import Crawler_WangYi_News
6 | from crawler.crawler_sys.hot_words_crawler.crawler_v_qq import Crawler_v_qq
7 | from crawler.crawler_sys.hot_words_crawler.crawler_haokan import CrawlerHaoKan
8 | from crawler.crawler_sys.hot_words_crawler.crawler_douyin import Crawler_douyin
9 | from crawler.crawler_sys.hot_words_crawler.crawler_qq_news import Crawler_Qq_News
10 | from crawler.crawler_sys.hot_words_crawler.crawler_new_tudou import CrawlerNewTudou
11 | from crawler.crawler_sys.hot_words_crawler.crawler_toutiao import Crawler_toutiao
12 | from crawler.crawler_sys.hot_words_crawler.crawler_kwai import Crawler_kwai
13 |
14 | from elasticsearch import Elasticsearch
15 | from elasticsearch.helpers import scan
16 |
17 | es = Elasticsearch(hosts='192.168.17.11', port=80,
18 | http_auth=('crawler', 'XBcasfo8dgfs'))
19 |
20 | now = int(datetime.datetime.now().timestamp() * 1e3) - 86400000
21 | platform_dic = {
22 | "kwai": Crawler_kwai(),
23 | "toutiao": Crawler_toutiao(),
24 | "haokan": CrawlerHaoKan(),
25 | "抖音": Crawler_douyin(),
26 | "腾讯视频": Crawler_v_qq(),
27 | "腾讯新闻": Crawler_Qq_News(),
28 | "new_tudou": CrawlerNewTudou(),
29 | "网易新闻": Crawler_WangYi_News()
30 | }
31 |
32 | for platform in platform_dic:
33 | res = platform_dic[platform].get_hot_words()
34 | print(platform, res)
35 |
36 | search_body = {
37 | "query": {
38 | "bool": {
39 | "filter": [
40 | # {"term":{"platform.keyword":"腾讯视频"}},
41 | {"range": {"fetch_time": {"gte": now}}}
42 | ]
43 | }
44 | }
45 | }
46 | keyword_scan = scan(client=es, index='short-video-hotwords', query=search_body, doc_type="doc")
47 | res_list = []
48 | for res in keyword_scan:
49 | res_list.append(res["_source"])
50 |
51 | for res in res_list:
52 | try:
53 | platform_dic[res["platform"]].get_hot_videos(**res)
54 | except Exception as e:
55 | print(res["platform"], e)
56 | continue
57 |
--------------------------------------------------------------------------------
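The task above relies on a simple contract: every get_hot_words implementation writes hot-word documents with at least platform, title and fetch_time (a millisecond timestamp) into the short-video-hotwords index, and the scan then feeds each document back into the matching crawler as keyword arguments. A small illustration of that round trip, with a hypothetical document (the title is the sample keyword used in crawler_new_tudou.py below):

~~~
# Shape of one hot-word document, as built by the get_hot_words methods in this package.
doc = {
    "platform": "new_tudou",
    "title": "范冰冰蛋糕裙",
    "fetch_time": 1583395200000,   # example millisecond timestamp
}

# crawler_main_task passes the whole document back as keyword arguments, which is why
# every get_hot_videos signature accepts title plus **kwargs:
#     platform_dic[doc["platform"]].get_hot_videos(**doc)
~~~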
/crawler_sys/hot_words_crawler/crawler_new_tudou.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # @Time : 2020/2/25 15:23
3 | # @Author : litao
4 | import requests
5 | import json, re, datetime,urllib
6 | from crawler.crawler_sys.utils.output_results import retry_get_url
7 | from crawler.crawler_sys.utils.output_results import hot_words_output_result
8 |
9 |
10 | class CrawlerNewTudou(object):
11 | def __init__(self):
12 | self.platform = "new_tudou"
13 | self.headers = {
14 | "Cookie": "",
15 | "User-Agent": "Tudou;6.39.1;Android;5.1.1;OPPO R11",
16 | "Accept-Encoding": "gzip,deflate",
17 | "Connection": "close",
18 | "Host": "apis.tudou.com",
19 | }
20 | def get_hot_words(self):
21 | bulk_list = []
22 |
23 | url = "https://apis.tudou.com/search/v1/hot?_t_={0}&e=md5&_s_=9a4abf3a92efad0605f8e31481327014&operator=CHINA+MOBILE_46007&network=WIFI".format(
24 | int(datetime.datetime.now().timestamp()))
25 | res = retry_get_url(url,proxies=3,headers=self.headers)
26 | res_json = res.json()
27 | for title in res_json["result"]["search"]["data"]:
28 | dic = {
29 | "platform": self.platform,
30 | "title": title["keyword"],
31 | "fetch_time":int(datetime.datetime.now().timestamp()*1e3)
32 | }
33 | bulk_list.append(dic)
34 | hot_words_output_result(bulk_list)
35 | return True
36 |
37 | def get_hot_videos(self,title=None,max_page=10,**kwargs):
38 | page = 1
39 | while page <= max_page:
40 | get_dic = {
41 | "keyword": title,
42 | # "pid": "6c23a6957198fad2",
43 | # "guid": "2139ff131a8a7d9ef7d3014cc8b97010",
44 | "mac": "",
45 | "imei": "null",
46 | "ver": "6.39.1",
47 | "_t_": int(datetime.datetime.now().timestamp()),
48 | "e": "md5",
49 | # "_s_": "b905d3a9738d7d2f815687428563d8f7",
50 | "operator": "CHINA+MOBILE_46007",
51 | "network": "WIFI",
52 | "ftype": "0",
53 | "cateId": "0",
54 | "seconds": "0",
55 | "seconds_end": "0",
56 | "ob": "",
57 | "pg": str(page),
58 | "pz": "30",
59 | # "aaid": "1.58259884569785E+20",
60 | "brand": "OPPO",
61 | "btype": "OPPO+R11",
62 | "sdkver": "2",
63 | "apad": "0",
64 | # "utdid": "XkjV9GsfBysDACyQ2%2BiF8MOw",
65 | "srid": "1",
66 | "userType": "guest",
67 | }
68 | requests_res = retry_get_url("https://apis.tudou.com/search/v2/integration?%s"%urllib.parse.urlencode(get_dic),headers=self.headers,proxies=3)
69 | requests_json = requests_res.json()
70 | page += 1
71 | print(requests_json)
72 | for data in requests_json["results"]["ugc"]["data"]:
73 | print(data)
74 |
75 |
76 | if __name__ == "__main__":
77 | crawler = CrawlerNewTudou()
78 | # crawler.get_hot_words()
79 | crawler.get_hot_videos("范冰冰蛋糕裙")
--------------------------------------------------------------------------------
/crawler_sys/hot_words_crawler/crawler_wangyi_news.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # @Time : 2020/3/2 11:07
3 | # @Author : litao
4 | # -*- coding:utf-8 -*-
5 | # @Time : 2020/2/28 12:09
6 | # @Author : litao
7 |
8 |
9 | import requests
10 | import json, re, datetime, urllib
11 | from crawler.crawler_sys.utils.output_results import retry_get_url
12 | from crawler.crawler_sys.utils.output_results import hot_words_output_result, output_result
13 | from crawler.crawler_sys.utils.trans_strtime_to_timestamp import trans_strtime_to_timestamp
14 | from write_data_into_es.func_cal_doc_id import *
15 | import base64
16 | from crawler.crawler_sys.site_crawler.crawler_wangyi_news import Crawler_wangyi_news as Crawler_wy
17 | crawler_qq_video_page = Crawler_wy().video_page
18 |
19 |
20 | class Crawler_WangYi_News(object):
21 | def __init__(self):
22 | self.platform = "网易新闻"
23 | timestamp = int(datetime.datetime.now().timestamp() * 1e3)
24 | self.headers = {
25 | "data4-Sent-Millis": str(timestamp),
26 | "Add-To-Queue-Millis": str(timestamp),
27 | "User-D": "2zx5YfHmoBb72ayxYpQVUg==",
28 | "User-N": "HPcUw15+Yla9nvIP1c9vbqrHfvh/PCmpfK2DVDjsFFGhp4IV17bdU7hTwNc3Kfe3gVZiNSrnIe+bsZBFoMsbZQ==",
29 | "httpDNSIP": "101.71.145.130",
30 | "User-C": "5aS05p2h",
31 | "User-Agent": "NewsApp/32.1 Android/5.1.1 (OPPO/OPPO R11)",
32 | "X-NR-Trace-Id": "%s_355730601_866174725888628" % timestamp,
33 | "Host": "c.m.163.com",
34 | "Connection": "Keep-Alive",
35 | "Accept-Encoding": "gzip",
36 | }
37 |
38 | def get_hot_words(self):
39 | bulk_list = []
40 | url = "http://c.m.163.com/nc/search/hotWord.html"
41 | page_res = retry_get_url(url, headers=self.headers, proxies=3, timeout=5)
42 | page_json = page_res.json()
43 | for data in page_json["hotWordList"]:
44 | title = data["searchWord"]
45 | if title:
46 | dic = {
47 | "platform": self.platform,
48 | "title": title,
49 | "fetch_time": int(datetime.datetime.now().timestamp() * 1e3),
50 | }
51 | bulk_list.append(dic)
52 | hot_words_output_result(bulk_list)
53 | return True
54 |
55 |
56 | def search_page(self,title):
57 | data_list = []
58 | encodestr = base64.b64encode(title.encode('utf-8'))
59 | encodestr = str(encodestr, 'utf-8')
60 | url = "http://c.m.163.com/search/comp2/Kg%3D%3D/20/{0}.html?".format(encodestr)
61 | para = "deviceId=2zx5YfHmoBb72ayxYpQVUg%3D%3D&version=newsclient.32.1.android&channel=VDEzNDg2NDc5MDkxMDc%3D&canal=bmV3c19sZl9jcGFfMg%3D%3D&dtype=0&tabname=shipin&position=5YiX6KGo6aG26YOo&ts={0}&sign=Di3opZw%2FFIPDdgreSK4VCKlnMSpm6FPoel5LeY88RgZ48ErR02zJ6%2FKXOnxX046I&spever=FALSE&open=scheme_%E9%BB%98%E8%AE%A4&openpath=/video/VT5O1KVCO".format(str(int(datetime.datetime.now().timestamp())))
62 | res = retry_get_url(url+para, headers=self.headers, timeout=5, proxies=3)
63 | page_text = res.json()
64 | for data in page_text["doc"]["result"]:
65 | print(data)
66 | data_list.append(data)
67 | output_result(result_Lst=data_list,
68 | platform=self.platform,
69 | output_to_es_raw=True,
70 | )
71 | data_list.clear()
72 |         ## sign and ts are encrypted fields and could not be reproduced
73 |
74 | def get_hot_videos(self, max_page=10,**kwargs):
75 | pass
76 |
77 |
78 | if __name__ == "__main__":
79 | crawler = Crawler_WangYi_News()
80 | crawler.get_hot_words()
81 | crawler.search_page("患者私自出院散步")
82 | # crawler.get_hot_videos("https://v.qq.com/x/search/?q=%E6%95%99%E8%82%B2%E9%83%A8%E5%9B%9E%E5%BA%94%E6%89%A9%E5%A4%A7%E7%A1%95%E5%A3%AB%E5%92%8C%E4%B8%93%E5%8D%87%E6%9C%AC%E6%8B%9B%E7%94%9F&stag=12",channel="教育部回应扩大硕士和专升本招生")
--------------------------------------------------------------------------------
/crawler_sys/proxy_pool/func_get_proxy_form_kuaidaili.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # @Time : 2019/9/12 10:19
3 | # @Author : litao
4 |
5 | """
6 | Example of using kuaidaili private (dps) proxies.
7 | API authentication notes:
8 | Two signing methods are currently supported, "simple" and "hmacsha1"; "simple" is used by default.
9 | Every method accepts a sign_type keyword argument to change the signing method.
10 | """
11 | import redis,random
12 | import kdl,requests
13 |
14 | from redis.sentinel import Sentinel
15 |
16 | sentinel = Sentinel([('192.168.17.65', 26379),
17 | ('192.168.17.66', 26379),
18 | ('192.168.17.67', 26379)
19 | ], socket_timeout=0.5)
20 | # look up the master node
21 | master = sentinel.discover_master('ida_redis_master')
22 | # look up the slave nodes
23 | slave = sentinel.discover_slaves('ida_redis_master')
24 | # connect to the database
25 | rds = sentinel.master_for('ida_redis_master', socket_timeout=0.5, db=7, decode_responses=True)
26 | # rds = redis.StrictRedis(host='192.168.17.60', port=6378, db=7, decode_responses=True)
27 | def get_proxy_from_redis():
28 | try:
29 | one_proxy = rds.randomkey()
30 | username = "hanye"
31 | password = "i9mmu0a3"
32 |
33 | proxies = {
34 | "http": "http://%(user)s:%(pwd)s@%(ip)s/" % {'user': username, 'pwd': password, 'ip': one_proxy},
35 | "https": "http://%(user)s:%(pwd)s@%(ip)s/" % {'user': username, 'pwd': password, 'ip': one_proxy}
36 | }
37 | return proxies
38 | except Exception as e:
39 | print(e)
40 | return None
41 |
42 | def func_get_proxy_to_redis():
43 | # chance = random.random()
44 | auth = kdl.Auth("997803479675913", "er2siw6i58c61s387sqljvovoz8zybaq")
45 | client = kdl.Client(auth)
46 |
47 |     # get the order expiry time; returns a time string
48 | # expire_time = client.get_order_expire_time()
49 | # print("expire time", expire_time)
50 |
51 |     # get the ip whitelist; returns a list of ips
52 | # ip_whitelist = client.get_ip_whitelist()
53 | # print("ip whitelist", ip_whitelist)
54 |
55 |     # set the ip whitelist; the argument may be a string, list or tuple
56 |     # returns True on success, otherwise raises an exception
57 | # client.set_ip_whitelist([])
58 | # client.set_ip_whitelist("127.0.0.1, 192.168.0.139")
59 | # print(client.get_ip_whitelist())
60 | # client.set_ip_whitelist(tuple())
61 |
62 |     # lease private proxy ips; the first argument is how many to lease, the rest are keyword arguments (signature and timestamp are not required)
63 |     # see the help center for the full parameter list: "https://help.kuaidaili.com/api/getdps/"
64 |     # returns a list of ips
65 |     # note: on python2, when calling from a terminal, or from a file without "# -*- coding: utf-8 -*-",
66 |     # pass the area parameter as unicode, e.g. area=u'北京,上海'
67 | # ips = client.get_dps(1, sign_type='simple', format='json', pt=2, area='北京,上海,广东')
68 | # print("dps proxy: ", ips)
69 |
70 |
71 |     # check whether the private proxies are still valid: returns a dict of ip: true/false
72 | #ips = client.get_dps(1, sign_type='simple', format='json')
73 | # valids = client.check_dps_valid(ips)
74 | # print("valids: ", valids)
75 |
76 |     # get the remaining lease time of the private proxies: returns a dict of ip: seconds remaining
77 | ips = client.get_dps(1, format='json',dedup=1)
78 | seconds = client.get_dps_valid_time(ips)
79 | # print("seconds: ", seconds)
80 | for key in seconds:
81 | rds.set(key, key, ex=int(seconds[key]) - 3)
82 |
83 |     # get the remaining ip balance (pay-per-ip private proxy plans only)
84 | # balance = client.get_ip_balance(sign_type='hmacsha1')
85 | # print("balance: ", balance)
86 | def proxy_test(proxies):
87 | page_url = "http://dev.kdlapi.com/testproxy/"
88 | headers = {
89 |         "Accept-Encoding": "Gzip",  # use gzip compression so transfers are faster
90 | }
91 |
92 | res = requests.get(url=page_url, proxies=proxies, headers=headers)
93 |     # print(res.status_code)  # the response status code
94 | if res.status_code == 200:
95 |         print(res.content.decode('utf-8'))  # the page content
96 |
97 | def get_proxy_dic(max_proxies=None):
98 | if not max_proxies:
99 | max_proxies = 8
100 | try:
101 | res = rds.dbsize()
102 | except Exception as e:
103 | print("redis error")
104 | return None
105 | if res is None:
106 | return None
107 | if res < max_proxies:
108 | func_get_proxy_to_redis()
109 | return get_proxy_from_redis()
110 | else:
111 | return get_proxy_from_redis()
112 |
113 | def get_proxy(proxies_num=None):
114 | if proxies_num:
115 | proxies = get_proxy_dic(max_proxies=proxies_num)
116 | # print("get a IP %s" % str(proxies))
117 | return proxies
118 | else:
119 | return None
120 |
121 | if __name__ == "__main__":
122 | proxy_pool_dic = get_proxy(11)
123 | print(proxy_pool_dic)
124 | proxy_test(proxy_pool_dic)
125 | print(get_proxy_from_redis())
--------------------------------------------------------------------------------
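The pool logic above keeps each leased proxy as its own Redis key whose TTL is a few seconds shorter than the lease, so stale proxies expire on their own and get_proxy_dic tops the pool up once it drops below the threshold. A minimal sketch of consuming the pool from a crawler, assuming a crawler.crawler_sys-style import path for this module and plain requests (the test URL is the one used by proxy_test above):

~~~
import requests

from crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili import get_proxy

# proxies_num > 0 requests a proxies dict and doubles as the pool-size threshold
# below which fresh IPs are leased; get_proxy returns None when no proxy is wanted.
proxies = get_proxy(3)
res = requests.get("http://dev.kdlapi.com/testproxy/", proxies=proxies, timeout=10)
print(res.status_code)
~~~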
/crawler_sys/site_crawler/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Tue May 15 13:59:43 2018
4 |
5 | @author: hanye
6 | """
7 |
--------------------------------------------------------------------------------
/crawler_sys/site_crawler/chromedriver:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_sys/site_crawler/chromedriver
--------------------------------------------------------------------------------
/crawler_sys/site_crawler/crawler.new_tudouAPP.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Tue Apr 10 17:52:08 2018
4 |
5 | @author: fangyucheng
6 | """
7 |
8 |
9 | import datetime
10 | import json
11 | import requests
12 | from bs4 import BeautifulSoup
13 |
14 | def get_video(target):
15 | result = []
16 | count = 0
17 | while len(result) < target and count < 100:
18 | listurl = 'http://apis.tudou.com/homepage/v2/index/get_push.json?secCateId=622736331'
19 | get_page = requests.get(listurl)
20 | get_page.encoding = 'utf-8'
21 | page = get_page.text
22 | print('get one page')
23 | page = page.replace('true', 'True')
24 | page = page.replace('false', 'False')
25 | page_dic = json.loads(page)['entity']
26 | for line in page_dic:
27 | midstep = line['detail']
28 | title = midstep['base_detail']['title']
29 | playcount = midstep['video_detail']['vv_desc']
30 | releaser = midstep['user_detail']['name']
31 | releaserid = midstep['user_detail']['id']
32 | videoid = midstep['video_detail']['video_id']
33 | duration = midstep['video_detail']['duration']
34 | url = 'http://new-play.tudou.com/v/'+videoid
35 | get_page = requests.get(url)
36 | page = get_page.text
37 | soup = BeautifulSoup(page, 'html.parser')
38 | rt_step1 = soup.find('div', {'class':'td-play__videoinfo__details-box__time'})
39 | rt_step2 = rt_step1.text[:-2]
40 | release_time = int(datetime.datetime.strptime(rt_step2,
41 | '%Y-%m-%d %H:%M:%S').timestamp()*1e3)
42 | D0 = {"title":title, "releaser":releaser, "release_time":release_time,
43 | "duration":duration, 'releaserid':releaserid, 'playcount':playcount}
44 | if D0 not in result:
45 | result.append(D0)
46 | print('added one video')
47 | else:
48 | count += 1
49 | print('repetition')
50 | return result
51 |
52 |
53 |
54 | if __name__=='__main__':
55 | try1 = get_video(target=200)
56 | #{'旅行':'http://apis.tudou.com/homepage/v2/index/get_push.json?secCateId=10293',
57 | #'科技':'http://apis.tudou.com/homepage/v2/index/get_push.json?secCateId=10199',
58 | #'娱乐':'http://apis.tudou.com/homepage/v2/index/get_push.json?secCateId=622726317',
59 | #'萌物':'http://apis.tudou.com/homepage/v2/index/get_push.json?secCateId=622485153'}
60 |
--------------------------------------------------------------------------------
/crawler_sys/site_crawler/crawler.pepper.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Tue Apr 10 11:06:53 2018
4 |
5 | @author: fangyucheng
6 | """
7 |
8 | import requests
9 | import datetime
10 |
11 |
12 |
13 | class crawler_pepper():
14 |
15 | def except_followers(self):
16 | rank=[]
17 | urls=['http://webh.huajiao.com/rank/recv?&type=day','http://webh.huajiao.com/rank/recv?&type=week','http://webh.huajiao.com/rank/recv?&type=all','http://webh.huajiao.com/rank/sun?&type=day','http://webh.huajiao.com/rank/sun?&type=week','http://webh.huajiao.com/rank/sun?&type=all','http://webh.huajiao.com/rank/user?&type=day','http://webh.huajiao.com/rank/user?&type=all','http://webh.huajiao.com/rank/send?&type=day','http://webh.huajiao.com/rank/send?&type=week','http://webh.huajiao.com/rank/send?&type=all']
18 | for url in urls:
19 | get_page=requests.get(url)
20 | get_page.encoding='utf-8'
21 | page = get_page.text
22 | print(url)
23 | page_dic = eval(page)['data']['rank']
24 | one_rank=[]
25 | for one_video in page_dic:
26 | category=url[29:33]
27 | timespan=url[-3:]
28 | name=one_video['nickname']
29 | authorlevel=one_video['authorlevel']
30 | level=one_video['level']
31 | try:
32 | score=one_video['score']
33 | except KeyError:
34 | score=None
35 | uid=one_video['uid']
36 | currenttime=datetime.datetime.timestamp(datetime.datetime.now())*1e3
37 | D0={'name':name,'level':level,'authorlevel':authorlevel,'score':score,'uid':uid,'category':category,'timespan':timespan,'acttime':currenttime}
38 | one_rank.append(D0)
39 | rank.append(one_rank)
40 | return rank
41 |
42 | def get_followers(self):
43 | rank=[]
44 | urls=['http://webh.huajiao.com/rank/followers?&type=day','http://webh.huajiao.com/rank/followers?&type=all']
45 | for url in urls:
46 | get_page=requests.get(url)
47 | get_page.encoding='utf-8'
48 | page = get_page.text
49 | print(url)
50 | midstep = eval(page)
51 | page_dic = midstep['data']['rank']
52 | one_rank=[]
53 | for one_video in page_dic:
54 | category='followers'
55 | timespan=url[-3:]
56 | name=one_video['nickname']
57 | authorlevel=one_video['authorlevel']
58 | level=one_video['level']
59 | followers=one_video['followers']
60 | uid=one_video['uid']
61 | currenttime=datetime.datetime.timestamp(datetime.datetime.now())*1e3
62 | D0={'name':name,'level':level,'authorlevel':authorlevel,'followers':followers,'uid':uid,'category':category,'timespan':timespan,'acttime':currenttime}
63 | one_rank.append(D0)
64 | rank.append(one_rank)
65 | return rank
66 |
67 |
68 | if __name__=='__main__':
69 | ttt =crawler_pepper()
70 | followers=ttt.get_followers()
71 | others=ttt.except_followers()
--------------------------------------------------------------------------------
/crawler_sys/site_crawler/crawler_baidu.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Tue Aug 21 16:58:37 2018
4 |
5 | @author: fangyucheng
6 | """
7 |
8 | import time
9 | import requests
10 | from bs4 import BeautifulSoup
11 | from crawler.crawler_sys.utils.trans_strtime_to_timestamp import trans_strtime_to_timestamp
12 |
13 |
14 | headers = {'Host': 'www.baidu.com',
15 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0',
16 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
17 | 'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
18 | 'Accept-Encoding': 'gzip, deflate, br',
19 | 'Cookie': 'BAIDUID=5EBFCC8E193341115A4A3C71960B63E7:FG=1; BIDUPSID=BD339F6B0442001D2528C4BFBCE098DB; PSTM=1500974423; BDUSS=RCY0lFRmJ4MDlMMU5xfkp4NWU3bUlTckJOZU03ZTB4UHdJbUpUeWlVZmhlT3haSVFBQUFBJCQAAAAAAAAAAAEAAABM1D8MZmFuZzExMDExNAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAOHrxFnh68RZc3; MCITY=-131%3A; BD_UPN=1352; H_PS_PSSID=; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; delPer=1; BD_CK_SAM=1; PSINO=2; BDRCVFR[gltLrB7qNCt]=mk3SLVN4HKm; pgv_pvi=525595648; pgv_si=s2288931840; Hm_lvt_9f14aaa038bbba8b12ec2a4a3e51d254=1534841172; Hm_lpvt_9f14aaa038bbba8b12ec2a4a3e51d254=1534841172; BD_HOME=1; sug=3; sugstore=0; ORIGIN=0; bdime=21110; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; BDSVRTM=225; BDRCVFR[C0p6oIjvx-c]=I67x6TjHwwYf0; BAIDUPH=tn=§rn=§ct=0',
20 | 'Connection': 'keep-alive',
21 | 'Upgrade-Insecure-Requests': '1',
22 | 'Cache-Control': 'max-age=0'}
23 |
24 |
25 | def baidu_info_page(keyword, max_page_num):
26 | result_lst = []
27 | for page_num in range(0, max_page_num):
28 | search_url = ('https://www.baidu.com/s?tn=news&rtt=4&bsst=1&cl=2&wd='+keyword+
29 | '&x_bfe_rqs=03E80&tngroupname=organic_news&pn='+str(page_num*10))
30 | get_page = requests.get(search_url, headers=headers)
31 | get_page.encoding = 'utf-8'
32 | page = get_page.text
33 | soup = BeautifulSoup(page, 'html.parser')
34 | info_lst = soup.find_all('div', {'class': 'result'})
35 | print_page_num = page_num+1
36 | for line in info_lst:
37 | title = line.h3.a.text
38 | title = title.replace('\n', '')
39 | url = line.h3.a['href']
40 | source_and_release_time = line.find('p', {'class': 'c-author'}).text
41 | source_and_release_time_lst = source_and_release_time.split('\xa0')
42 | source = source_and_release_time_lst[0]
43 | release_time_str = source_and_release_time_lst[-1]
44 | release_time = trans_strtime_to_timestamp(release_time_str)
45 | midstep_content = line.find('div', {'class': 'c-summary'}).text
46 | content = midstep_content.replace(source, '').replace(' ', '')
47 | content = content.replace('\xa0', '')
48 | source = source.replace('\n', '').replace('\t', '')
49 | content = content.replace('\n', '').replace('\t', '')
50 | whole_page = line.find('a', {'class': 'c-cache'})['href']
51 | fast_open_whole_page = whole_page + '&fast=y'
52 | get_whole_page = requests.get(fast_open_whole_page, headers=headers)
53 | get_whole_page.encoding = 'gb18030'
54 | whole_page_html = get_whole_page.text
55 | fetch_time = int(time.time()*1000)
56 | info_dic = {'title': title,
57 | 'url': url,
58 | 'source': source,
59 | 'content': content,
60 | 'release_time': release_time,
61 | 'keyword': keyword,
62 | 'whole_page_html': whole_page_html,
63 | 'fetch_time': fetch_time}
64 | result_lst.append(info_dic)
65 | print('get data at page %s' % print_page_num)
66 | return result_lst
67 |
68 | if __name__ == '__main__':
69 | keyword = '中超'
70 | test_data = baidu_info_page(keyword, max_page_num=10)
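baidu_info_page only returns the collected list. A minimal persistence sketch (not part of the original module; it assumes JSON-lines output is acceptable and uses only the standard json module):

    import json

    def dump_results(result_lst, path='baidu_info_results.json'):
        # Append one JSON object per line so runs can be concatenated safely.
        with open(path, 'a', encoding='utf-8') as f:
            for info_dic in result_lst:
                f.write(json.dumps(info_dic, ensure_ascii=False) + '\n')

    # dump_results(baidu_info_page('中超', max_page_num=10))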
--------------------------------------------------------------------------------
/crawler_sys/site_crawler/crawler_bing.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Wed Aug 22 09:30:20 2018
4 |
5 | @author: fangyucheng
6 | """
7 |
8 |
9 | import time
10 | import requests
11 | from bs4 import BeautifulSoup
12 | from crawler_sys.utils.trans_strtime_to_timestamp import trans_strtime_to_timestamp
13 |
14 |
15 | def bing_page(keyword, max_page_num):
16 | result_lst = []
17 | for page_num in range(0, max_page_num):
18 | search_url = ('https://cn.bing.com/search?q=' + keyword + '&pc=MOZI&first='
19 | + str(page_num*10) + '&FORM=PERE1')  # paginate with the loop variable (the original reused max_page_num and fetched the same page repeatedly)
20 | get_page = requests.get(search_url)
21 | get_page.encoding = 'utf-8'
22 | page = get_page.text
23 | soup = BeautifulSoup(page, 'html.parser')
24 | info_lst = soup.find_all('li', {'class': 'b_algo'})
25 | print_page_num = page_num+1
26 | for line in info_lst:
27 | title = line.h2.a.text
28 | title = title.replace('\n', '')
29 | url = line.h2.a['href']
30 | release_time_and_content = line.find('p').text
31 | release_time_and_content_lst = release_time_and_content.split('\u2002·\u2002')
32 | content = release_time_and_content_lst[-1]
33 | release_time_str = release_time_and_content_lst[0]
34 | release_time = trans_strtime_to_timestamp(release_time_str)
35 | get_whole_page_str = line.find('div', {'class': 'b_attribution'})['u']
36 | get_whole_page_lst = get_whole_page_str.split('|')
37 | d_number = get_whole_page_lst[2]
38 | w_number = get_whole_page_lst[3]
39 | get_whole_page_url = ('http://cncc.bingj.com/cache.aspx?q=' + keyword +
40 | '&d=' + d_number + '&mkt=zh-CN&setlang=zh-CN&w='
41 | + w_number)
42 | get_whole_page = requests.get(get_whole_page_url)
43 | whole_page_html = get_whole_page.text
44 | fetch_time = int(time.time()*1000)
45 | info_dic = {'title': title,
46 | 'url': url,
47 | 'content': content,
48 | 'release_time': release_time,
49 | 'keyword': keyword,
50 | 'whole_page_html': whole_page_html,
51 | 'fetch_time': fetch_time}
52 | result_lst.append(info_dic)
53 | print('get data at page %s' % print_page_num)
54 | return result_lst
55 |
56 |
57 | if __name__ == '__main__':
58 | keyword = '中超'
59 | test_data = bing_page(keyword, max_page_num=10)
--------------------------------------------------------------------------------
/crawler_sys/site_crawler/crawler_miaopai.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Fri Sep 28 10:28:45 2018
4 |
5 | @author: fangyucheng
6 | """
7 |
8 |
9 | import urllib
10 | import requests
11 | import json
12 | from bs4 import BeautifulSoup
13 | from crawler_sys.utils.output_results import retry_get_url
14 | from crawler_sys.utils.trans_str_play_count_to_int import trans_play_count
15 | from crawler.crawler_sys.utils.util_logging import logged
16 | try:
17 | from .func_get_releaser_id import *
18 | except:
19 | from func_get_releaser_id import *
20 |
21 | class Crawler_miaopai():
22 | platform = 'miaopai'  # assumed platform key: get_releaser_id below reads self.platform, which the original never set
23 | def get_releaser_follower_num(self, releaserUrl):
24 | if "www.yixia.com" in releaserUrl:
25 | get_page = retry_get_url(releaserUrl)
26 | get_page.encoding = 'utf-8'
27 | page = get_page.text
28 | soup = BeautifulSoup(page, 'html.parser')
29 | try:
30 | midstep_1 = soup.find('ul', {'class': 'bottomInfor'})
31 | midstep_2 = midstep_1.find_all('li')
32 | for line in midstep_2:
33 | line_text = line.text
34 | if '粉丝' in line_text:
35 | follower_str = line_text.replace('粉丝', '')
36 | follower_num = trans_play_count(follower_str)
37 | print('%s follower number is %s' % (releaserUrl, follower_num))
38 | return follower_num
39 | except:
40 | print("can't get followers")
41 | elif "n.miaopai.com" in releaserUrl:
42 | try:
43 | split_url = releaserUrl.split("personal/")
44 | suid = split_url[-1].replace('.html', '').replace('.htm', '').replace('htm', '')  # strip '.html' before '.htm' so the longer suffix is removed intact
45 | url = "https://n.miaopai.com/api/aj_user/space.json?suid=%s" % suid
46 | get_page = urllib.request.urlopen(url)
47 | page_bytes = get_page.read()
48 | page_str = page_bytes.decode("utf-8")
49 | page_dic = json.loads(page_str)
50 | follower_num = page_dic['data']['followers_count']
51 | return follower_num
52 | except:
53 | print("can't get followers")
54 |
55 |
56 | def get_releaser_id(self, releaserUrl):
57 | return get_releaser_id(platform=self.platform, releaserUrl=releaserUrl)
58 |
59 |
60 | #encoding method
61 | @logged
62 | def releaser_page(self, releaserUrl, releaser_page_num_max=30):
63 | headers = {'Host': 'n.miaopai.com',
64 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0',
65 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
66 | 'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
67 | 'Accept-Encoding': 'gzip, deflate, br',
68 | 'Connection': 'keep-alive',
69 | 'Cookie': 'aliyungf_tc=AQAAAIVvfVl0CgQAysVBfBViNUJYGG5C; Hm_lvt_e8fa5926bca558076246d7fb7ca12071=1545124849; Hm_lpvt_e8fa5926bca558076246d7fb7ca12071=1545124849',
70 | 'Upgrade-Insecure-Requests': '1',
71 | 'Cache-Control': 'max-age=0'}
72 | releaser_id = self.get_releaser_id(releaserUrl)
73 | page_num = 1
74 | while page_num <= releaser_page_num_max:
75 | url = ('https://n.miaopai.com/api/aj_user/medias.json?suid=%s&page=%s'
76 | % (releaser_id, page_num))
77 | get_page = requests.get(url, headers=headers)
78 | get_page.encoding = 'utf-8'
79 | page = get_page.text
80 | page_dic = get_page.json()  # NOTE: the original file never parses page_dic any further
81 | page_num += 1  # advance the counter so the while loop can terminate (missing in the original)
82 | if __name__ == "__main__":
83 | releaserUrl = 'http://n.miaopai.com/personal/h~NjA~vSfoYLz1pchtm'
84 | test = Crawler_miaopai()
85 | p = test.get_releaser_follower_num(releaserUrl)
86 |
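releaser_page above stops right after decoding the JSON and never parses page_dic. A sketch of a parser that could be plugged in at that point; the 'data'/'medias' layout and the inner field names are assumptions about the medias.json payload, not verified against the live API:

    def parse_miaopai_medias(page_dic):
        # Sketch only: field names below are assumptions, kept alongside the raw item.
        parsed = []
        for media in page_dic.get('data', {}).get('medias', []):
            parsed.append({'raw': media,
                           'suid': media.get('suid'),
                           'description': media.get('description')})
        return parsed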
--------------------------------------------------------------------------------
/crawler_sys/site_crawler/crawler_peoplevideo.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Mon Mar 19 10:32:52 2018
4 |
5 | @author: fangyucheng
6 | """
7 |
8 |
9 | import requests
10 | from bs4 import BeautifulSoup
11 | import json
12 |
13 |
14 | # Two Sessions: lh/_cl/0/30/
15 | # Interviews: ft/_cl/6/30/
16 | # News: zx/_cl/29/20/
17 | # Viewpoint: sd/_cl/29/20/
18 | # Global: qq/_cl/7/30/
19 |
20 | class people_video():
21 |
22 | def video_page(self,url):
23 | get_page=requests.get(url)
24 | get_page.encoding='utf-8'
25 | page = get_page.text
26 | midstep1=json.loads(page)
27 | midstep2=midstep1['data']['article']
28 | midstep3=midstep2['publish']
29 | title=midstep2['title']
30 | author=midstep2['author']
31 | release_time=midstep2['publishTime']
32 | playcount=midstep2['playNum']
33 | dura=midstep2['duration']
34 | duration_str=dura
35 | dl=duration_str.split(':')
36 | dl_int=[]
37 | for v in dl:
38 | v=int(v)
39 | dl_int.append(v)
40 | if len(dl_int) == 2:
41 | duration=dl_int[0]*60+dl_int[1]
42 | else:
43 | duration=dl_int[0]*3600+dl_int[1]*60+dl_int[2]  # 3600 seconds per hour (the original multiplied by 3660)
44 | releaser=midstep3['name']
45 | D0={'title':title,'playcount':playcount,'releaser':releaser,'release_time':release_time,'duration':duration,'author':author,'url':url}
46 | return D0
47 |
48 | def list_page(self,partofurl,totalpage):
49 | urls=['http://mobilevideo.people.com.cn/movie_pub/News/publishfile/'+partofurl+'list_{}.json'.format(str(i)) for i in range(1,totalpage)]
50 | list_page=[]
51 | for url in urls:
52 | get_page=requests.get(url)
53 | get_page.encoding='utf-8'
54 | page=get_page.text
55 | soup=BeautifulSoup(page,'html.parser')
56 | try:
57 | selection=soup.html.head.title.text
58 | print ('no more page')
59 | except AttributeError:
60 | print(url)
61 | midstep1=json.loads(page)
62 | midstep2=midstep1['data']['newsList']
63 | for one_line in midstep2:
64 | url=one_line['articleLink']
65 | one_video_dic=self.video_page(url)
66 | list_page.append(one_video_dic)
67 | return list_page
68 |
69 |
70 |
71 | if __name__=='__main__':
72 | people_crawler = people_video()
73 | #video_page=people_video.video_page(url='http://mobilevideo.people.com.cn/movie_pub/News/publishfile/spk/_cd/10/18/4154954.json')
74 | list_page2=people_crawler.list_page(partofurl="qq/_cl/7/30/",totalpage=20)
75 | #search_page=iqiyi_crawler.search_page(keyword="国家相册")
76 |
--------------------------------------------------------------------------------
/crawler_sys/site_crawler/crawler_sogou.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Wed Aug 22 09:30:20 2018
4 |
5 | @author: fangyucheng
6 | """
7 |
8 |
9 | import time
10 | import requests
11 | from bs4 import BeautifulSoup
12 | from crawler.crawler_sys.utils.trans_strtime_to_timestamp import trans_strtime_to_timestamp
13 |
14 |
15 | cookie = ('YYID=2FFBDAA6D4FBA37438F4067C8123E98B; IMEVER=8.5.0.1322;'
16 | 'SUID=3D03FF723865860A59795A5F000BB71F;'
17 | 'SUV=00C039A172FF033D5993ADBD770E7410; usid=lF0F7il0yWbXF5c9;'
18 | 'IPLOC=CN1100; sct=11; SMYUV=1512954490386200;'
19 | 'ad=19fxxkllll2zKxvnlllllVHr6$UllllltsDRlyllll9llllljgDll5@@@@@@@@@@;'
20 | 'SNUID=D0DE5A671A1E68C31FB628911B8277A5; wuid=AAGPcSphIAAAAAqLE2OSTQgAGwY=;'
21 | 'UM_distinctid=16449b02797449-0c5d9293f4a833-143f7040-1fa400-16449b02799881;'
22 | 'CXID=794EC592A14CE76F5DF3F3A3BDDDD787;'
23 | 'ld=Kyllllllll2bWX10QTIdJOHDsvSbWX1uK94Vhkllll9lllllVklll5@@@@@@@@@@;'
24 | 'cd=1534754086&17502a3f56c02f72dfd43a17cbb19663;'
25 | 'rd=Vyllllllll2bBEqoQLWCNCHfKv2bWX1uzX0atkllllwllllRVllll5@@@@@@@@@@;'
26 | 'LSTMV=173%2C72; LCLKINT=1570')
27 |
28 |
29 | headers = {'Host': 'news.sogou.com',
30 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0',
31 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
32 | 'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
33 | 'Accept-Encoding': 'gzip, deflate',
34 | 'Cookie': cookie,
35 | 'Connection': 'keep-alive',
36 | 'Upgrade-Insecure-Requests': '1',
37 | 'Cache-Control': 'max-age=0'}
38 |
39 |
40 | def sogou_info_page(keyword):
41 | result_lst = []
42 | for page_num in range(1,11):
43 | search_url = 'http://news.sogou.com/news?&query='+keyword+'&page='+str(page_num)
44 | get_page = requests.get(search_url, headers=headers)
45 | page = get_page.text
46 | soup = BeautifulSoup(page, 'html.parser')
47 | news_lst = soup.find_all('div', {'class': 'vrwrap'})
48 | for line in news_lst:
49 | try:
50 | title = line.div.h3.a.text
51 | url = line.div.h3.a['href']
52 | source_and_release_time = line.find('p', {'class': 'news-from'}).text
53 | source_and_release_time_lst = source_and_release_time.split('\xa0')
54 | source = source_and_release_time_lst[0]
55 | release_time_str = source_and_release_time_lst[-1]
56 | release_time = trans_strtime_to_timestamp(release_time_str)
57 | try:
58 | content = line.find('span').text
59 | except:
60 | print('no content at %s' % title)
61 | content = 'missing'
62 | fetch_time = int(time.time()*1000)
63 | try:
64 | similar_news = line.find('a', {'id': 'news_similar'}).text
65 | except:
66 | print('no similar news at %s' % title)
67 | similar_news = 'missing'
68 | news_info = {'title': title,
69 | 'url': url,
70 | 'source': source,
71 | 'release_time': release_time,
72 | 'fetch_time': fetch_time,
73 | 'content': content,
74 | 'similar_news': similar_news,
75 | 'keyword': keyword}
76 | result_lst.append(news_info)
77 | print('get data at page %s' % page_num)
78 | except:
79 | print('the error occurred at position %s' % news_lst.index(line))  # the original built this string without printing it
80 | return result_lst
81 |
82 |
83 | if __name__=='__main__':
84 | keyword = '中超'
85 | test_sogou = sogou_info_page(keyword)
86 |
87 |
--------------------------------------------------------------------------------
/crawler_sys/site_crawler/crawler_watermelon.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Mon May 28 10:29:57 2018
4 |
5 | @author: fangyucheng
6 | """
7 |
8 |
9 |
10 |
11 | import requests
12 | import json
13 | import datetime
14 | import re
15 | from framework.video_fields_std import Std_fields_video
16 | #from . import bulk_write_into_es
17 | import js2py
18 | import hashlib
19 | import time
20 | from selenium import webdriver
21 |
22 |
23 | class Crawler_Watermelon(Std_fields_video):
24 |
25 | def write_into_file(self, data_dict, file_obj):
26 | json_str=json.dumps(data_dict)
27 | file_obj.write(json_str)
28 | file_obj.write('\n')
29 | file_obj.flush()
30 |
31 |
32 | def feed_url_into_redis(self, dict_Lst):
33 | pass
34 |
35 |
36 | def output_result(self, result_Lst, output_to_file=False, filepath=None):
37 | # write data into es crawler-raw index
38 | #bulk_write_into_es(result_Lst)
39 |
40 | # feed url into redis
41 | self.feed_url_into_redis(result_Lst)
42 |
43 | # output into file according to passed in parameters
44 | if output_to_file==True and filepath!=None:
45 | output_fn='crawler_watermelon_%s_json' % datetime.datetime.now().isoformat()[:10]
46 | output_f=open(filepath+'/'+output_fn, 'a', encoding='utf-8')
47 | self.write_into_file(result_Lst, output_f)
48 | else:
49 | pass
50 |
51 |
52 | def get_list_video(self,output_to_file=False, filepath=None):
53 | result_Lst = []
54 | max_behot_time = 0
55 | count = 0
56 |
57 | headers = {'Host': 'ic.snssdk.com',
58 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0',
59 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
60 | 'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
61 | 'Accept-Encoding': 'gzip, deflate',
62 | 'Cookie': 'odin_tt=5b54e47f71b1963502fe03c4028f5672c887a0b739ce2302481beda2a4388a0a538ade820b54b4589da13d18dde9d245',
63 | 'Connection': 'keep-alive',
64 | 'Upgrade-Insecure-Requests': '1',
65 | 'Cache-Control': 'max-age=0'}
66 |
67 |
68 | while count <= 0:
69 | time_now = int(time.time())
70 | listurl = 'http://ic.snssdk.com/video/app/stream/v51/?category=subv_xg_society&refer=1&count=20&max_behot_time='+str(max_behot_time)+'&list_entrance=main_tab&last_refresh_sub_entrance_interval='+str(time_now)
71 | #http://ic.snssdk.com/video/app/stream/v51/?category=subv_xg_society&refer=1&count=20&list_entrance=main_tab&last_refresh_sub_entrance_interval=1527473360&loc_mode=5&tt_from=refresh_auto&play_param=codec_type%3A0&iid=33815381012&device_id=52965120460&ac=wifi&channel=wandoujia&aid=32&app_name=video_article&version_code=653&version_name=6.5.3&device_platform=android&ab_version=359940%2C344692%2C353539%2C356329%2C361439%2C324397%2C361311%2C358091%2C358364%2C356602%2C350431%2C354439%2C325211%2C346575%2C342302%2C361530%2C320651%2C361551&ssmix=a&device_type=MuMu&device_brand=Android&language=zh&os_api=19&os_version=4.4.4&uuid=008796749793280&openudid=54767d8bf41ac9a4&manifest_version_code=253&resolution=1280*720&dpi=240&update_version_code=65307&_rticket=1527473360674&rom_version=cancro-eng+4.4.4+V417IR+eng.root.20180201.174500+release-keys&fp=i2T_FYmuPzL5Fl4ZcrU1FYFeL2FW
72 |
73 | get_page = requests.get(listurl,headers=headers)
74 | # The original hand-replaced true/false/null in the response text and ran
75 | # eval() on it, which is fragile and unsafe for remote content.
76 | # requests can decode the JSON directly:
77 | page_dic = get_page.json()
78 |
79 | video_agg = page_dic['data']
80 | count += 1
81 | for line in video_agg:
82 | try:
83 | video_str=line['content']
84 | video_dic=json.loads(video_str)  # 'content' is itself a JSON string
85 | if video_dic['has_video']==True:
86 | title = video_dic['title']
87 | url = video_dic['display_url']
88 | browser = webdriver.Chrome()
89 | browser.get(url)
90 | pc_midstep = browser.find_element_by_class_name('num').text
91 | play_count = ' '.join(re.findall('\d+',pc_midstep))
92 | browser.quit()  # close this Chrome instance before the next video (the original only closed the last one)
93 | release_time = int(video_dic['publish_time']*1e3)
94 | play_count2 = video_dic['read_count']
95 | releaser = video_dic['media_name']
96 | max_behot_time = video_dic['behot_time']
97 | video_id = video_dic['item_id']
98 | releaser_id = video_dic['user_info']['user_id']
99 | fetch_time = int(datetime.datetime.now().timestamp()*1e3)
100 |
101 | D0={'title':title,'url':url,'release_time':release_time,'releaser':releaser,'play_count':play_count,
102 | 'video_id':video_id,'releaser_id':releaser_id,'fetch_time':fetch_time,'play_count2':play_count2}
103 |
104 | result_Lst.append(D0)
105 | print ('get one video')
106 | except:
107 | pass
108 | # per-video browser instances are closed inside the loop above
109 | self.output_result(result_Lst,output_to_file=output_to_file,filepath=filepath)
110 | return result_Lst
111 | #result_Lst.clear()
112 |
113 | if __name__=='__main__':
114 | test=Crawler_Watermelon()
115 | output_to_file = True
116 | filepath = 'D:/CSM3.0/爬虫结果/watermelon'
117 | gogogo = test.get_list_video(output_to_file,filepath)
--------------------------------------------------------------------------------
/crawler_sys/site_crawler/crawler_weibo/cookie_pool:
--------------------------------------------------------------------------------
1 | ['_T_WM=9ff1a76df3fb08baf991c83c0ecfdc93', 'SSOLoginState=1565062981', 'SUHB=0S7yXlcJlzydaX', 'SCF=AhdzjhXNYWCBvPKwVWrXOk-g-JBS9ojbzjFzZ-ppbfmgNLRAwcuir4WiJVxmdGuC4EPxXqTVzYsP2sVEbUHXojg.', 'SUB=_2A25wTIcVDeRhGeFM7lcY8ivEwzyIHXVTzildrDV6PUJbkdANLUjikW1NQN3cyJOFrdgYFfLOWkaG5qHFYk_I9rk1']
2 |
--------------------------------------------------------------------------------
/crawler_sys/site_crawler/crawler_weibo/ghostdriver.log:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_sys/site_crawler/crawler_weibo/ghostdriver.log
--------------------------------------------------------------------------------
/crawler_sys/site_crawler/crawler_weibo/login.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # @Time : 2019/8/6 10:47
3 | # @Author : litao
4 | from selenium import webdriver
5 | from selenium.webdriver.common.by import By
6 | from selenium.webdriver.support.ui import WebDriverWait
7 | from selenium.webdriver.support import expected_conditions as EC
8 | import sys,os
9 |
10 | class WeiboLogin():
11 | def __init__(self, username, password):
12 | # os.system('pkill -f phantom')
13 | chrome_options = webdriver.ChromeOptions()
14 | # chrome_options.add_argument('--headless')
15 | # chrome_options.add_argument('--disable-gpu')
16 | # driver = webdriver.Remote(command_executor='http://192.168.18.11:4444/wd/hub',
17 | # desired_capabilities=DesiredCapabilities.CHROME)
18 | self.url = 'https://passport.weibo.cn/signin/login?entry=mweibo&r=https://weibo.cn/'
19 | self.browser = webdriver.Chrome(r'../chromedriver.exe', options=chrome_options)
20 | #self.browser.set_window_size(1050, 840)
21 | self.wait = WebDriverWait(self.browser, 20)
22 | self.username = username
23 | self.password = password
24 |
25 | def open(self):
26 | """
27 | Open the login page, fill in the username and password, and click submit
28 | :return: None
29 | """
30 | self.browser.get(self.url)
31 | username = self.wait.until(EC.presence_of_element_located((By.ID, 'loginName')))
32 | password = self.wait.until(EC.presence_of_element_located((By.ID, 'loginPassword')))
33 | submit = self.wait.until(EC.element_to_be_clickable((By.ID, 'loginAction')))
34 | username.send_keys(self.username)
35 | password.send_keys(self.password)
36 | submit.click()
37 |
38 | def run(self):
39 | """
40 | Log in and harvest the session cookies
41 | :return:
42 | """
43 | self.open()
44 | WebDriverWait(self.browser, 30).until(
45 | EC.title_is('我的首页')
46 | )
47 | cookies = self.browser.get_cookies()
48 | cookie = [item["name"] + "=" + item["value"] for item in cookies]
49 | cookie_str = '; '.join(item for item in cookie)
50 | self.browser.quit()
51 | return str(cookie_str)
52 |
53 | if __name__ == '__main__':
54 | user_name = '13910233534'
55 | password = 'Lemo1995'
56 | cookie_str = WeiboLogin(user_name, password).run()
57 | print(cookie_str,type(cookie_str))
58 | with open("./cookie_pool","a",encoding="utf-8") as f:
59 | f.write(cookie_str+"\n")
60 |
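login.py appends a '; '-joined cookie string per run to cookie_pool (the cookie_pool file shown earlier also holds a Python-list form, so both layouts may appear). A minimal sketch of reusing a saved cookie with requests, assuming the stored line is already in 'name=value; name=value' form:

    import requests

    def session_from_cookie_str(cookie_str):
        # Attach a previously saved weibo cookie string to a requests session.
        session = requests.Session()
        session.headers['Cookie'] = cookie_str.strip()
        return session

    # with open('./cookie_pool', encoding='utf-8') as f:
    #     s = session_from_cookie_str(f.readlines()[-1])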
--------------------------------------------------------------------------------
/crawler_sys/site_crawler/crawler_xinhua.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Thu Dec 6 21:50:08 2018
4 |
5 | @author: fangyucheng
6 | """
7 |
8 |
9 | import requests
10 |
11 | url = 'https://xhpfmapi.zhongguowangshi.com/v500/core/indexlist'
12 |
13 | headers = {"Accept-Encoding": "*",
14 | "Content-Type": "application/json; charset=UTF-8",
15 | "Content-Length": "870",
16 | "Host": "xhpfmapi.zhongguowangshi.com",
17 | "Connection": "Keep-Alive",
18 | "User-Agent": "okhttp/3.10.0"}
19 |
20 |
21 | post_dict = ('{"param":"vPCnptKLNQB9Bldt193BqMWKnVNcVmaQ4pcJ6k8iV5wmESJSidV85U3rbdOK'
22 | 'e+jmLQUDdj8Ca9QuYHizMT6l7Vq3CKOBgtNieEbRQihk10GoWKkS+L9BWI9M '
23 | 'd7Iq3IT/HL4saSsu6fkEe3PYjuA6EXKGPF6EThfdaQ3CyGPP+KhVFGp3C1Fk '
24 | '9U50pbC0HflUGTK1iH7U7A9ZKNjhNd07/U49uddfqZ8OFbZjBw4mwRij0tGP '
25 | 'F5jXpIBKoNsSnpWNTQnL86VCLd7+9jmQ+PzCkdmEiYNqRPYzXv+ihAhvBNY9 '
26 | 'yg/18dVE1+zwHR685iDwabVLWppSxacQJr7iz1uuc6O0hoIWiUBkhDcUsD8S '
27 | 'O3/DXZ8PgLMEvagO4TfS4AqMy/n5rW5UwzYr+x8jKDpr0kNjrUUH2vvXlQr6 '
28 | '+TfqwS5qHm6+nGCLoyv7HyMoOmvSraDmg3OD66rGTol/Ri9NYlSrNpyYemGG '
29 | 'fAuUuZ1pIzNeTzpF02TvzF2OQ8T2iROkmUK8iSXAuViE4I+KNKq959APFJ6Y '
30 | 'Fs+i7nfZ0d2/5jonslonNYfHqAtotAuciwEW+fqy9aZpHMRQaU5XjUeVV0Sk '
31 | 'dKiY2SveT59VTiOBgi8fy+q096BYfC+vPBp780AE0A8UnpgWz6LdyyP4GR0J '
32 | 'JTVN6F4TiPnD5mA7Lr4fpUPlVtNt8BfGKS/AkIGD+BaLVxArmlbC/6A9/caI '
33 | 'ldYWwg4yIu+CLjkDtif2NUwsqgfbgfbj/pbhPZGVzC+KjCP382OmQHKM4HoO '
34 | 'tnJueIoPIb14EwBSz98qmJ6tMBJa2BxsSVbKV076QBE7qNiJF6ZFBimSwob8 '
35 | 'upM="}')
36 |
37 | get_page = requests.post(url, data=post_dict, headers=headers, verify=False)
38 |
39 | page = get_page.json()
--------------------------------------------------------------------------------
/crawler_sys/site_crawler/toutiao_get_signature.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # @Time : 2019/4/10 12:05
3 | # @Author : litao
4 | import time
5 | import hashlib
6 | # import execjs
7 |
8 | # def get_js(id):
9 | # # f = open(r"D:\work_file\test\index.js", 'r', encoding='UTF-8')
10 | # f = open(r"D:\work_file\work_file\crawler\crawler_sys\site_crawler\toutiao_test_js.js", 'r', encoding='UTF-8')
11 | # line = f.readline()
12 | # htmlstr = ''
13 | # while line:
14 | # htmlstr = htmlstr + line
15 | # line = f.readline()
16 | # ctx = execjs.compile(str(htmlstr))
17 | # return ctx.call('get_as_cp_signature',id)
18 |
19 | def getHoney(): # derive the as/cp signature values (ported from the site's JS)
20 | t = int(time.time()) # current unix time in seconds
21 | # t=1534389637
22 | # print(t)
23 | e = str('%X' % t) # timestamp as uppercase hex
24 | # print(e)
25 | m1 = hashlib.md5() # MD5 of the timestamp
26 | m1.update(str(t).encode(encoding='utf-8')) # hash the decimal timestamp string
27 | i = str(m1.hexdigest()).upper() # uppercase hex digest
28 | # print(i)
29 | n = i[0:5] # first 5 characters of the digest
30 | a = i[-5:] # last 5 characters of the digest
31 | s = ''
32 | r = ''
33 | for x in range(0, 5): # interleave digest and timestamp characters
34 | s += n[x] + e[x]
35 | r += e[x + 3] + a[x]
36 | eas = 'A1' + s + e[-3:]
37 | ecp = e[0:3] + r + 'E1'
38 | # print(eas)
39 | # print(ecp)
40 | return eas, ecp
41 |
42 |
43 |
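getHoney only returns the two values. A usage sketch, assuming they travel as the `as` and `cp` query parameters (the comment above calls them as/cp; the endpoint and exact parameter names here are illustrative, not confirmed by this file):

    import requests

    def fetch_with_signature(base_url, params=None):
        # Attach the derived values to the request's query string.
        eas, ecp = getHoney()
        params = dict(params or {})
        params.update({'as': eas, 'cp': ecp})
        return requests.get(base_url, params=params, timeout=10)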
--------------------------------------------------------------------------------
/crawler_sys/site_crawler_by_redis/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # @Time : 2019/6/20 13:52
3 | # @Author : litao
4 |
--------------------------------------------------------------------------------
/crawler_sys/site_crawler_by_redis/chromedriver:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_sys/site_crawler_by_redis/chromedriver
--------------------------------------------------------------------------------
/crawler_sys/site_crawler_by_redis/toutiao_article.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # @Time : 2020/5/15 15:11
3 | # @Author : litao
4 | from crawler.crawler_sys.site_crawler_by_redis.crawler_toutiao import Crawler_toutiao
5 |
6 |
7 | class Crawler_toutiao_article(Crawler_toutiao):
8 | def __init__(self):
9 | super().__init__()
10 |
11 | def releaser_page_by_time(self, start_time=None, end_time=None, url=None, allow=None, **kwargs):
12 | count_false = 0
13 | for res in self.article_page(url, proxies_num=kwargs.get("proxies_num")):
14 | video_time = res["release_time"]
15 | # print(res)
16 | if video_time:
17 | if start_time < video_time:
18 | if video_time < end_time:
19 | yield res
20 | else:
21 | count_false += 1
22 | if count_false > allow:
23 | break
24 | else:
25 | yield res
26 | count_false = 0
27 | for res in self.microheadlines_page(url, proxies_num=kwargs.get("proxies_num")):
28 | video_time = res["release_time"]
29 | print(video_time)
30 | if video_time:
31 | if start_time < video_time:
32 | if video_time < end_time:
33 | yield res
34 | else:
35 | count_false += 1
36 | if count_false > allow:
37 | break
38 | else:
39 | yield res
40 |
41 | if __name__ == "__main__":
42 | test = Crawler_toutiao_article()
43 | for a in test.article_page("https://www.toutiao.com/c/user/5821222208/#mid=5821222208"):
44 | print(a)
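A usage sketch for releaser_page_by_time. The one-week window is illustrative, and the millisecond unit of release_time is an assumption based on the int(time.time()*1e3) convention used elsewhere in this repo:

    import time

    crawler = Crawler_toutiao_article()
    end_ts = int(time.time() * 1e3)             # now, in milliseconds (assumed unit)
    start_ts = end_ts - 7 * 24 * 3600 * 1000    # one week back
    url = "https://www.toutiao.com/c/user/5821222208/#mid=5821222208"
    for item in crawler.releaser_page_by_time(start_ts, end_ts, url, allow=10):
        print(item.get("title"), item.get("release_time"))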
--------------------------------------------------------------------------------
/crawler_sys/site_crawler_test/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # @Time : 2019/6/20 13:52
3 | # @Author : litao
4 |
--------------------------------------------------------------------------------
/crawler_sys/special_propose/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Tue May 15 13:59:43 2018
4 |
5 | @author: hanye
6 | """
7 |
8 |
--------------------------------------------------------------------------------
/crawler_sys/special_propose/crawler.v_qq_comment.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Tue Mar 27 15:09:29 2018
4 |
5 | @author: fangyucheng
6 | """
7 |
8 | import urllib.request
9 |
10 | def get_comment(last):
11 | comment_lst=[]
12 | while (type(last)!=bool):
13 | url = "http://coral.qq.com/article/"+str(targetid)+"/comment/v2?callback=_article"+str(targetid)+"commentv2&oriorder=o&pageflag=1&cursor="+str(last)
14 | headers = ('User-Agent','Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11')
15 | opener = urllib.request.build_opener()
16 | opener.addheaders = [headers]
17 | data = opener.open(url).read()
18 | page=str(data,encoding='utf-8')
19 | useful_infor=page.split('commentv2')[1]
20 | str_to_dic=useful_infor[1:-1]
21 | str_to_dic=str_to_dic.replace('true','True')
22 | str_to_dic=str_to_dic.replace('false','False')
23 | dic_all_infor=eval(str_to_dic)
24 | ttt=dic_all_infor['data']
25 | last=ttt['last']
26 | print(last)
27 | repcomment=ttt['repCommList']
28 | if type(repcomment)==list:
29 | repcomment_lst=repcomment
30 | print('repcomment')
31 | elif type(repcomment)==dict:
32 | repcomment_lst=list(repcomment.values())  # assign the result (the original discarded it, leaving repcomment_lst unbound)
33 | print('repcomment')
34 | else:
35 | repcomment=None
36 | if repcomment!=None:
37 | for yyy in repcomment_lst:
38 | if type(yyy)==list:
39 | for uu in yyy:
40 | content=uu['content']
41 | parent=uu['parent']
42 | publishdate=uu['time']
43 | userid=uu['userid']
44 | upcount=uu['up']
45 | contentid=uu['id']
46 | dadorson=2
47 | D2={'content':content,'userid':userid,'upcount':upcount,'publishdate':publishdate,'parent':parent,'contentid':contentid,'dadorson':dadorson}
48 | comment_lst.append(D2)
49 | else:
50 | content=yyy['content']  # the else branch handles a single comment dict (the original referenced uu here)
51 | parent=yyy['parent']
52 | publishdate=yyy['time']
53 | userid=yyy['userid']
54 | upcount=yyy['up']
55 | contentid=yyy['id']
56 | dadorson=2
57 | D2={'content':content,'userid':userid,'upcount':upcount,'publishdate':publishdate,'parent':parent,'contentid':contentid,'dadorson':dadorson}
58 | comment_lst.append(D2)
59 | else:
60 | print('no repcomment')
61 | comment=ttt['oriCommList']
62 | for zzz in comment:
63 | content=zzz['content']
64 | contentid=zzz['id']
65 | upcount=zzz['up']
66 | publishdate=zzz['time']
67 | userid=zzz['userid']
68 | parent=contentid
69 | dadorson=1
70 | D0={'content':content,'userid':userid,'upcount':upcount,'publishdate':publishdate,'contentid':contentid,'parent':parent,'dadorson':dadorson}
71 | comment_lst.append(D0)
72 | print('get all comment')
73 | return comment_lst
74 |
75 | if __name__=='__main__':
76 | last=0
77 | targetid=2426229062
78 | comment=get_comment(last)
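get_comment strips the JSONP wrapper by hand and runs eval on the remainder. An eval-free sketch, assuming the response body really is `_article<targetid>commentv2(<json>)` as the string handling above implies:

    import json

    def parse_comment_jsonp(page, targetid):
        # Strip the JSONP callback wrapper and decode the JSON body directly.
        prefix = '_article' + str(targetid) + 'commentv2('
        start = page.index(prefix) + len(prefix)
        return json.loads(page[start:page.rindex(')')])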
--------------------------------------------------------------------------------
/crawler_sys/special_propose/crawler.v_qq_danmu.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Mon Mar 26 13:50:00 2018
4 |
5 | @author: fangyucheng
6 | """
7 |
8 | import requests
9 |
10 |
11 | def danmu(x,jyid,targetid):
12 | danmu_lst=[]
13 | pagenum_lst=[]
14 | for i in range(0,x):
15 | ttt=15*(1+i)
16 | pagenum_lst.append(ttt)
17 | for pagenum in pagenum_lst:
18 | try:
19 | url='https://mfm.video.qq.com/danmu?otype=json&callback=jQuery'+jyid+'&timestamp='+str(pagenum)+'&target_id='+targetid+'&count=500&second_count=6&session_key=0%2C0%2C0'
20 | get_page=requests.get(url)
21 | get_page.encoding='utf-8'
22 | page=get_page.text
23 | length=len(jyid)+7
24 | prepage=page[length:-1]
25 | prepage=prepage.replace('\r','')
26 | prepage=prepage.replace('\n','')  # strip newlines (the original replaced the literal '/n')
27 | dicdicdic=eval(prepage)
28 | danmu_count=dicdicdic['count']
29 | if danmu_count>1000:
30 | print(danmu_count)
31 | print(pagenum)
32 | print('get one page')
33 | get_danmu_lst=dicdicdic['comments']
34 | for danmu in get_danmu_lst:
35 | commentid=danmu['commentid']
36 | content=danmu['content']
37 | timepoint=danmu['timepoint']
38 | upcount=danmu['upcount']
39 | opername=danmu['opername']
40 | D0={'commentid':commentid,'content':content,'timepoint':timepoint,'upcount':upcount,'opername':opername}
41 | danmu_lst.append(D0)
42 | except SyntaxError:
43 | print(str(pagenum)+'there is sth wrong')
44 | return danmu_lst
45 |
46 | if __name__=='__main__':
47 | x=int((1*3600+31*60+28)/15)
48 | jyid='19103025125001255282_1522399545358'
49 | targetid='2434347230'
50 | video_data777=danmu(x,jyid,targetid)
51 | #one
52 | #1 33 13
53 | #https://mfm.video.qq.com/danmu?otype=json&callback=jQuery19104770781474841782_1522044823510&timestamp=15&target_id=2431410170&count=80&second_count=6&session_key=0%2C0%2C0&_=1522044823541
54 |
55 | #two
56 | #1 31 28
57 | #https://mfm.video.qq.com/danmu?otype=json&callback=jQuery1910006379066561103097_1522048580301&timestamp=15&target_id=2432862868&count=80&second_count=6&session_key=0%2C0%2C0&_=1522048580319
58 |
59 | #three
60 | #1 31 28
61 | #https://mfm.video.qq.com/danmu?otype=json&callback=jQuery19104977942731832877_1522048936005&timestamp=45&target_id=2434347230&count=80&second_count=6&session_key=178328%2C326%2C1522048940&_=1522048936017
62 |
63 | #four
64 | #1 31 12
65 | #https://mfm.video.qq.com/danmu?otype=json&callback=jQuery191007914957214696439_1522048988303&timestamp=15&target_id=2464055709&count=80&second_count=6&session_key=0%2C0%2C0&_=1522048988318
66 |
67 | #five
68 | #1 34 47
69 | #https://mfm.video.qq.com/danmu?otype=json&callback=jQuery19109570751887462264_1522049153524&timestamp=45&target_id=2479936974&count=80&second_count=6&session_key=95212%2C150%2C1522049156&_=1522049153539
70 |
71 | #six
72 | #1 31 27
73 | #https://mfm.video.qq.com/danmu?otype=json&callback=jQuery1910025632186610303198_1522050281547&timestamp=135&target_id=2497027899&count=80&second_count=6&session_key=135654%2C180%2C1522050285&_=1522050281568
74 |
75 | #seven
76 | #1 31 22
77 | #https://mfm.video.qq.com/danmu?otype=json&callback=jQuery191022889623011170301_1522050238126&timestamp=15&target_id=2515637880&count=80&second_count=6&session_key=0%2C0%2C0&_=1522050238141
78 |
79 |
80 | #https://mfm.video.qq.com/danmu?otype=json&callback=jQuery19105157512767429676_1522391911686&timestamp=15&target_id=2515637880&count=80&second_count=6&session_key=0%2C0%2C0&_=1522391911700
81 |
82 | #https://mfm.video.qq.com/danmu?otype=json&callback=jQuery19103025125001255282_1522399545358&timestamp=105&target_id=2434347230&count=80&second_count=6&session_key=186386%2C332%2C1522399582&_=1522399545371
83 |
--------------------------------------------------------------------------------
/crawler_sys/special_propose/eastnews.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Tue Mar 13 15:21:47 2018
4 |
5 | @author: fangyucheng
6 | """
7 |
8 | import requests
9 | from bs4 import BeautifulSoup
10 | import re
11 | import datetime
12 | import pickle
13 | import pandas as pd
14 |
15 | class Crawler_v_qq_eastnews:
16 |
17 | def video_page(self, url):
18 | get_page=requests.get(url)
19 | get_page.encoding='utf-8'
20 | page = get_page.text
21 | soup = BeautifulSoup(page,'html.parser')
22 | try:
23 | title=soup.find('h1',{'class':'video_title _video_title'}).text
24 | title=title.replace('\n','')
25 | title=title.replace('\t','')
26 | except AttributeError:
27 | title=None
28 | try:
29 | releaser=soup.find('span',{'class':'user_name'}).text
30 | except:
31 | releaser=None
32 | try:
33 | releaserUrl=soup.find('a',{'class':'user_info'})['href']
34 | except TypeError:
35 | releaserUrl=None
36 | try:
37 | video_intro=soup.find('meta',{'itemprop':'description'})['content']
38 | except TypeError:
39 | video_intro=None
40 | try:
41 | midstep = soup.find("script",{"r-notemplate":"true"}).text
42 | try:
43 | duration = re.findall(r'"duration":[0-9]{1,10}', ','.join(re.findall(r'VIDEO_INFO.*"duration":[0-9]{1,10}', midstep)))[0].split(':')[1]
44 | except IndexError:
45 | duration = re.findall(r'"duration":"[0-9]{1,10}"', ','.join(re.findall(r'VIDEO_INFO.*"duration":"[0-9]{1,10}"', midstep)))[0].split(':')[1]
46 | duration=duration.replace('"','')
47 | duration=int(duration)
48 | except:
49 | print('Caught exception, didn\'t find duration in var VIDEO_INFO')
50 | duration=0
51 | try:
52 | playcount = re.findall(r'"view_all_count":[0-9]{1,10}', ','.join(re.findall(r'VIDEO_INFO.*"view_all_count":[0-9]{1,10}', midstep)))[0].split(':')[1]
53 | except:
54 | print('Caught exception, didn\'t find view_all_count in var VIDEO_INFO')
55 | playcount=0
56 | retime=re.findall(r'"video_checkup_time":"\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d"', ','.join(re.findall(r'VIDEO_INFO.*"video_checkup_time":"\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d"', midstep)))[0].split('":"')[1].split(' ')[0]
57 | try:
58 | release_time=int(datetime.datetime.strptime(retime,'%Y-%m-%d').timestamp()*1e3)
59 | except ValueError:
60 | release_time=0
61 | except:
62 | duration=None
63 | playcount=None
64 | release_time=None
65 | D0={'title':title,'playcount':playcount,'releaser':releaser,'video_intro':video_intro,'release_time':release_time,'duration':duration,'releaserUrl':releaserUrl}
66 | return D0
67 |
68 | def search_page(self,totalpage):
69 | video_Lst=[]
70 | url_Lst=[]
71 | page_Lst=['https://v.qq.com/x/search/?ses=qid%3D_5hveCy5oWKS_b5d4GuLquXTO29F8LJnLcmNDpNkXFkeEr8UDB0g9g%26last_query%3D%E4%B8%9C%E6%96%B9%E6%96%B0%E9%97%BB%26tabid_list%3D0%7C11%7C8%7C7%26tabname_list%3D%E5%85%A8%E9%83%A8%7C%E6%96%B0%E9%97%BB%7C%E5%8E%9F%E5%88%9B%7C%E5%85%B6%E4%BB%96&q=%E4%B8%9C%E6%96%B9%E6%96%B0%E9%97%BB&stag=3&cur={}&cxt=tabid%3D0%26sort%3D1%26pubfilter%3D0%26duration%3D3'.format(str(i)) for i in range(1,totalpage)]
72 | for page_url in page_Lst:
73 | get_page=requests.get(page_url)
74 | print (page_url)
75 | get_page.encoding='utf-8'
76 | page = get_page.text
77 | soup = BeautifulSoup(page,'html.parser')
78 | tencent = soup.find_all("div", { "class" : "result_item result_item_h _quickopen" })
79 | for data_line in tencent:
80 | try:
81 | ttt=data_line.find('span',{'title':'东方新闻'}).text
82 | except AttributeError:
83 | ttt=None
84 | if ttt==None:
85 | urls=None
86 | else:
87 | urls=data_line.h2.a['href']
88 | get_page=requests.get(urls)
89 | print (urls)
90 | get_page.encoding='utf-8'
91 | page = get_page.text
92 | soup = BeautifulSoup(page,'html.parser')
93 | fff=soup.find_all('a',{'class':'figure_detail'})
94 | for zzz in fff:
95 | urls1=zzz['href']
96 | urls2='https://v.qq.com'+urls1
97 | url_Lst.append(urls2)
98 | for url in url_Lst:
99 | dicdic = self.video_page(url)
100 | dicdic['url']=url
101 | print(url)
102 | video_Lst.append(dicdic)
103 | return video_Lst
104 |
105 |
106 | if __name__=='__main__':
107 | v_qq_crawler = Crawler_v_qq_eastnews()
108 | search_page2=v_qq_crawler.search_page(totalpage=20)
--------------------------------------------------------------------------------
/crawler_sys/special_propose/get_weibo_user_info.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Thu May 31 09:20:25 2018
4 |
5 | Crawls at a snail's pace; needs to be sped up
6 |
7 | @author: fangyucheng
8 | """
9 |
10 |
11 | from selenium import webdriver
12 | import re
13 | import json
14 | import time
15 |
16 |
17 |
18 | class Crawler_Get_Weibo_User_Info():
19 |
20 |
21 | def from_file_to_list(self,filename):
22 | openfile = open(filename)
23 | task = []
24 | for line in openfile:
25 | line_dic = line.replace('\n','')
26 | task.append(line_dic)
27 | return task
28 |
29 |
30 |
31 | def get_user_info(self,filename,resultname):
32 | result = open(resultname,'a')
33 | result_lst = []
34 | task_lst = self.from_file_to_list(filename)
35 | browser = webdriver.Chrome()
36 | for url in task_lst:
37 | try:
38 | browser.get(url)
39 | browser.maximize_window()
40 | time.sleep(8)
41 | user_name = browser.find_element_by_class_name('username').text
42 | print('get user_name')
43 | user_info = browser.find_element_by_class_name('info').text
44 | print('get user_info')
45 | try:
46 | industry_detail_intro = browser.find_element_by_class_name('ul_detail').text
47 | pattern1 = '行业类别'
48 | pattern2 = '简介'
49 | pattern3 = '毕业于'
50 | try:
51 | industry = ' '.join(re.findall('行业类别.*',industry_detail_intro)).replace(pattern1,'').replace(' ','')
52 | print('get industry')
53 | except:
54 | industry = None
55 | try:
56 | detail_intro = ' '.join(re.findall('简介.*',industry_detail_intro)).replace(pattern2,'').replace(' ','').replace(':','')
57 | print('get detail_intro')
58 | except:
59 | detail_intro = None
60 | try:
61 | graduated_from = ' '.join(re.findall('毕业于.*',industry_detail_intro)).replace(pattern3,'').replace(' ','')
62 | print('get graduated_from')
63 | except:
64 | graduated_from = None
65 | except:
66 | pass
67 | followers_fans = browser.find_element_by_class_name('tb_counter').text.split('\n')
68 | followers = followers_fans[0]
69 | print('get followers')
70 | fans = followers_fans[2]
71 | print('get fans')
72 | weibo_num = followers_fans[4]
73 | print('get weibo_num')
74 | D0 = {'user_name':user_name,
75 | 'user_info':user_info,
76 | 'industry':industry,
77 | 'detail_intro':detail_intro,
78 | 'followers':followers,
79 | 'fans':fans,
80 | 'weibo_num':weibo_num,
81 | 'url':url,
82 | 'graduated_from':graduated_from}
83 | print('get one user')
84 | result_lst.append(D0)
85 | json_D0 = json.dumps(D0)
86 | result.write(json_D0)
87 | result.write('\n')
88 | result.flush()
89 | except:
90 | pass
91 | return result_lst
92 |
93 |
94 |
95 | if __name__=='__main__':
96 | test = Crawler_Get_Weibo_User_Info()
97 | filename='D:\CSM3.0\爬虫结果\weibo_user/weibo_user_list_try.txt'
98 | resultname='D:\CSM3.0\爬虫结果\weibo_user/weibo_user_info_20180531_2'
99 | firstV3 = test.get_user_info(filename,resultname)
--------------------------------------------------------------------------------
/crawler_sys/special_propose/tocsv.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Wed Mar 28 01:12:09 2018
4 |
5 | @author: Administrator
6 | """
7 |
8 |
9 | import pandas as pd
10 | ttt=pd.DataFrame(comment)  # expects the `comment` list produced by crawler.v_qq_comment.py in the same interactive session
11 | ttt['heiheihei']="'"  # leading apostrophe so spreadsheet tools keep the id as text
12 | ttt['id']=ttt['heiheihei']+ttt['contentid']
13 | ttt['real_time']=pd.to_datetime(ttt['publishdate'],unit='s')
14 | try:
15 | ttt.to_csv('wuwuwu.csv',encoding='utf-8',index=False)
16 | except UnicodeEncodeError:
17 | pass
18 |
--------------------------------------------------------------------------------
/crawler_sys/special_propose/whether_user_exists.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Fri Jun 15 09:49:15 2018
4 |
5 | @author: fangyucheng
6 | """
7 |
8 | from selenium import webdriver
9 | from crawler_sys.utils.Metaorphosis import Meta
10 | import time
11 | import json
12 |
13 |
14 | def open_url(filename,resultname):
15 | result_file = open(resultname,'a')
16 | result_lst = []
17 | browser = webdriver.Chrome()
18 | browser.maximize_window()
19 | time.sleep(60)
20 | meta = Meta()
21 | url_lst = meta.str_file_to_lst(filename)
22 | for url in url_lst:
23 | browser.get(url)
24 | time.sleep(6)
25 | try:
26 | user_name = browser.find_element_by_class_name('username').text
27 | except:
28 | user_name = None
29 | D0 = {'url':url,'user_name':user_name}
30 | json_D0 = json.dumps(D0)
31 | result_file.write(json_D0)
32 | result_file.write('\n')
33 | result_file.flush()
34 | result_lst.append(D0)
35 | return result_lst
36 |
37 |
38 | if __name__=='__main__':
39 | filename = 'D:/CSM3.0/打杂/whether_exists.txt'
40 | resultname = 'D:/CSM3.0/打杂/whether_exists'
41 | result=open_url(filename,resultname)
42 |
--------------------------------------------------------------------------------
/crawler_sys/tools/add_releasers_video_data.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Thu Sep 6 09:22:24 2018
4 |
5 | @author: fangyucheng
6 | """
7 |
8 | import time
9 | from crawler.crawler_sys.framework.platform_crawler_register import get_crawler
10 | from crawler.crawler_sys.utils import trans_format
11 |
12 | def get_target_releaser_video_info(file_name,
13 | output_to_es_raw=True,
14 | es_index=None,
15 | doc_type=None,
16 | releaser_page_num_max=10000):
17 | start_time = int(time.time()*1e3)
18 | task_lst = trans_format.csv_to_lst_with_headline(file_name)
19 | for line in task_lst:
20 | releaserUrl = line['releaserUrl']
21 | platform = line['platform']
22 | crawler = get_crawler(platform=platform)
23 | crawler_initialization = crawler()
24 | if platform == 'haokan':
25 | try:
26 | crawler_initialization.releaser_page(releaserUrl=releaserUrl,
27 | releaser_page_num_max=releaser_page_num_max,
28 | output_to_es_raw=True,
29 | es_index=es_index,
30 | doc_type=doc_type,
31 | fetchFavoriteCommnt=False)
32 | except:
33 | print(releaserUrl)
34 | else:
35 | try:
36 | crawler_initialization.releaser_page(releaserUrl=releaserUrl,
37 | releaser_page_num_max=releaser_page_num_max,
38 | output_to_es_raw=True,
39 | es_index=es_index,
40 | doc_type=doc_type)
41 | except:
42 | print(releaserUrl)
43 | end_time = int(time.time()*1e3)
44 | time_info = [start_time, end_time]
45 | return time_info
46 |
47 | if __name__ =='__main__':
48 | get_time = get_target_releaser_video_info(file_name=r'/home/zhouyujiang/cuowu3.csv',
49 | releaser_page_num_max=1000,
50 | es_index='crawler-data-raw',
51 | doc_type='doc'
52 | )
--------------------------------------------------------------------------------
/crawler_sys/tools/add_releasers_video_data_multi_process.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Thu Sep 6 09:22:24 2018
4 |
5 | @author: fangyucheng
6 | """
7 |
8 | import time
9 | from multiprocessing import Pool
10 | from crawler.crawler_sys.framework.platform_crawler_register import get_crawler
11 | from crawler.crawler_sys.utils import trans_format
12 | from elasticsearch import Elasticsearch
13 | from elasticsearch.helpers import scan
14 |
15 | hosts = '192.168.17.11'
16 | port = 80
17 | user = 'zhouyujiang'
18 | passwd = '8tM9JDN2LVxM'
19 | http_auth = (user, passwd)
20 |
21 | es = Elasticsearch(hosts=hosts, port=port, http_auth=http_auth)
22 |
23 | def func_search_reUrl_from_target_index(platform, releaser):
24 | search_body = {
25 | "query": {
26 | "bool": {
27 | "filter": [
28 | {"term": {"platform.keyword": platform}},
29 | {"term": {"releaser.keyword": releaser}}
30 | ]
31 | }
32 | }
33 | }
34 | search_re = es.search(index='target_releasers', doc_type='doc', body=search_body)
35 | if search_re['hits']['total'] > 0:
36 | return search_re['hits']['hits'][0]['_source']['releaserUrl']
37 | else:
38 | print('Can not found:', platform, releaser)
39 | return None
40 |
41 | def get_target_releaser_video_info(file_name,
42 | output_to_es_raw=True,
43 | es_index=None,
44 | doc_type=None,
45 | releaser_page_num_max=10000):
46 | start_time = int(time.time()*1e3)
47 | task_lst = trans_format.csv_to_lst_with_headline(file_name)
48 | pool = Pool(10)
49 | arg_dict = {"releaser_page_num_max": releaser_page_num_max,
50 | "output_to_es_raw": True,
51 | "es_index": es_index,
52 | "doc_type": doc_type}
53 | for line in task_lst:
54 | platform = line['platform']
55 | releaser = line['releaser']
56 | try:
57 | releaserUrl = line["releaserUrl"]
58 | except:
59 | releaserUrl = func_search_reUrl_from_target_index(platform, releaser)
60 | print("releaserUrl",releaserUrl)
61 | crawler_initialization = get_crawler(platform=platform)
62 | try:
63 | crawler = crawler_initialization().search_page
64 | pool.apply_async(crawler, args=(releaserUrl, ), kwds=arg_dict)
65 | except:
66 | continue
67 | pool.close()
68 | pool.join()
69 | end_time = int(time.time()*1e3)
70 | time_info = [start_time, end_time]
71 | return time_info
72 |
73 | if __name__ =='__main__':
74 | get_time = get_target_releaser_video_info(file_name=r'C:\Users\litao\Desktop\target_releasers - key_custom.csv',
75 | releaser_page_num_max=300,
76 | es_index='crawler-data-raw',
77 | doc_type='doc')
--------------------------------------------------------------------------------
/crawler_sys/tools/add_search_keywords_to_es.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # @Time : 2019/7/19 11:29
3 | # @Author : litao
4 |
5 | import json
6 | import datetime
7 | import elasticsearch
8 | import hashlib
9 | import csv
10 | hosts = '192.168.17.11'
11 | port = 80
12 | user = 'zhouyujiang'
13 | passwd = '8tM9JDN2LVxM'
14 | http_auth = (user, passwd)
15 | es = elasticsearch.Elasticsearch(hosts=hosts, port=port, http_auth=http_auth)
16 |
17 |
18 | def write_es(Lst):
19 | count = 0
20 | bulk_all_body = ""
21 | doc_id_type = "all-time-url"
22 | header_Lst = Lst[0]
23 | linec = 1
24 | sha1 = hashlib.sha1()
25 | for line in Lst:
26 | if linec == 1:
27 | linec += 1
28 | continue
29 | linec += 1
30 | print(linec)
31 | line_dict = dict(zip(header_Lst, line))
32 | dic = {
33 | "title": line_dict["title"],
34 | "timestamp": int(datetime.datetime.now().timestamp() * 1e3),
35 | "platform": line_dict["platform"],
36 | "page": line_dict["page"],
37 | }
38 |
39 | sha1 = hashlib.sha1((line_dict["title"]+line_dict["platform"]).encode("utf8"))  # fresh hash per row so the doc _id depends only on title+platform (the original reused one hash object cumulatively)
40 | bulk_head = '{"index": {"_id":"%s"}}' % sha1.hexdigest()
41 | data_str = json.dumps(dic, ensure_ascii=False)
42 | bulk_one_body = bulk_head + '\n' + data_str + '\n'
43 | bulk_all_body += bulk_one_body
44 | count += 1
45 | if count % 500 == 0:
46 | eror_dic = es.bulk(index=target_index, doc_type=target_type,
47 | body=bulk_all_body, request_timeout=500)
48 | bulk_all_body = ''
49 | if eror_dic['errors'] is True:
50 | print(eror_dic['items'])
51 | print(bulk_all_body)
52 | print(count)
53 |
54 | if bulk_all_body != '':
55 | eror_dic = es.bulk(body=bulk_all_body,
56 | index=target_index,
57 | doc_type=target_type,
58 | request_timeout=500)
59 | if eror_dic['errors'] is True:
60 | print(eror_dic)
61 | bulk_all_body = ''
62 | # print(platform, releaser, 'end_have:', len(wirte_set), 'add:', len(set_url))
63 |
64 |
65 | if __name__ == '__main__':
66 | target_index = 'search_keywords'
67 | target_type = 'doc'
68 |
69 | m3 = open(r"D:\work_file\发布者账号\一次性需求附件\keywords.csv", "r", encoding="gb18030")
70 | file = csv.reader(m3)
71 | data = list(file)
72 | write_es(data)
73 |
--------------------------------------------------------------------------------
/crawler_sys/tools/add_target_releasers_by_file.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Wed Jun 13 11:57:40 2018
4 |
5 | @author: fangyucheng
6 | """
7 |
8 | import elasticsearch
9 | import json
10 | import time
11 | from crawler_sys.utils.releaser_url_check import test_releaserUrl
12 | from crawler_sys.utils import trans_format
13 |
14 |
15 | hosts = '192.168.17.11'
16 | port = 80
17 | user_id = 'fangyucheng'
18 | password = 'VK0FkWf1fV8f'
19 | http_auth = (user_id, password)
20 | lose_re_url = []
21 | es = elasticsearch.Elasticsearch(hosts=hosts, port=port, http_auth=http_auth)
22 |
23 | test_lst = trans_format.csv_to_lst_with_headline('F:/add_target_releaser/album_playcnt/album_playcnt_002.csv')
24 | task_lst = []
25 |
26 | for line in test_lst:
27 | if line['releaserUrl'] is not None:
28 | task_lst.append(line)
29 |
30 | bulk_all_body = ''
31 |
32 | poster = 'fangyucheng'
33 | test_re = test_releaserUrl(task_lst)
34 |
35 | for one_re in test_re:
36 | if one_re['True_or_False'] == 1:
37 | line_dic = {}
38 | post_by = poster
39 | post_time = int(time.time() * 1000)
40 | timestamp = int(time.time() * 1000)
41 | releaserUrl = one_re['releaserUrl']
42 | platform = one_re['platform']
43 | releaser = one_re['releaser']
44 | try:
45 | album_play_count = one_re['album_play_count']
46 | except:
47 | album_play_count = None
48 | _id = platform + '_' + releaser
49 |
50 | bulk_head = '{"index": {"_id":"%s"}}' % _id
51 | line_dic['is_valid'] = True
52 | line_dic['platform'] = platform
53 | line_dic['post_by'] = post_by
54 | if album_play_count is not None:
55 | line_dic['album_play_count'] = album_play_count
56 | line_dic['post_time'] = post_time
57 | line_dic['releaser'] = releaser
58 | line_dic['releaserUrl'] = releaserUrl
59 | line_dic['timestamp'] = timestamp
60 | data_str=json.dumps(line_dic, ensure_ascii=False)
61 | bulk_one_body = bulk_head + '\n' + data_str + '\n'
62 | bulk_all_body += bulk_one_body
63 | es.bulk(index='target_releasers', doc_type='doc',
64 | body=bulk_all_body, request_timeout=200)
65 | bulk_all_body = ''
66 | print('success')
67 |
68 |
--------------------------------------------------------------------------------
/crawler_sys/tools/count_releaser_publish_num_and_refresh_crawler_frequency.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Tue Dec 11 11:59:55 2018
4 |
5 | @author: fangyucheng
6 | """
7 |
8 | import json
9 | import time
10 | import elasticsearch
11 | import elasticsearch.helpers
12 |
13 | hosts = '192.168.17.11'
14 | port = 80
15 | user_id = 'fangyucheng'
16 | password = 'VK0FkWf1fV8f'
17 | http_auth = (user_id, password)
18 |
19 | es = elasticsearch.Elasticsearch(hosts=hosts, port=port, http_auth=http_auth)
20 |
21 | task_list = []
22 | result_list = []
23 |
24 | es_scan = elasticsearch.helpers.scan(es, index='target_releasers')
25 |
26 | for line in es_scan:
27 | task_list.append(line)
28 | print('the length of releaser is %s' % len(task_list))
29 |
30 | bulk_all_body = ''
31 | count = 0
32 | for line in task_list:
33 | releaser_info = line['_source']
34 | platform = releaser_info['platform']
35 | releaser = releaser_info['releaser']
36 | search_body = {"query":{"bool":{"filter":[{"term":{"platform.keyword":platform}},
37 | {"term":{"releaser.keyword":releaser}},
38 | {"term":{"data_month":11}},
39 | {"term":{"data_year":2018}},
40 | {"term":{"stats_type.keyword":"new_released"}}]}}}
41 |
42 | es_search = es.search(index='releaser', doc_type='releasers',
43 | body=search_body)
44 | if es_search['hits']['total'] != 0:
45 | hits = es_search['hits']['hits'][0]['_source']['video_num']
46 | releaser_info['Nov_2018'] = int(hits)
47 | print("releaser %s hit %s video in es" % (releaser, hits))
48 | else:
49 | releaser_info['Nov_2018'] = 0
50 | # task_list.remove(line)  # removing while iterating skips elements; the list is not reused afterwards, so the removal is dropped
51 | total = releaser_info['Nov_2018']
52 | if total >= 900:
53 | releaser_info['frequency'] = 9
54 | print("%s frequency is 9" % releaser_info['releaser'])
55 | elif total >= 300:  # elif so the >=900 branch above is not overwritten
56 | releaser_info['frequency'] = 3
57 | print("%s frequency is 3" % releaser_info['releaser'])
58 | count += 1
59 | else:
60 | releaser_info['frequency'] = 1
61 | _id = platform + '_' + releaser
62 | bulk_head = '{"index": {"_id":"%s"}}' % _id
63 | releaser_info['timestamp'] = int(time.time() * 1e3)
64 | data_str = json.dumps(releaser_info, ensure_ascii=False)
65 | bulk_one_body = bulk_head+'\n'+data_str+'\n'
66 | bulk_all_body += bulk_one_body
67 | es.bulk(index='target_releasers', doc_type='doc',
68 | body=bulk_all_body)
69 | bulk_all_body = ''
70 | print('write %s into es' % releaser)
71 |
--------------------------------------------------------------------------------
/crawler_sys/tools/crawler_url_video_info.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Fetch play count and related info for a video page by its url
4 |
5 | @author: zhouyujiang
6 | """
7 |
8 | import time
9 | from crawler.crawler_sys.framework.platform_crawler_register import get_crawler
10 | from crawler.crawler_sys.utils import trans_format
11 |
12 |
13 |
14 | def get_target_video_info(platform, url):
15 | crawler = get_crawler(platform=platform)
16 | crawler_initialization = crawler()
17 | new_playcount = crawler_initialization.check_play_count_by_video_page(url=url)
18 | return new_playcount
19 |
20 |
21 |
22 |
23 |
24 |
25 | #if __name__ =='__main__':
26 | # get_time = get_target_releaser_video_info(file_name=r'/home/zhouyujiang/cuowu3.csv',
27 | # releaser_page_num_max=1000,
28 | # es_index='crawler-data-raw',
29 | # doc_type='doc'
30 | # )
--------------------------------------------------------------------------------
/crawler_sys/tools/error.log:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_sys/tools/error.log
--------------------------------------------------------------------------------
/crawler_sys/tools/find_high_fre_releaser.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Tue Oct 23 10:03:53 2018
4 |
5 | @author: fangyucheng
6 | """
7 |
8 | import configparser
9 | from elasticsearch import Elasticsearch
10 |
11 | hosts = '192.168.17.11'
12 | port = 80
13 | user_id = 'fangyucheng'
14 | password = 'VK0FkWf1fV8f'
15 | http_auth = (user_id, password)
16 | es_connection = Elasticsearch(hosts=hosts, port=port, http_auth=http_auth)
17 |
18 | search_body = {"query":{"bool":{"filter":[{"term":{"platform.keyword":"haokan"}}]}},
19 | "sort":[{"Nov_2018":{"order":"desc"}}]}
20 |
21 | es_search = es_connection.search(index='target_releasers',
22 | doc_type='doc',
23 | body=search_body, size=1000)
24 |
25 | es_data_lst = es_search['hits']['hits']
26 |
27 | result_list = []
28 |
29 | for line in es_data_lst:
30 | data_dic = line['_source']
31 | result_list.append(data_dic)
32 |
33 | new_list = result_list[:40]
34 |
35 | result_list = []
36 |
37 | releaser_dic = {}
38 | for line in new_list:
39 | releaser_dic[line['releaser']] = line['releaserUrl']
40 |
41 |
42 | config = configparser.ConfigParser()
43 | config['haokan'] = releaser_dic
44 |
45 | with open('high_fre.ini', 'w', encoding='utf-8') as ini:
46 | config.write(ini)
47 |
48 |
49 |
50 | #special task
51 | #for line in source_lst:
52 | # detail_lst = line['detail']
53 | # csm_mdu = detail_lst[0]['csm_mdu']
54 | # for detail_dic in detail_lst:
55 | # detail_dic.pop('csm_mdu')
56 | # line['csm_mdu'] = csm_mdu
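
For reference, the generated high_fre.ini holds a single [haokan] section mapping releaser names to releaserUrls; a minimal sketch of reading it back (section and key names depend on the data pulled above):

import configparser

config = configparser.ConfigParser()
config.read('high_fre.ini', encoding='utf-8')
for releaser, releaser_url in config['haokan'].items():
    print(releaser, releaser_url)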
--------------------------------------------------------------------------------
/crawler_sys/tools/match_play_count_by_video_title.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Fri Nov 23 13:48:33 2018
4 |
5 | @author: fangyucheng
6 | """
7 |
8 | from elasticsearch import Elasticsearch
9 | from crawler.crawler_sys.utils.trans_format import lst_to_csv
10 |
11 | hosts = '192.168.17.11'
12 | port = 80
13 | user_id = 'fangyucheng'
14 | password = 'VK0FkWf1fV8f'
15 | http_auth = (user_id, password)
16 | es_connection = Elasticsearch(hosts=hosts, port=port, http_auth=http_auth)
17 |
18 | unsolve_lst = []
19 | result_lst2 = []
20 |
21 | for line in task_list:  # task_list (dicts with 'url' and 'title') must be prepared beforehand; this script does not build it
22 | url = line['url']
23 | title = line['title']
24 | search_body = {"query": {"bool": {"filter": [{"term": {"title.keyword": title}}]}}}
25 | search = es_connection.search(index="test2", doc_type="fyc1210", body=search_body)
26 | if search["hits"]["total"] == 0:
27 | unsolve_lst.append(url)
28 | print("can not get video data at %s" % url)
29 | else:
30 | video_data = search["hits"]["hits"][0]["_source"]
31 | result_lst2.append(video_data)
32 | print("get playcount at %s" % url)
33 |
34 | lst_to_csv(listname=result_lst2,
35 | csvname="F:/add_target_releaser/Nov/Sep2.csv")
--------------------------------------------------------------------------------
/crawler_sys/tools/match_play_count_by_video_url.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Fri Nov 23 14:44:07 2018
4 |
5 | @author: fangyucheng
6 | """
7 |
8 | import elasticsearch.helpers
9 | from elasticsearch import Elasticsearch
10 | from crawler.crawler_sys.utils.trans_format import lst_to_csv
11 | from crawler.crawler_sys.utils.trans_format import str_file_to_lst
12 | from crawler.crawler_sys.utils.trans_format import str_lst_to_file
13 | #from crawler.crawler_sys.utils.trans_format import csv_to_lst_with_headline
14 |
15 | hosts = '192.168.17.11'
16 | port = 80
17 | user_id = 'fangyucheng'
18 | password = 'VK0FkWf1fV8f'
19 | http_auth = (user_id, password)
20 | es_connection = Elasticsearch(hosts=hosts, port=port, http_auth=http_auth)
21 |
22 |
23 | def init_task_list(file_path=None):
24 | task_list = []
25 | if file_path is None:
26 | es_scan = elasticsearch.helpers.scan(es_connection, index='album-play-count')
27 | for line in es_scan:
28 | video_dict = line['_source']
29 | task_list.append(video_dict)
30 | return task_list
31 | else:
32 | task_list = str_file_to_lst(file_path)
33 | return task_list
34 |
35 | unsolve_lst = []
36 | result_lst2 = []
37 |
38 |
39 | task_list = str_file_to_lst('F:/add_target_releaser/album_play_count/dec')
40 | #task_list = init_task_list()
41 |
42 |
43 | for line in task_list:
44 | try:
45 | if type(line) == dict:
46 | url = line['url']
47 | elif type(line) == str:
48 | url = line
49 | search_body = {"query": {"bool": {"filter": [{"term": {"url.keyword": url}}]}}}
50 | search = es_connection.search(index="test2", doc_type="dec", body=search_body)
51 | if search["hits"]["total"] == 0:
52 | unsolve_lst.append(url)
53 | print("can not get video data at %s" % url)
54 | else:
55 | video_data = search["hits"]["hits"][0]["_source"]
56 | result_lst2.append(video_data)
57 | print("get playcount at %s" % url)
58 | except:
59 | pass  # skip lines that fail to parse or query
60 |
61 | lst_to_csv(listname=result_lst2,
62 | csvname="F:/add_target_releaser/last_month/fix_play_count12242.csv")
63 | str_lst_to_file(unsolve_lst,
64 | filename="F:/add_target_releaser/last_month/unsolved")
65 |
--------------------------------------------------------------------------------
/crawler_sys/tools/ocr_by_aliyun.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # @Time : 2020/5/29 15:52
3 | # @Author : litao
4 |
5 |
6 | import os,re
7 | from aliyunsdkcore.client import AcsClient
8 | from aliyunsdkcore.acs_exception.exceptions import ClientException
9 | from aliyunsdkcore.acs_exception.exceptions import ServerException
10 | from aliyunsdkocr.request.v20191230.RecognizeCharacterRequest import RecognizeCharacterRequest
11 | from crawler.crawler_sys.framework.config.oss_keyword import AccessKeyId,AccessKeySecret
12 | import oss2
13 | from viapi.fileutils import FileUtils
14 | # NOTE: img_to_str and Languages used below are defined in the sibling tools/ocr_by_img.py
15 |
16 | endpoint = "oss-cn-beijing.aliyuncs.com"
17 | # region = "v-plus-scope.oss-cn-beijing.aliyuncs.com"
18 | region = "v-plus-scope"
19 | oss_url_expires = 3600
20 | access_key_id = AccessKeyId
21 | access_key_secret = AccessKeySecret
22 | bucket_name = "v-plus-scope"
23 | auth = oss2.Auth(access_key_id, access_key_secret)
24 | bucket = oss2.Bucket(auth, endpoint, bucket_name)
25 | client = AcsClient(AccessKeyId, AccessKeySecret, 'cn-shanghai')
26 |
27 |
28 | def put_obj(access_key_id, access_key_secret, region, bucket_name, object_name):
29 | fileobj_content = open(object_name, "rb")  # open the local image file that will be uploaded
30 | # oss2.set_file_logger(log_file_path, 'oss2', logging.ERROR)
31 | auth = oss2.Auth(access_key_id, access_key_secret)
32 | bucket = oss2.Bucket(auth, region, bucket_name)
33 | bucket.put_object(object_name, fileobj_content)
34 | file_utils = FileUtils(AccessKeyId, AccessKeySecret)
35 | oss_url = file_utils.get_oss_url("http://xxx.jpeg", "jpg", False)
36 | return oss_url
37 | def ocr_from_aliyun(file_name=""):
38 | request = RecognizeCharacterRequest()
39 | request.set_accept_format('json')
40 | path_name = "ocr_img/%s"%file_name
41 | oss_url = put_obj(AccessKeyId, AccessKeySecret, endpoint, region, path_name)
42 | request.set_ImageURL(oss_url)  # point the OCR request at the image just uploaded to OSS
43 | request.set_MinHeight(15)
44 | request.set_OutputProbability(True)
45 |
46 | response = client.do_action_with_exception(request)
47 | # python2: print(response)
48 | print(str(response, encoding='utf-8'))
49 |
50 |
51 | def file_path_scan(file_path):
52 | for filename in os.listdir(file_path):
53 | path = os.path.join(file_path, filename)
54 | if not os.path.isfile(path):
55 | continue
56 | title = img_to_str(path, lang=Languages.CHS)
57 | print(title)
58 | try:
59 | play_count = re.findall("\d+",title)[0]
60 | #print(play_count)
61 | except:
62 | #print(title)
63 | play_count= 0
64 | yield filename,play_count
65 |
66 |
67 | file_path = r'D:\work_file\word_file_new\litao\num'
68 | for filename,play_count in file_path_scan(file_path):
69 | time_str = filename.replace(".png","")
70 | time_str = time_str[0:13] +":"+ time_str[13:15]+":"+ time_str[15:]
71 | # print(time_str)
72 | print(time_str,play_count)
--------------------------------------------------------------------------------
/crawler_sys/tools/ocr_by_img.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # @Time : 2020/5/29 10:11
3 | # @Author : litao
4 |
5 | from PIL import Image
6 | import pytesseract
7 | import os
8 | import re
9 | import cv2
10 | import argparse
11 | # construct the argument parse and parse the arguments
12 | ap = argparse.ArgumentParser()
13 | ap.add_argument("-p", "--preprocess", type=str, default="thresh",
14 | help="type of preprocessing to be done")
15 | args = vars(ap.parse_args())
16 | class Languages:
17 | CHS = 'chi_sim'
18 | ENG = 'eng'
19 |
20 | def img_to_str(image_path, lang=Languages.CHS):
21 | # img = Image.open(image_path)
22 | # width, height = img.size
23 | # img.show()
24 | # mode = img.mode
25 |
26 | # print(img.size)
27 | # thumb = img.crop((10,42,160,150))
28 | # img.grab(0,0,250,200)
29 | # thumb.save("thumb.jpg")
30 | # image = cv2.imread(image_path)
31 | # gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
32 | # # check to see if we should apply thresholding to preprocess the
33 | # # image
34 | # if args["preprocess"] == "thresh":
35 | # gray = cv2.threshold(gray, 0, 255,
36 | # cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
37 | # # make a check to see if median blurring should be done to remove
38 | # # noise
39 | # elif args["preprocess"] == "blur":
40 | # gray = cv2.medianBlur(gray, 3)
41 | # # write the grayscale image to disk as a temporary file so we can
42 | # # apply OCR to it
43 | # filename = "thumb.png"
44 | # cv2.imwrite(filename, gray)
45 | # thumb = img.crop((40, 30, 100, 70))
46 | #img.grab((30, 30, 150, 80))
47 | # thumb.save("thumb.jpg")
48 | # ,config="-psm 7 digits"
49 | img = Image.open(image_path)
50 | # thumb = img.crop((10,42,160,150))
51 | # thumb = img.crop((40, 30, 100, 70))
52 | thumb = img.crop((490, 0, 560, 60))
53 | thumb.save("thumb.jpg")
54 | return pytesseract.image_to_string(thumb, lang,config="-psm 7 digits")
55 |
56 | def file_path_scan(file_path):
57 | for filename in os.listdir(file_path):
58 | path = os.path.join(file_path, filename)
59 | if not os.path.isfile(path):
60 | continue
61 | title = img_to_str(path, lang=Languages.CHS)
62 | print(title)
63 | try:
64 | play_count = re.findall("\d+",title)[0]
65 | #print(play_count)
66 | except:
67 | #print(title)
68 | play_count= 0
69 | yield filename,play_count
70 |
71 |
72 | file_path = r'D:\work_file\word_file_new\litao\num'
73 | for filename,play_count in file_path_scan(file_path):
74 | time_str = filename.replace(".png","")
75 | time_str = time_str[0:13] +":"+ time_str[13:15]+":"+ time_str[15:]
76 | # print(time_str)
77 | print(time_str,play_count)
78 |
79 | # print(img_to_str(r'D:\work_file\word_file_new\litao\screen\2020-04-16 202632.png', lang=Languages.CHS))
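
A quick worked example of the filename-to-timestamp slicing above, assuming screenshots are named like the commented '2020-04-16 202632.png':

time_str = "2020-04-16 202632.png".replace(".png", "")
time_str = time_str[0:13] + ":" + time_str[13:15] + ":" + time_str[15:]
print(time_str)  # -> 2020-04-16 20:26:32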
--------------------------------------------------------------------------------
/crawler_sys/tools/tudou_selenium.py:
--------------------------------------------------------------------------------
1 | from selenium import webdriver
2 | import datetime
3 | import re
4 | import pandas as pd
5 |
6 |
7 | class Craler_tudou(object):
8 | def __init__(self):
9 | chrome_options = webdriver.ChromeOptions()
10 | prefs = {"profile.managed_default_content_settings.images": 2}
11 | chrome_options.add_experimental_option("prefs", prefs)
12 | self.driver = webdriver.Chrome(chrome_options=chrome_options)
13 |
14 |
15 | @staticmethod
16 | def video_time(time_str):
17 | now = datetime.datetime.now()
18 | if "分钟前" in time_str:
19 | min_str = re.findall(r"(\d+)分钟前",time_str)[0]
20 | videotime = now - datetime.timedelta(minutes=int(min_str))
21 | elif "小时前" in time_str:
22 | hour_str = re.findall(r"(\d+)小时前", time_str)[0]
23 | videotime = now - datetime.timedelta(hours=int(hour_str))
24 | elif "昨天" in time_str:
25 | date_lis = time_str.split(" ")
26 | hours, mins = date_lis[1].split(":")
27 | last_day = now - datetime.timedelta(days=1)
28 | videotime = datetime.datetime(year=int(last_day.year), month=int(last_day.month), day=int(last_day.day), hour=int(hours), minute=int(mins))
29 | elif "前天" in time_str:
30 | date_lis = time_str.split(" ")
31 | hours, mins = date_lis[1].split(":")
32 | last_day = now - datetime.timedelta(days=2)
33 | videotime = datetime.datetime(year=int(last_day.year), month=int(last_day.month), day=int(last_day.day), hour=int(hours), minute=int(mins))
34 | elif "天前" in time_str:
35 | day_str = re.findall(r"(\d+)天前", time_str)[0]
36 | videotime = now - datetime.timedelta(days=int(day_str))
37 | elif "刚刚" in time_str:
38 | videotime = now
39 | else:
40 | if str(now.year) in time_str:
41 | videotime = datetime.datetime.strptime(time_str, "%Y-%m-%d %H:%M")  # full-date form (assumed "YYYY-MM-DD HH:MM"), so videotime is always set
42 | else:
43 | date_lis = time_str.split(" ")
44 | month,days = date_lis[0].split("-")
45 | hours,mins = date_lis[1].split(":")
46 | videotime = datetime.datetime(year=int(now.year),month=int(month),day=int(days),hour=int(hours),minute=int(mins))
47 |
48 | # print(videotime.strftime("%Y-%m-%d %H:%M:%S"))
49 | return videotime
50 |
51 | def time_range_video_num(self,start_time,end_time,url_list):
52 | data_lis = []
53 | info_lis = []
54 | columns = [""]
55 | for dic in url_list:
56 | for res in self.get_page(dic["url"]):
57 | title,link,video_time = res
58 | print(res)
59 | if start_time < video_time < end_time:
60 | data_lis.append((title,link,video_time,dic["url"]))
61 | else:
62 | break
63 | csv_save = pd.DataFrame(data_lis)
64 | csv_save.to_csv("%s.csv" % (dic["platform"] + "_" + dic["releaser"]),encoding="GBK")
65 | info_lis.append([dic["platform"],dic["releaser"],len(data_lis)])
66 | data_lis = []
67 | csv_save = pd.DataFrame(info_lis)
68 | csv_save.to_csv("%s.csv" % (datetime.datetime.now().strftime("%Y-%m-%d %H-%M-%S")), encoding="GBK")
69 | self.driver.quit()
70 |
71 | def get_page(self,url):
72 | #video_page = self.driver.get(url)
73 | # js = 'window.open("%s");' % url
74 | # self.driver.execute_script(js)
75 | page_num = -1
76 | try:
77 | video_page = self.driver.get(url)
78 | while True:
79 | page_num += 1
80 | if page_num != 0:
81 | self.driver.find_element_by_class_name("next").click()
82 | video_lis = self.driver.find_elements_by_xpath("/html/body/div[2]/div/div[3]/div/div/div/div[2]/div/div/div/div[1]/div")
83 | for v in video_lis:
84 | v_a = v.find_element_by_xpath("./div[2]/a")
85 | title = v_a.get_attribute("title")
86 | link = v_a.get_attribute("href")
87 | video_time = self.video_time(v.find_element_by_class_name("v-publishtime").text)
88 |
89 | yield (title,link,video_time)
90 |
91 | except Exception as e:
92 | # clicking "next" past the last page raises here; swallow it so the generator just ends
93 | print(e)
94 | print("page %s has no more data" % page_num)
95 |
96 | if __name__ == "__main__":
97 | test = Craler_tudou()
98 | url_lis = [
99 | {"platform":"new_tudou",
100 | "url":"https://id.tudou.com/i/UNTk2NjE0MDM4NA==/videos?",
101 | "releaser":"酷娱文化先锋"
102 | },
103 | {"platform": "new_tudou",
104 | "url": "https://id.tudou.com/i/UMTQ3MDM0MjAw/videos?",
105 | "releaser": "酷娱文化先锋"
106 | }]
107 | start_time = datetime.datetime(year=2019,month=6,day=6)
108 | end = datetime.datetime.now()
109 | test.time_range_video_num(start_time,end,url_lis)
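
A quick check of the relative-time parsing in Craler_tudou.video_time ('3小时前' means '3 hours ago'); video_time is a staticmethod, so no browser is launched:

t = Craler_tudou.video_time("3小时前")
print(t.strftime("%Y-%m-%d %H:%M:%S"))  # roughly three hours before now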
--------------------------------------------------------------------------------
/crawler_sys/tools/video_num_count.rar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_sys/tools/video_num_count.rar
--------------------------------------------------------------------------------
/crawler_sys/tools/video_num_count/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /workspace.xml
--------------------------------------------------------------------------------
/crawler_sys/tools/video_num_count/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/crawler_sys/tools/video_num_count/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/crawler_sys/tools/video_num_count/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/crawler_sys/tools/video_num_count/.idea/video_num_count.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
--------------------------------------------------------------------------------
/crawler_sys/tools/video_num_count/2019-09-10.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_sys/tools/video_num_count/2019-09-10.csv
--------------------------------------------------------------------------------
/crawler_sys/tools/video_num_count/count.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_sys/tools/video_num_count/count.csv
--------------------------------------------------------------------------------
/crawler_sys/tools/video_num_count/count_main.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # @Time : 2019/6/11 15:26
3 | # @Author : litao
4 | import datetime
5 |
6 | from crawler.haokan_count import *
7 | from crawler.qq_video_count import *
8 | from crawler.toutiao_count import *
9 | from crawler.tudou_count import *
10 | from crawler.crawler_wangyi_news import *
11 | from crawler.crawler_tencent_news import *
12 | from concurrent.futures import ProcessPoolExecutor
13 |
14 | craler_site = {
15 | "haokan": Craler_haokan(),
16 | "toutiao": Craler_toutiao(),
17 | "腾讯视频": Craler_qq(),
18 | "new_tudou": Craler_tudou(),
19 | "网易新闻": Crawler_wangyi_news(),
20 | "腾讯新闻": Crawler_Tencent_News()
21 | }
22 |
23 |
24 | def start_count(releaser, platform, releaserUrl, re_s_t, re_e_t):
25 | craler = craler_site.get(platform)
26 | if craler:
27 | url_lis = [
28 | {
29 | "platform": platform,
30 | "url": releaserUrl,
31 | "releaser": releaser
32 | }
33 | ]
34 | craler.time_range_video_num(re_s_t, re_e_t, url_lis)
35 |
36 |
37 | if __name__ == "__main__":
38 | miaopai_list = []
39 | platform_dic = {
40 | "haokan": [],
41 | "toutiao": [],
42 | "腾讯视频": [],
43 | "new_tudou": [],
44 | "网易新闻": [],
45 | "腾讯新闻":[]
46 | }
47 | file = r'count.csv'
48 | #file = r'D:\wxfile\WeChat Files\litaolemo\FileStorage\File\2019-07\count(3).csv'
49 | now = int(datetime.datetime.now().timestamp() * 1e3)
50 | executor = ProcessPoolExecutor(max_workers=6)
51 | futures = []
52 | with open(file, 'r', encoding="gb18030")as f:
53 | header_Lst = f.readline().strip().split(',')
54 | for line in f:
55 | line_Lst = line.strip().split(',')
56 | line_dict = dict(zip(header_Lst, line_Lst))
57 | releaser = line_dict['releaser']
58 | platform = line_dict['platform']
59 | releaserUrl = line_dict['releaserUrl']
60 | re_s_t = line_dict['开始时间']  # "start time" column
61 | re_e_t = line_dict['结束时间']  # "end time" column
62 | start_time_lis = re_s_t.split("/")
63 | end_time_lis = re_e_t.split("/")
64 | start_time_stamp = int(datetime.datetime(year=int(start_time_lis[0]), month=int(start_time_lis[1]),
65 | day=int(start_time_lis[2])).timestamp() * 1e3)
66 | end_time__stamp = int(datetime.datetime(year=int(end_time_lis[0]), month=int(end_time_lis[1]),
67 | day=int(end_time_lis[2])).timestamp() * 1e3)
68 | # future = executor.submit(start_count, releaser, platform, releaserUrl, start_time_stamp, end_time__stamp)
69 | # futures.append(future)
70 | start_count(releaser, platform, releaserUrl, start_time_stamp, end_time__stamp)
71 | executor.shutdown(True)
72 | print('+++>完成')  # "完成" = "done"
73 | # for future in futures:
74 | # print(future.result())
75 |
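
For reference, a minimal count.csv that the loop above could parse (the releaser and releaserUrl values are hypothetical); the two date columns use "/" because the loop splits them on it:

sample = (
    "releaser,platform,releaserUrl,开始时间,结束时间\n"
    "看看新闻Knews,haokan,https://haokan.baidu.com/haokan/wiseauthor?app_id=1234567890,2019/6/6,2019/9/10\n"
)
with open("count.csv", "w", encoding="gb18030") as f:
    f.write(sample)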
--------------------------------------------------------------------------------
/crawler_sys/tools/video_num_count/crawler/2019-09-10.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_sys/tools/video_num_count/crawler/2019-09-10.csv
--------------------------------------------------------------------------------
/crawler_sys/tools/video_num_count/crawler/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # @Time : 2019/6/11 15:59
3 | # @Author : litao
4 |
--------------------------------------------------------------------------------
/crawler_sys/tools/video_num_count/crawler/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_sys/tools/video_num_count/crawler/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/crawler_sys/tools/video_num_count/crawler/__pycache__/crawler_tencent_news.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_sys/tools/video_num_count/crawler/__pycache__/crawler_tencent_news.cpython-37.pyc
--------------------------------------------------------------------------------
/crawler_sys/tools/video_num_count/crawler/__pycache__/crawler_wangyi_news.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_sys/tools/video_num_count/crawler/__pycache__/crawler_wangyi_news.cpython-37.pyc
--------------------------------------------------------------------------------
/crawler_sys/tools/video_num_count/crawler/__pycache__/func_get_releaser_id.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_sys/tools/video_num_count/crawler/__pycache__/func_get_releaser_id.cpython-37.pyc
--------------------------------------------------------------------------------
/crawler_sys/tools/video_num_count/crawler/__pycache__/haokan_count.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_sys/tools/video_num_count/crawler/__pycache__/haokan_count.cpython-37.pyc
--------------------------------------------------------------------------------
/crawler_sys/tools/video_num_count/crawler/__pycache__/qq_video_count.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_sys/tools/video_num_count/crawler/__pycache__/qq_video_count.cpython-37.pyc
--------------------------------------------------------------------------------
/crawler_sys/tools/video_num_count/crawler/__pycache__/toutiao_count.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_sys/tools/video_num_count/crawler/__pycache__/toutiao_count.cpython-37.pyc
--------------------------------------------------------------------------------
/crawler_sys/tools/video_num_count/crawler/__pycache__/trans_duration_str_to_second.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_sys/tools/video_num_count/crawler/__pycache__/trans_duration_str_to_second.cpython-37.pyc
--------------------------------------------------------------------------------
/crawler_sys/tools/video_num_count/crawler/__pycache__/trans_strtime_to_timestamp.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_sys/tools/video_num_count/crawler/__pycache__/trans_strtime_to_timestamp.cpython-37.pyc
--------------------------------------------------------------------------------
/crawler_sys/tools/video_num_count/crawler/__pycache__/tudou_count.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_sys/tools/video_num_count/crawler/__pycache__/tudou_count.cpython-37.pyc
--------------------------------------------------------------------------------
/crawler_sys/tools/video_num_count/crawler/haokan_青春旅社.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_sys/tools/video_num_count/crawler/haokan_青春旅社.csv
--------------------------------------------------------------------------------
/crawler_sys/tools/video_num_count/crawler/trans_duration_str_to_second.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Mon Sep 10 15:04:04 2018
4 |
5 | @author: fangyucheng
6 | """
7 |
8 | def trans_duration(duration_str):
9 | """suitable for 20:20, 20:20:10"""
10 | duration_lst = duration_str.split(':')
11 | if len(duration_lst) == 3:
12 | duration = int(int(duration_lst[0]) * 3600 + int(duration_lst[1]) * 60 + int(duration_lst[2]))
13 | return duration
14 | elif len(duration_lst) == 2:
15 | duration = int(int(duration_lst[0]) * 60 + int(duration_lst[1]))
16 | return duration
17 |
--------------------------------------------------------------------------------
/crawler_sys/tools/video_num_count/crawler/首次运行.bat:
--------------------------------------------------------------------------------
1 | python -m pip install --upgrade pip
2 | python -m pip install xlutils
3 | python -m pip install requests
4 | python -m pip install bs4
5 | python -m pip install pandas
--------------------------------------------------------------------------------
/crawler_sys/tools/video_num_count/haokan_看看新闻Knews.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_sys/tools/video_num_count/haokan_看看新闻Knews.csv
--------------------------------------------------------------------------------
/crawler_sys/tools/write_feihua_json_to_es.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import json
3 | import datetime
4 | import elasticsearch
5 | from write_data_into_es.func_cal_doc_id import *
6 | hosts = '192.168.17.11'
7 | port = 80
8 | user = 'zhouyujiang'
9 | passwd = '8tM9JDN2LVxM'
10 | http_auth = (user, passwd)
11 | es = elasticsearch.Elasticsearch(hosts=hosts, port=port, http_auth=http_auth)
12 | from write_data_into_es.func_get_releaser_id import get_releaser_id
13 |
14 | def dic():
15 | url_data = []
16 | format_data = {}
17 | f = open("E:\M_2019-05-22_U_3", "r")
18 | for i in f.readlines():
19 | url_data.append(json.loads(i))
20 | for i in url_data:
21 | format_data[i["url"]] = ""
22 | print(len(format_data))
23 |
24 | def get_data():
25 | url_data = []
26 | format_data = []
27 | # url = "https://enlightent-backup.oss-cn-beijing.aliyuncs.com/csm/20190318-20190324/csm_douyin_20190318_20190324.json?Expires=1554243367&OSSAccessKeyId=TMP.AQG2JUd3g4Gv66npoCNJPVnH-r9yRqhMGwqJtilxiBCDsbRJJ4kTuiE_T17CMC4CFQC8gXq7WHE73SSE9s2DjpWzF7Y2TwIVAIeJz9r0QHkaPi8FGyzN1TXmsjvn&Signature=XsHnMu%2B4agHS6Z6tq%2B55WWaZjDk%3D"
28 | # res = requests.get(url)
29 | # with open("./url_json.json","w") as f:
30 | # f.write(res.text)
31 |
32 | # f = open("E:\M_2019-05-22_U_3", "r")
33 | # for i in f.readlines():
34 | # url_data.append(json.loads(i))
35 | for i in url_data:
36 | print(i)
37 | format_data.append(
38 | {
39 | "platform": i["platform"],
40 | "duration": i["duration"],
41 | "favorite_count": i["favorite"],
42 | "fetch_time": int(i["crawledtime"])*1000,
43 | "play_count": i["playtimes"],
44 | "release_time": i["releasetime"],
45 | "releaser": i["releaser"],
46 | "title": i["title"],
47 | "url": i["url"],
48 | "comment_count": i["commentnum"],
49 | "dislike_count": 0,
50 | "isOriginal": False,
51 | "releaserUrl": i["releaserurl"],
52 | "repost_count": 0,
53 | "timestamp": int(datetime.datetime.timestamp(datetime.datetime.now()))*1000,
54 | "data_provider": "fhtech",
55 | "channel": i["channel"],
56 | "releaser_id_str":"miaopai_" + get_releaser_id(platform="miaopai",releaserUrl=i["releaserurl"])
57 | }
58 | )
59 | return format_data
60 | # target_date_list = target_type.split('-')
61 | # target_date_start = datetime.datetime(int(target_date_list[-3]), int(target_date_list[-2]), 1)
62 | # target_date_end = datetime.datetime(int(target_date_list[-3]), int(target_date_list[-2]) + 1, 1)
63 | # target_ts_start = int(target_date_start.timestamp()) * 1000
64 | # target_ts_end = int(target_date_end.timestamp()) * 1000
65 | # print(target_ts_start)
66 | # print(target_ts_end)
67 |
68 |
69 | def write_es(file):
70 | count = 0
71 | bulk_all_body = ""
72 | doc_id_type = "all-time-url"
73 | for i in file:
74 | #print(i)
75 | # format_i = {}
76 | # a = "format_i = %s" % i
77 | # exec(a,format_i)
78 | format_i = json.loads(i)
79 | # format_i = format_i["format_i"]
80 | # print(format_i)
81 | try:
82 | _id = cal_doc_id(platform=format_i["platform"], url=format_i["url"], doc_id_type=doc_id_type,data_dict=format_i)
83 | format_i["timestamp"] = int(datetime.datetime.now().timestamp()*1e3)
84 | if len(str(format_i["release_time"])) != 13:
85 | print(format_i["release_time"])
86 | format_i["release_time"] = int(format_i["release_time"] / 1000)
87 | format_i["releaser_id_str"] = "miaopai_" + get_releaser_id(platform="miaopai",releaserUrl=format_i["releaserUrl"])
88 | bulk_head = '{"index": {"_id":"%s"}}' % _id
89 | except Exception as e:
90 | print(e)
91 | continue
92 | # find_exist = {
93 | # "query": {
94 | # "bool": {
95 | # "filter": [
96 | # {"term": {"_id":_id }}
97 | # ]
98 | # }
99 | # }
100 | # }
101 | # search_re = es.search(index=target_index, doc_type=target_type,
102 | # body=find_exist)
103 | # if search_re['hits']['total'] == 1:
104 | # if counti % 1000 == 0:
105 | # print("done ", counti,"\n")
106 | # return None
107 | # else:
108 | # pass
109 |
110 | data_str = json.dumps(format_i, ensure_ascii=False)
111 | bulk_one_body = bulk_head + '\n' + data_str + '\n'
112 | bulk_all_body += bulk_one_body
113 | count += 1
114 | print("find",count)
115 | if count % 1000 == 0:
116 | eror_dic = es.bulk(index=target_index, doc_type=target_type,
117 | body=bulk_all_body, request_timeout=200)
118 | bulk_all_body = ''
119 | if eror_dic['errors'] is True:
120 | print(eror_dic['items'])
121 | print(bulk_all_body)
122 | print(count)
123 |
124 | if bulk_all_body != '':
125 | eror_dic = es.bulk(body=bulk_all_body,
126 | index=target_index,
127 | doc_type=target_type,
128 | request_timeout=200)
129 | if eror_dic['errors'] is True:
130 | print(eror_dic)
131 | bulk_all_body = ''
132 | #print(platform, releaser, 'end_have:', len(wirte_set), 'add:', len(set_url))
133 |
134 |
135 | if __name__ == '__main__':
136 | target_index = 'short-video-all-time-url'
137 | target_type = 'all-time-url'
138 | m3 = open(r"C:\Users\litao\Desktop\csv\202002\M_2020-02-04_U_3", "r", encoding="utf-8")
139 | # f = open("exists", "a+")
140 | write_es(m3)
141 |
--------------------------------------------------------------------------------
/crawler_sys/tools/头条重复数据导出.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Thu Feb 14 16:12:57 2019
4 |
5 | @author: zhouyujiang
6 |
7 | Find records in this slice whose toutiao releaser + release_time + duration are identical
8 | """
9 |
10 | import pandas as pd
11 | import datetime
12 | import elasticsearch
13 | from elasticsearch.helpers import scan
14 | from crawler_url_video_info import get_target_video_info
15 | hosts='192.168.17.11'
16 | port=80
17 | user='zhouyujiang'
18 | passwd='8tM9JDN2LVxM'
19 | http_auth=(user, passwd)
20 | es=elasticsearch.Elasticsearch(hosts=hosts, port=port, http_auth=http_auth)
21 |
22 |
23 |
24 | zyj_set = set()
25 | zyj_dict = {}
26 | index = 'short-video-weekly'
27 | doc_type = 'daily-url-2019_w07_s1'
28 | re_s_t = 1549728000000
29 | re_e_t = 1550332800000
30 | count = 0
31 | scan_body = {
32 | "query": {
33 | "bool": {
34 | "filter": [
35 | {"term": {"platform.keyword": 'toutiao'}},
36 | {"range": {"release_time": {"gte": re_s_t,"lt":re_e_t}}}
37 | ]
38 | }
39 | }
40 | }
41 | scan_re = scan(client=es, index=index, doc_type=doc_type,
42 | query=scan_body, scroll='3m')
43 | for one in scan_re:
44 | count = count +1
45 | if count %1000 == 0:
46 | print(count)
47 | line = one['_source']
48 | releaser = line['releaser']
49 | release_time = line['release_time']
50 | duration = line['duration']
51 | zyj_id = releaser + str(release_time) + str(duration)
52 | if zyj_id not in zyj_dict:
53 | zyj_dict[zyj_id] = []
54 | zyj_dict[zyj_id].append(line)
55 | else:
56 | zyj_set.add(zyj_id)
57 | zyj_dict[zyj_id].append(line)
58 | re_list = []
59 | for one_key in zyj_set:
60 | for one_value in zyj_dict[one_key]:
61 | # url = one_value['url']
62 | # new_playcount = get_target_video_info(url=url, platform='toutiao')
63 | # one_value['new_playcount'] = new_playcount
64 | re_list.append(one_value)
65 |
66 |
67 |
68 |
69 |
70 | data = pd.DataFrame(re_list)
71 | data.to_csv('头条7zhou重复数据重新抓取播放量.csv')  # filename: "toutiao week-7 duplicates, re-crawled play counts"
72 |
73 |
--------------------------------------------------------------------------------
/crawler_sys/utils/__init__ .py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Tue June 6 18:02:43 2018
4 |
5 | @author: hanye
6 | """
7 |
8 |
--------------------------------------------------------------------------------
/crawler_sys/utils/connect_with_es.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Wed Jun 6 18:18:09 2018
4 |
5 | @author: hanye
6 | """
7 | #import redis
8 | #from crawler_sys.framework.platform_redis_register import get_redis_list_name
9 | from crawler.crawler_sys.framework.es_crawler import scan_crawler_url_register
10 |
11 | #rds = redis.StrictRedis(host='192.168.17.60', port=6379, db=0)
12 |
13 | def pull_url_from_es(platform, release_time_lower_bdr=None):
14 | """
15 | Just pull urls from es index crawler-url-register.
16 | Url reforming will be done by the method that is
17 | responsible for pushing urls into redis.
18 | Just return url and its platform
19 | """
20 | if release_time_lower_bdr is None:
21 | release_time_lower_bdr = 0
22 | else:
23 | pass
24 | search_body = {"query": {"bool": {"filter": [{"range": {"release_time":
25 | {"gte": release_time_lower_bdr}}},
26 | {"term": {"platform.keyword": platform}}]}}}
27 | total_hit, scan_resp = scan_crawler_url_register(search_body)
28 | batch_url_Lst = []
29 | if total_hit > 0:
30 | line_counter = 0
31 | for line in scan_resp:
32 | line_counter += 1
33 | line_d = line['_source']
34 | url = line_d['url']
35 | batch_url_Lst.append(url)
36 | else:
37 | pass
38 | return batch_url_Lst
39 |
40 |
41 | #def url_reformer(platform, url):
42 | # """
43 | # to reform url according to platform, in the future.
44 | # Say, a url of http://www.toutiao.com/group/1234567890123456789
45 | # as a string is different from http://www.365yg.com/u/1234567890123456789,
46 | # but they point to the same resource. They should be reformed
47 | # to one unique url before pushing into redis for futher crawling.
48 | # """
49 | # reformed_url = url
50 | # return reformed_url
51 | #
52 | #def feed_url_into_redis(dict_Lst, platform,
53 | # release_time_lower_bdr=None,
54 | # batch_str=None):
55 | # """
56 | # release_time_lower_bdr must be an int value represent
57 | # timestamp in milliseconds if given.
58 | # All url that is released before release_time_lower_bdr
59 | # will not be pushed into redis. If argument release_time_lower_bdr
60 | # is not given when call this function, all urls will be
61 | # pushed into redis.
62 | # """
63 | # redis_list_name = get_redis_list_name(platform, batch_str)
64 | # if redis_list_name is None:
65 | # print('Failed to get correct redis list name '
66 | # 'in platform_redis_register for platform: '
67 | # % platform)
68 | # return (None, None)
69 | # else:
70 | # print('Feeding url into redis list %s ...' % redis_list_name)
71 | # url_counter = 0
72 | # for data_dict in dict_Lst:
73 | # try:
74 | # url = data_dict['url']
75 | # url_reformed = url_reformer(platform, url)
76 | # if release_time_lower_bdr is None:
77 | # sadd_c = rds.sadd(redis_list_name, url_reformed)
78 | # url_counter += sadd_c
79 | # else:
80 | # url_release_time = data_dict['release_time']
81 | # if url_release_time >= release_time_lower_bdr:
82 | # sadd_c = rds.sadd(redis_list_name, url_reformed)
83 | # url_counter += sadd_c
84 | # except:
85 | # print('Failed to push url into redis, '
86 | # 'might because of lack of url field '
87 | # 'or lack of release_time field, or '
88 | # 'has wrong typed release_time value. '
89 | # 'The failed data dict is: \n %s' % data_dict)
90 | # print('Pushed %d urls into redis' % url_counter)
91 | # return (redis_list_name, url_counter)
92 |
93 |
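
A hedged usage sketch of pull_url_from_es above: fetch all registered toutiao urls released in roughly the last 30 days (release_time_lower_bdr is a millisecond timestamp):

import datetime

lower_bdr = int((datetime.datetime.now() - datetime.timedelta(days=30)).timestamp() * 1e3)
toutiao_urls = pull_url_from_es('toutiao', release_time_lower_bdr=lower_bdr)
print(len(toutiao_urls))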
--------------------------------------------------------------------------------
/crawler_sys/utils/date_calculator.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Wed Nov 21 09:08:29 2018
4 |
5 | @author: fangyucheng
6 | """
7 |
8 | import datetime
9 |
10 | def calculator(shifting_days=30,
11 | shifting_hours=0,
12 | shifting_minutes=0):
13 | now = datetime.datetime.now()
14 | if shifting_hours == 0 and shifting_minutes == 0 and shifting_days != 0:
15 | date_shift = now - datetime.timedelta(days=shifting_days)
16 | date_shift_str = str(date_shift)[:10]
17 | date_wanted = datetime.datetime.strptime(date_shift_str,
18 | "%Y-%m-%d").timestamp() * 1e3
19 | return int(date_wanted)
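
E.g. calling calculator above with the defaults returns the millisecond timestamp of local midnight 30 days ago:

print(calculator(shifting_days=30))  # a 13-digit millisecond timestamp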
--------------------------------------------------------------------------------
/crawler_sys/utils/delete_repeat_data.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Sun Sep 30 09:36:55 2018
4 |
5 | @author: fangyucheng
6 | """
7 |
8 |
9 | import pymysql
10 | from crawler_sys.utils.write_into_database import write_lst_into_database
11 |
12 | connection = pymysql.connect(host='localhost',
13 | user='root',
14 | passwd='goalkeeper@1',
15 | db='proxy_pool',
16 | port=3306,
17 | cursorclass=pymysql.cursors.DictCursor)
18 | cursor = connection.cursor()
19 |
20 | search_sql = "select * from proxy_pool"
21 | cursor.execute(search_sql)
22 |
23 | test_lst = cursor.fetchall()
24 |
25 | new_lst = []
26 | ip_lst = []
27 | for line in test_lst:
28 | if line['ip_address'] not in ip_lst:
29 | new_lst.append(line)
30 | ip_lst.append(line['ip_address'])
31 |
32 | delete_sql = "delete from proxy_pool where id >= 1"
33 | cursor.execute(delete_sql)
34 | connection.commit()
35 |
36 | write_lst_into_database(data_lst=new_lst,
37 | table_name='proxy_pool',
38 | host='localhost',
39 | passwd='goalkeeper@1')
--------------------------------------------------------------------------------
/crawler_sys/utils/extract_data_from_es.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Mon Oct 8 17:14:16 2018
4 |
5 | @author: fangyucheng
6 | """
7 |
8 | import elasticsearch
9 |
10 |
11 | hosts = '192.168.17.11'
12 | port = 80
13 | user_id = 'fangyucheng'
14 | password = 'VK0FkWf1fV8f'
15 | http_auth = (user_id, password)
16 | lose_re_url = []
17 | es = elasticsearch.Elasticsearch(hosts=hosts, port=port, http_auth=http_auth)
18 |
19 | search_body = {"query": {
20 | "bool": {
21 | "filter": [
22 | {"term": {"platform.keyword": "new_tudou"}},
23 | {"term": {"post_by.keyword": "zhangqiongzi"}}
24 | ]
25 | }
26 | }
27 | }
28 |
29 |
30 | get_tr = es.search(index='target_releasers', body=search_body, size=200)
31 |
32 | result_lst = []
33 |
34 | for line in get_tr['hits']['hits']:
35 | result_lst.append(line['_source'])
--------------------------------------------------------------------------------
/crawler_sys/utils/fakechrome.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Thu Aug 16 17:12:37 2018
4 |
5 | @author: fangyucheng
6 | """
7 |
8 | import random
9 |
10 | first_num = random.randint(55, 62)
11 | third_num = random.randint(0, 3200)
12 | fourth_num = random.randint(0, 140)
13 |
14 | class Fakechrome:
15 | os_type = ['(Windows NT 6.1; WOW64)', '(Windows NT 10.0; WOW64)', '(X11; Linux x86_64)',
16 | '(Macintosh; Intel Mac OS X 10_12_6)']
17 |
18 | chrome_version = 'Chrome/{}.0.{}.{}'.format(first_num, third_num, fourth_num)
19 |
20 | @classmethod
21 | def get_ua(cls):
22 | return ' '.join(['Mozilla/5.0', random.choice(cls.os_type), 'AppleWebKit/537.36',
23 | '(KHTML, like Gecko)', cls.chrome_version, 'Safari/537.36'])
24 |
25 | headers = {'User-Agent': Fakechrome.get_ua(),
26 | 'Accept-Encoding': 'gzip, deflate, sdch',
27 | 'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
28 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
29 | 'Connection': 'keep-alive'}
30 |
--------------------------------------------------------------------------------
/crawler_sys/utils/fix_album_play_count_by_releaser_page.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Fri Nov 23 14:44:07 2018
4 |
5 | @author: fangyucheng
6 | """
7 |
8 |
9 | from elasticsearch import Elasticsearch
10 | from crawler.crawler_sys.utils.trans_format import lst_to_csv
11 |
12 |
13 | hosts = '192.168.17.11'
14 | port = 80
15 | user_id = 'fangyucheng'
16 | password = 'VK0FkWf1fV8f'
17 | http_auth = (user_id, password)
18 | es_connection = Elasticsearch(hosts=hosts, port=port, http_auth=http_auth)
19 |
20 | unsolve2_lst = []
21 | result_lst2 =[]
22 |
23 | for url in unsolve_lst:  # unsolve_lst must be prepared beforehand (e.g. by a match_play_count_by_video_url run)
24 | search_body = {"query": {"bool": {"filter": [{"term": {"url.keyword": url}}]}}}
25 | search = es_connection.search(index="test2", doc_type="fyc1123", body=search_body)
26 | if search["hits"]["total"] == 0:
27 | unsolve2_lst.append(url)
28 | print("can not get video data at %s" % url)
29 | else:
30 | video_data = search["hits"]["hits"][0]["_source"]
31 | result_lst2.append(video_data)
32 | print("get playcount at %s" % url)
33 |
34 | lst_to_csv(listname=result_lst2,
35 | csvname="F:/add_target_releaser/Nov/get_playcount_by_releaser2.csv")
--------------------------------------------------------------------------------
/crawler_sys/utils/get_toutiao_as_cp_signature.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Wed May 23 12:37:48 2018
4 |
5 | @author: fangyucheng
6 | """
7 |
8 | import requests
9 | import js2py  # needed by as_cp and signature below
10 | import hashlib
11 |
12 |
13 | def as_cp(user_id, max_behot_time):
14 | as_cp = []
15 | t = js2py.eval_js('var t = Math.floor((new Date).getTime() / 1e3)')
16 | i = js2py.eval_js('var t = Math.floor((new Date).getTime() / 1e3),i = t.toString(16).toUpperCase()')
17 | e = hashlib.md5(str(t).encode('utf-8')).hexdigest()
18 | if len(i) != 8:
19 | var_as = "479BB4B7254C150"
20 | cp = "7E0AC8874BB0985"
21 | else:
22 | e = e.upper()
23 | s = e[0:5]
24 | o = e[-5:]
25 |
26 | n = ''
27 | a = 0
28 | while a < 5:
29 | n = n + s[a] + i[a]
30 | a += 1
31 |
32 | l = ''
33 | r = 0
34 | while r < 5:
35 | l = l + i[r+3] + o[r]
36 | r = r + 1
37 | var_as = 'A1' + n + i[-3:]
38 | cp = i[0:3] + l + 'E1'
39 |
40 | as_cp.append(var_as)
41 | as_cp.append(cp)
42 | return as_cp
43 |
44 |
45 | def signature(user_id, max_behot_time):
46 | jsurl = 'https://s3.pstatp.com/toutiao/resource/ntoutiao_web/page/profile/index_f62209a.js'
47 | get_page = requests.get(jsurl)
48 | get_page.encoding = 'utf-8'
49 | page = get_page.text
50 | effect_js = page.split('Function')
51 | js_1 = ('var navigator = {"userAgent":"Mozilla/5.0 (Windows NT 6.1; WOW64) '
52 | 'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36"};')
53 | js_2 = 'Function' + effect_js[3]
54 | js_3 = 'Function' + effect_js[4]
55 | js_4 = ';function result(){ return TAC.sign('+user_id+''+max_behot_time+');} result();'
56 | js_total = js_1+js_2+js_3+js_4
57 | signature = js2py.eval_js(js_total)
58 | return signature
59 |
--------------------------------------------------------------------------------
/crawler_sys/utils/output_log.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Wed Dec 19 12:54:20 2018
4 |
5 | @author: fangyucheng
6 | """
7 |
8 | import logging
9 | formatter = logging.Formatter('%(asctime)s %(name)s %(filename)s '
10 | '%(funcName)s %(levelname)s %(message)s')
11 |
12 | def init_logger(name, log_file, level=logging.INFO):
13 | """initialize logger"""
14 | #output log to file
15 | handler = logging.FileHandler(log_file)
16 | handler.setFormatter(formatter)
17 | #output log to screen
18 | console = logging.StreamHandler()
19 | console.setFormatter(formatter)
20 | #initialize logger
21 | logger = logging.getLogger(name)
22 | logger.setLevel(level)
23 | #add handler and console to logger
24 | logger.addHandler(handler)
25 | logger.addHandler(console)
26 | return logger
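
A minimal usage sketch of init_logger above (logger name and file path are hypothetical); messages go both to the file and to the console, since two handlers are attached:

logger = init_logger(name='toutiao_crawler', log_file='toutiao_crawler.log')
logger.info('crawler task starts')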
--------------------------------------------------------------------------------
/crawler_sys/utils/parse_bool_for_args.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Fri Jun 29 12:46:19 2018
4 |
5 | @author: hanye
6 | """
7 |
8 | def parse_bool_for_args(arg_str):
9 | if arg_str.lower() in ('true', 'yes', 'y', '1'):
10 | return True
11 | elif arg_str.lower() in ('false', 'no', 'n', '0'):
12 | return False
13 | else:
14 | print('Illegal input! Bool like string values are needed.')
15 | return None
16 |
--------------------------------------------------------------------------------
/crawler_sys/utils/releaser_url_check.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Mon Aug 6 09:42:41 2018
4 |
5 | @author: fangyucheng
6 | """
7 |
8 |
9 | from crawler_sys.framework import platform_crawler_register
10 |
11 |
12 | def test_releaserUrl(test_lst):
13 | for line in test_lst:
14 | try:
15 | platform = line['platform']
16 | platform_crawler = platform_crawler_register.get_crawler(platform)
17 | releaserUrl = line['releaserUrl']
18 | try:
19 | platform_crawler().releaser_page(releaserUrl=releaserUrl,
20 | releaser_page_num_max=1)
21 | line['True_or_False'] = 1
22 | line['add_mess'] = 'correct'
23 | print('get releaser page')
24 | print(line)
25 | yield line
26 | except:
27 | line['True_or_False'] = 0
28 | line['add_mess'] = 'wrong_url'
29 | print('%s can not get valid info' % releaserUrl)
30 | except:
31 | pass
32 |
--------------------------------------------------------------------------------
/crawler_sys/utils/remove_unrelated_parameter_tudou.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Mon Nov 26 13:14:03 2018
4 |
5 | @author: fangyucheng
6 | """
7 |
8 | import time
9 | import urllib
10 | import requests
11 |
12 | domain = 'http://www.tudou.com/api/getfeeds?'
13 | #domain_simple = 'http://r.inews.qq.com/searchMore'
14 | #headers = {"Host": "r.inews.qq.com",
15 | # "Accept-Encoding": "gzip,deflate",
16 | # "Referer": "http://inews.qq.com/inews/android/",
17 | # "User-Agent": "%E8%85%BE%E8%AE%AF%E6%96%B0%E9%97%BB5410(android)",
18 | # "Cookie": "lskey=;luin=;skey=;uin=; logintype=0; main_login=qq;",
19 | # "Connection": "Keep-Alive"}
20 |
21 | url_dic = {'apptype': '27',
22 | 'pg': '8',
23 | 'module': '10',
24 | 'feedtype': '2',
25 | 'guid': '15010560241384qU',
26 | 'uid': '-1',
27 | 'secCateId': '10016',
28 | 'max_offset': '-1',
29 | 'min_offset': '-1',
30 | 'utdid': 'T8v9EQPOimUCAXL/Az0YrDOB',
31 | 'page_size': '24'}
32 |
33 | #sig = 'c022edce8ae72f053304412f13a9bb88'
34 |
35 | url = 'http://www.tudou.com/api/getfeeds?%s' % urllib.parse.urlencode(url_dic)
36 |
37 | """
38 | raw-url
39 | 'http://r.inews.qq.com/searchMore?isoem=0&mid=74b9305504a047ab0a1901e2dfbf71f87f799819&dpi=270&devid=008796749793280&is_chinamobile_oem=0&mac=mac%2520unknown&real_device_width=5.06&store=17&screen_height=1440&real_device_height=9.0&apptype=android&origin_imei=008796749793280&orig_store=17&hw=etease_MuMu&appver=23_android_5.4.10&uid=54767d8bf41ac9a4&screen_width=810&sceneid=&omgid=818b2ebf4abcec4bc1c8bf737a1c131dede60010213210&timeline=1540802766&query=espn&activefrom=icon&qqnetwork=wifi&rom_type=&secId=2&Cookie=lskey%3D%3Bluin%3D%3Bskey%3D%3Buin%3D%3B+logintype%3D0%3B+main_login%3Dqq%3B&network_type=wifi&id=20181029A18Y3H00&global_info=1%7C0%7C0%7C0%7C1%7C1%7C1%7C1%7C0%7C6%7C1%7C1%7C1%7C1%7C0%7CJ060P000000000%3AB054P000011803%7C1402%7C0%7C0%7C-1%7C-1%7C0%7C0%7C0%7C%7C-1%7C-1%7C0%7C0%7C1%7C1%7C0%7C0%7C-1%7C0%7C2%7C0%7C2%7C0%7C0%7C0%7C0%7C0%7C0%7C0%7C0%7C2%7C0%7C0%7C0%7C0&imsi_history=0%2C460013199570862&omgbizid=a520b26ce7880445ab488481e3dd4949c74f0050213210&qn-rid=40ff49fc-e6ee-4384-8a7c-9ee507d57e47&qn-sig=c022edce8ae72f053304412f13a9bb88&page=2&type=0&imsi=460013199570862'
40 | """
41 |
42 | get_page = requests.get(url)
43 | page = get_page.text
44 |
45 | key_lst = []
46 | para_lst = []
47 |
48 | for key, value in url_dic.items():
49 | key_lst.append(key)
50 |
51 | for key in key_lst:
52 | value = url_dic[key]
53 | url_dic.pop(key)
54 | url = 'http://www.tudou.com/api/getfeeds?%s' % urllib.parse.urlencode(url_dic)
55 | get_page = requests.get(url)
56 | page = get_page.text
57 | time.sleep(5)
58 | if len(page) > 15000:
59 | print("%s can be removed from url, length of page is %s" % (key, len(page)))
60 | continue
61 | else:
62 | url_dic[key] = value
63 | print("key %s, value %s can't be removed from url" % (key, value))
64 |
65 | # this is the result
66 | final_url_dic = {'devid': '008796749793280',
67 | 'appver': '23_android_5.4.10',
68 | 'query': 'espn',
69 | 'qn-rid': '40ff49fc-e6ee-4384-8a7c-9ee507d57e47',
70 | 'qn-sig': 'c022edce8ae72f053304412f13a9bb88'}
71 | final_url = 'http://r.inews.qq.com/searchMore?%s' % urllib.parse.urlencode(final_url_dic)
72 |
73 | """
74 | final_url
75 | 'http://r.inews.qq.com/searchMore?devid=008796749793280&appver=23_android_5.4.10&query=espn&qn-rid=40ff49fc-e6ee-4384-8a7c-9ee507d57e47&qn-sig=c022edce8ae72f053304412f13a9bb88'
76 | """
77 |
78 |
--------------------------------------------------------------------------------
/crawler_sys/utils/trans_duration_str_to_second.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Mon Sep 10 15:04:04 2018
4 |
5 | @author: fangyucheng
6 | """
7 |
8 | def trans_duration(duration_str):
9 | """suitable for 20:20, 20:20:10"""
10 | if type(duration_str) == int:
11 | return duration_str
12 | duration_lst = duration_str.split(':')
13 | if len(duration_lst) == 3:
14 | duration = int(int(duration_lst[0]) * 3600 + int(duration_lst[1]) * 60 + int(duration_lst[2]))
15 | return duration
16 | elif len(duration_lst) == 2:
17 | duration = int(int(duration_lst[0]) * 60 + int(duration_lst[1]))
18 | return duration
19 | else:
20 | return duration_lst[0]
21 |
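
Quick examples of trans_duration above:

print(trans_duration('03:25'))     # -> 205
print(trans_duration('01:02:03'))  # -> 3723
print(trans_duration(90))          # ints are returned unchanged -> 90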
--------------------------------------------------------------------------------
/crawler_sys/utils/trans_format.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Mon May 14 13:35:26 2018
4 |
5 | @author: fangyucheng
6 | """
7 |
8 |
9 | import json
10 | import pandas as pd
11 |
12 |
13 | def dic_file_to_lst(filename):
14 | openfile = open(filename)
15 | task = []
16 | for line in openfile:
17 | line_dic = json.loads(line)
18 | task.append(line_dic)
19 | return task
20 |
21 |
22 | def str_file_to_lst(filename):
23 | openfile = open(filename, 'r', encoding='utf-8')
24 | task = []
25 | for line in openfile:
26 | line_str = line.replace('\n','')
27 | task.append(line_str)
28 | return task
29 |
30 |
31 | def csv_to_lst(csvname):
32 | openfile = open(csvname, 'r')
33 | task = []
34 | for line in openfile:
35 | task.append(line)
36 | task_lst = []
37 | for line in task:
38 | line_lst = line.split(',')
39 | task_lst.append(line_lst)
40 | return task_lst
41 |
42 |
43 | def csv_to_lst_with_headline(csvname):
44 | result_lst = []
45 | openfile = open(csvname, 'r', encoding='gb18030')
46 | head = openfile.readline()
47 | head = head.replace('\n', '')
48 | head_lst = head.strip().split(',')
49 | for line in openfile:
50 | line = line.replace('\n', '')
51 | line_lst = line.strip().split(',')
52 | test_dict = dict(zip(head_lst,line_lst))
53 | result_lst.append(test_dict)
54 | return result_lst
55 |
56 |
57 | def dic_lst_to_file(listname, filename):
58 | file = open(filename, 'a')
59 | for line in listname:
60 | json_line = json.dumps(line)
61 | file.write(json_line)
62 | file.write('\n')
63 | file.flush()
64 | file.close()
65 |
66 |
67 | def str_lst_to_file(listname, filename):
68 | file = open(filename, 'a', encoding='gb18030')
69 | for line in listname:
70 | file.write(line)
71 | file.write('\n')
72 | file.flush()
73 | file.close()
74 |
75 |
76 | def lst_to_csv_practical(lst_name, csv_name):
77 | dataframe=pd.DataFrame(lst_name)
78 | if len(str(lst_name[0]['release_time'])) == 10:
79 | dataframe['midstep'] = dataframe['release_time']+8*3600
80 | dataframe['realtime'] = pd.to_datetime(dataframe['midstep'],unit='s')
81 | else:
82 | dataframe['midstep'] = dataframe['release_time']+8*3600*1e3
83 | dataframe['realtime'] = pd.to_datetime(dataframe['midstep'],unit='ms')
84 | del dataframe['midstep']
85 | del dataframe['release_time']
86 | del dataframe['fetch_time']
87 | dataframe.to_csv(csv_name, encoding='gb18030', index=False)
88 |
89 |
90 | def lst_to_csv(listname, csvname):
91 | dataframe = pd.DataFrame(listname)
92 | dataframe.to_csv(csvname, encoding='gb18030', index=False)
93 |
94 |
95 | def csv_to_file(filename, csvname):
96 | """read a file of json lines and write them out as a csv"""
97 | file = open(filename)
98 | task = []
99 | for line in file:
100 | task.append(json.loads(line))
101 | dataframe = pd.DataFrame(task)
102 | dataframe.to_csv(csvname, encoding='gb18030', index=False)
103 | file.close()
106 |
--------------------------------------------------------------------------------
/crawler_sys/utils/trans_str_play_count_to_int.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Mon Sep 10 14:41:54 2018
4 |
5 | @author: fangyucheng
6 | """
7 |
8 | def trans_play_count(play_count_str):
9 | """suitable for the format 22万, 22万次播放, 22.2万, 2,222万, 2,222.2万, 2,222, 222"""
10 | if isinstance(play_count_str,int):
11 | return play_count_str
12 |
13 | play_count_str = play_count_str.replace('次播放', '')
14 | play_count_str = play_count_str.replace('播放', '')
15 | try:
16 | if '万' in play_count_str:
17 | play_count_str = play_count_str.split('万')[0]
18 | if ',' in play_count_str:
19 | play_count_str = play_count_str.replace(',', '')
20 | play_count = int(float(play_count_str) * 1e4)
21 | return play_count
22 | elif "w" in play_count_str:
23 | play_count_str = play_count_str.split('w')[0]
24 | if ',' in play_count_str:
25 | play_count_str = play_count_str.replace(',', '')
26 | play_count = int(float(play_count_str) * 1e4)
27 | return play_count
28 | else:
29 | try:
30 | play_count = int(play_count_str)
31 | except:
32 | play_count = int(play_count_str.replace(',', ''))
33 | return play_count
34 | except:
35 | return None
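
Quick examples of trans_play_count above ('万' and 'w' both mean ten-thousands):

print(trans_play_count('22万'))       # -> 220000
print(trans_play_count('2,222.2万'))  # -> 22222000
print(trans_play_count('3.5w'))       # -> 35000
print(trans_play_count('2,222'))      # -> 2222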
--------------------------------------------------------------------------------
/crawler_sys/utils/util_logging.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on Wed Mar 20 10:07:00 2019
5 |
6 | @author: hanye
7 | """
8 | import datetime
9 | import logging
10 | import logging.handlers
11 | from functools import wraps
12 |
13 |
14 | def logged(func):
15 | """
16 | Decorator to log crawler task.
17 | """
18 | @wraps(func)
19 | def with_logging(*args, **kwargs):
20 | today_str = datetime.datetime.now().isoformat()[:10]
21 | log_folder_name = 'crawler_log'
22 | crawler_pth = '/home/hanye/crawlersNew/crawler'
23 | LOG_FN = ('crawler_task_%s_%s_log' % (func.__name__, today_str))
24 | log_fn_abs_path = '/'.join([crawler_pth, log_folder_name, LOG_FN])
25 | FORMAT = '[%(asctime)s][runningFunction:%(name)s][logModule:%(module)s][pid:%(process)d] %(message)s'
26 | hy_logger = logging.getLogger(func.__name__)
27 | hy_logger.setLevel(logging.INFO)
28 | formatter = logging.Formatter(fmt=FORMAT)
29 | file_handler = logging.FileHandler(filename=log_fn_abs_path)
30 | file_handler.setFormatter(formatter)
31 | file_handler.setLevel(logging.INFO)
32 |         if not hy_logger.handlers:  # avoid stacking duplicate handlers on repeated calls
33 |             hy_logger.addHandler(file_handler)
34 | hy_logger.info('task starts')
35 | argstr = ''
36 | if args:
37 | argstr += args.__str__()
38 | if kwargs:
39 | argstr += kwargs.__str__()
40 | if argstr:
41 | hy_logger.info('args:%s' % argstr)
42 |
43 | return func(*args, **kwargs)
44 | return with_logging
45 |
46 |
47 |
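A hedged usage sketch of the decorator; the decorated function and its arguments are made up for illustration, and the hard-coded log directory /home/hanye/crawlersNew/crawler/crawler_log must already exist, otherwise logging.FileHandler will raise when the log file is created:

@logged
def crawl_releaser(releaser_url, releaser_page_num_max=30):
    # ... crawling work would go here ...
    return releaser_url

# each call logs 'task starts' plus the call arguments to
# crawler_task_crawl_releaser_<YYYY-MM-DD>_log under crawler_log/
crawl_releaser('https://haokan.baidu.com/haokan/wiseauthor?app_id=123', releaser_page_num_max=5)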
--------------------------------------------------------------------------------
/crawler_sys/utils/write_into_database.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Sun Sep 30 10:45:11 2018
4 |
5 | @author: fangyucheng
6 | """
7 |
8 |
9 |
10 | import pymysql
11 | from crawler_sys.proxy_pool import make_up_sql
12 |
13 |
14 | def write_dic_into_database(data_dic,
15 | host='192.168.18.11',
16 | user='root',
17 | passwd='csm@1234',
18 | database_name='proxy_pool',
19 | table_name='proxy_pool'):
20 | """
21 | write dict into mysql database
22 | """
23 |
24 | connection = pymysql.connect(host=host, user=user, passwd=passwd,
25 | db=database_name, port=3306,
26 | cursorclass=pymysql.cursors.DictCursor)
27 | cursor = connection.cursor()
28 |
29 | write_into_sql = make_up_sql.make_up_replace_sql(table_name=table_name, input_dic=data_dic)
30 | cursor.execute(write_into_sql)
31 | connection.commit()
32 | print('write %s into database' % data_dic['ip_address'])
33 |
34 |
35 | def write_lst_into_database(data_lst,
36 | host='192.168.18.11',
37 | user='root',
38 | passwd='csm@1234',
39 | database_name='proxy_pool',
40 | table_name='proxy_pool'):
41 | """
42 | write list into mysql database
43 | """
44 | connection = pymysql.connect(host=host, user=user, passwd=passwd,
45 | db=database_name, port=3306,
46 | cursorclass=pymysql.cursors.DictCursor)
47 | cursor = connection.cursor()
48 | length = len(data_lst)
49 |
50 | for line in data_lst:
51 | write_into_sql = make_up_sql.make_up_replace_sql(table_name=table_name, input_dic=line)
52 | cursor.execute(write_into_sql)
53 |
54 | connection.commit()
55 | print('write %s pieces of data into database' % length)
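A hedged usage sketch; the proxy fields below are hypothetical apart from ip_address, which write_dic_into_database prints after the insert, and make_up_sql.make_up_replace_sql is assumed to build the corresponding REPLACE/INSERT statement for the proxy_pool table:

proxy = {"ip_address": "1.2.3.4:8080",          # hypothetical proxy record
         "source": "kuaidaili",
         "expire_time": "2018-10-01 12:00:00"}
write_dic_into_database(proxy)                   # single record, one commit
write_lst_into_database([proxy, proxy])          # batch of records, one commit at the end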
--------------------------------------------------------------------------------
/crawler_sys/utils/write_into_file.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Thu Oct 11 09:04:54 2018
4 |
5 | @author: fangyucheng
6 | """
7 |
8 |
9 | import datetime
10 | import json
11 |
12 | current_date = datetime.datetime.now().isoformat()[:10]
13 |
14 | def write_str_into_file(file_path,
15 | file_name,
16 | var):
17 | with open(file_path+file_name+current_date, 'a', encoding='utf-8') as file:
18 | file.write(var)
19 | file.write('\n')
20 |
21 | def write_dic_into_file(file_path,
22 | file_name,
23 | var):
24 | with open(file_path+file_name+current_date, 'a', encoding='utf-8') as file:
25 | var_json = json.dumps(var)
26 | file.write(var_json)
27 | file.write('\n')
28 |
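A short usage sketch (the paths and the dict are illustrative). Note that current_date is evaluated once at import time, so a long-running process keeps appending to the file named after the day the module was first imported:

write_str_into_file('./log/', 'toutiao_', 'fetched list page 3')
write_dic_into_file('./log/', 'toutiao_', {'url': 'https://example.com/v/1', 'play_count': 42})
# both calls append one line to ./log/toutiao_<import-date>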
--------------------------------------------------------------------------------
/crawler_sys/utils/write_lsturl_info_conf.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Wed Oct 10 17:06:49 2018
4 |
5 | @author: fangyucheng
6 | """
7 |
8 | import os
9 | import configparser
10 |
11 | conf_file_path = os.getcwd()
12 | tencent_dic = {"platform": "腾讯视频",
13 | "channel": {"音乐": "http://v.qq.com/x/list/music",
14 | "新闻": "http://v.qq.com/x/list/news",
15 | "军事": "http://v.qq.com/x/list/military",
16 | "娱乐": "http://v.qq.com/x/list/ent",
17 | "体育": "http://v.qq.com/x/list/sports",
18 | "游戏": "http://v.qq.com/x/list/games",
19 | "搞笑": "http://v.qq.com/x/list/fun",
20 | "时尚": "http://v.qq.com/x/list/fashion",
21 | "生活": "http://v.qq.com/x/list/life",
22 | "母婴": "http://v.qq.com/x/list/baby",
23 | "汽车": "http://v.qq.com/x/list/auto",
24 | "科技": "http://v.qq.com/x/list/tech",
25 | "教育": "http://v.qq.com/x/list/education",
26 | "财经": "http://v.qq.com/x/list/finance",
27 | "房产": "http://v.qq.com/x/list/house",
28 | "旅游": "http://v.qq.com/x/list/travel",
29 | "王者荣耀": "http://v.qq.com/x/list/kings"}}
30 |
31 | target_lst_page_lst = [tencent_dic,]
32 |
33 | #initialize conf file
34 | config = configparser.ConfigParser()
35 | for platform_dic in target_lst_page_lst:
36 | config[platform_dic['platform']] = platform_dic['channel']
37 | with open(conf_file_path + '/lst_page_conf.ini',
38 | 'w', encoding='utf-8') as configfile:
39 | config.write(configfile)
40 |
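A minimal sketch of reading the generated file back with configparser (assumes the script above has already been run in the current working directory):

import configparser

config = configparser.ConfigParser()
config.read('lst_page_conf.ini', encoding='utf-8')
for channel, url in config['腾讯视频'].items():
    print(channel, url)   # e.g. 音乐 http://v.qq.com/x/list/music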
--------------------------------------------------------------------------------
/crawler_sys/utils/write_releasers_into_ini_from_csv.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Tue Oct 23 10:03:53 2018
4 |
5 | @author: fangyucheng
6 | """
7 |
8 | import configparser
9 | from crawler.crawler_sys.utils.trans_format import csv_to_lst_with_headline
10 |
11 | task_list = csv_to_lst_with_headline('F:/add_target_releaser/last_month/zhangminghui2.csv')
12 |
13 | releaser_dic = {}
14 | for line in task_list:
15 | releaser_dic[line['releaser']] = line['releaserUrl']
16 |
17 |
18 | config = configparser.ConfigParser()
19 | config['haokan'] = releaser_dic
20 |
21 | with open ('key_customer.ini', 'w', encoding='utf-8') as ini:
22 | config.write(ini)
23 |
24 |
25 |
26 | #special task
27 | #for line in source_lst:
28 | # detail_lst = line['detail']
29 | # csm_mdu = detail_lst[0]['csm_mdu']
30 | # for detail_dic in detail_lst:
31 | # detail_dic.pop('csm_mdu')
32 | # line['csm_mdu'] = csm_mdu
--------------------------------------------------------------------------------
/crawler_sys/utils/write_search_keyword_info_conf.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Wed Oct 10 17:06:49 2018
4 |
5 | @author: fangyucheng
6 | """
7 |
8 | import os
9 | import configparser
10 |
11 | conf_file_path = os.getcwd()
12 | #initialize conf file
13 | config = configparser.ConfigParser()
14 | config['腾讯新闻'] = {'keyword': '看看新闻,看看新闻Knews,Knews'}
15 | with open('D:/python_code/crawler/crawler_sys/framework/config/search_keywords.ini',
16 | 'w', encoding='utf-8') as configfile:
17 | config.write(configfile)
18 | print(os.getcwd())
19 |
20 | config = configparser.ConfigParser()
21 | config.read('D:/python_code/crawler/crawler_sys/framework/config/search_keywords.ini')
--------------------------------------------------------------------------------
/dev/CCTV_live/bilibili.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # @Time : 2020/5/28 16:18
3 | # @Author : litao
4 | # -*- coding:utf-8 -*-
5 | # @Time : 2020/5/28 15:19
6 | # @Author : litao
7 | from selenium import webdriver
8 | import redis, datetime,time,json
9 | from crawler.crawler_sys.utils.trans_str_play_count_to_int import trans_play_count
10 | rds = redis.StrictRedis(host='192.168.17.60', port=6379, db=15, decode_responses=True)
11 |
12 | class CrawlerMain(object):
13 | def __init__(self):
14 | self.chrome_options = webdriver.ChromeOptions()
15 | # self.chrome_options.add_argument('--disable-gpu')
16 | self.chrome_options.add_argument("--start-maximized")
17 | self.chrome_options.add_argument("--no-sandbox")
18 | self.chrome_options.add_argument('disable-infobars')
19 |         # Download the chromedriver that matches your Chrome version: http://npm.taobao.org/mirrors/chromedriver/
20 | self.driver = webdriver.Chrome(options=self.chrome_options)
21 |
22 | def login(self):
23 | self.driver.get("https://live.bilibili.com/21686237?from=search&seid=1739181021049557638")
24 | while True:
25 | now = datetime.datetime.now()
26 | res = self.driver.find_elements_by_xpath("//span[@class='action-text v-middle live-skin-normal-text dp-i-block']")
27 | if res:
28 | play_count = trans_play_count(res[0].text)
29 | print(play_count)
30 | dic = {
31 | "menber":play_count,
32 | "fetch_time":int(now.timestamp()*1e3)
33 | }
34 | rds.rpush("bilibili", json.dumps(dic))
35 | time.sleep(200)
36 |
37 | def __exit__(self):
38 | self.driver.close()
39 |
40 | if __name__ == "__main__":
41 | test = CrawlerMain()
42 | test.login()
43 |
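A hedged sketch of a consumer that reads the collected samples back out of redis; the host, db and key follow the producer above, while the consumer script itself is hypothetical:

import json
import redis

rds = redis.StrictRedis(host='192.168.17.60', port=6379, db=15, decode_responses=True)
for raw in rds.lrange("bilibili", 0, -1):          # every sample pushed by CrawlerMain
    sample = json.loads(raw)
    print(sample["fetch_time"], sample["menber"])  # the producer stores the count under the key "menber"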
--------------------------------------------------------------------------------
/dev/CCTV_live/toutiao.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # @Time : 2020/5/28 15:19
3 | # @Author : litao
4 | from selenium import webdriver
5 | import redis, datetime,time,json
6 | from crawler.crawler_sys.utils.trans_str_play_count_to_int import trans_play_count
7 | rds = redis.StrictRedis(host='192.168.17.60', port=6379, db=15, decode_responses=True)
8 |
9 | class CrawlerMain(object):
10 | def __init__(self):
11 | self.chrome_options = webdriver.ChromeOptions()
12 | # self.chrome_options.add_argument('--disable-gpu')
13 | self.chrome_options.add_argument("--start-maximized")
14 | self.chrome_options.add_argument("--no-sandbox")
15 | self.chrome_options.add_argument('disable-infobars')
16 |         # Download the chromedriver that matches your Chrome version: http://npm.taobao.org/mirrors/chromedriver/
17 | self.driver = webdriver.Chrome(options=self.chrome_options)
18 |
19 | def login(self):
20 | self.driver.get("https://live.ixigua.com/room/6831736034540456716/")
21 | while True:
22 | now = datetime.datetime.now()
23 | res = self.driver.find_elements_by_xpath("//span[@class='action-text v-middle live-skin-normal-text dp-i-block']")
24 | if res:
25 | play_count = trans_play_count(res[0].text)
26 | print(play_count)
27 | dic = {
28 | "menber":play_count,
29 | "fetch_time":int(now.timestamp()*1e3)
30 | }
31 | rds.rpush("toutiao", json.dumps(dic))
32 | time.sleep(200)
33 |
34 | def __exit__(self):
35 | self.driver.close()
36 |
37 | if __name__ == "__main__":
38 | test = CrawlerMain()
39 | test.login()
40 |
--------------------------------------------------------------------------------
/dev/CCTV_live/yangshipin.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # @Time : 2020/5/28 16:21
3 | # @Author : litao
4 | # -*- coding:utf-8 -*-
5 | # @Time : 2020/5/28 15:19
6 | # @Author : litao
7 | from selenium import webdriver
8 | import redis, datetime,time,json
9 | from crawler.crawler_sys.utils.trans_str_play_count_to_int import trans_play_count
10 | rds = redis.StrictRedis(host='192.168.17.60', port=6379, db=15, decode_responses=True)
11 |
12 | class CrawlerMain(object):
13 | def __init__(self):
14 | self.chrome_options = webdriver.ChromeOptions()
15 | # self.chrome_options.add_argument('--disable-gpu')
16 | self.chrome_options.add_argument("--start-maximized")
17 | self.chrome_options.add_argument("--no-sandbox")
18 | self.chrome_options.add_argument('disable-infobars')
19 |         # Download the chromedriver that matches your Chrome version: http://npm.taobao.org/mirrors/chromedriver/
20 | self.driver = webdriver.Chrome(options=self.chrome_options)
21 |
22 | def login(self):
23 | self.driver.get("https://m.yangshipin.cn/video?type=2&vid=2004011401&pid=600036243&ptag=4_1.4.2.20898_wxf")
24 | while True:
25 | now = datetime.datetime.now()
26 | res = self.driver.find_elements_by_xpath("//span[@class='p-video-intro-person']")
27 | if res:
28 | play_count = trans_play_count(res[0].text)
29 | print(play_count)
30 | dic = {
31 | "menber":play_count,
32 | "fetch_time":int(now.timestamp()*1e3)
33 | }
34 | rds.rpush("toutiao", json.dumps(dic))
35 | time.sleep(200)
36 |
37 | def __exit__(self):
38 | self.driver.close()
39 |
40 | if __name__ == "__main__":
41 | test = CrawlerMain()
42 | test.login()
43 |
--------------------------------------------------------------------------------
/dev/iqiyi_debug/test_aiohttp.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Fri Dec 7 11:02:17 2018
4 |
5 | @author: fangyucheng
6 | """
7 |
8 | import aiohttp
9 | import asyncio
10 |
11 | task_list = ['http://list.iqiyi.com/www/10/1007-------------4-8-2--1-.html',
12 | 'http://list.iqiyi.com/www/24/-------------4-27-2-iqiyi--.html',
13 | 'http://list.iqiyi.com/www/28/-------------4-12-2-iqiyi-1-.html',
14 | 'http://list.iqiyi.com/www/17/-------------4-11-2-iqiyi--.html',]
15 |
16 |
17 |
18 |
19 | async def download_page(session, url):
20 | get_page = await session.get(url)
21 | page = await get_page.text("utf-8", errors="ignore")
22 | return page
23 |
24 | async def get_list_page(loop):
25 | async with aiohttp.ClientSession() as list_page_sess:
26 | task = [loop.create_task(download_page(list_page_sess, url)) for url in task_list]
27 | done, pending = await asyncio.wait(task)
28 | result_lst = [d.result() for d in done]
29 | print(result_lst)
30 |
31 | loop = asyncio.get_event_loop()
32 | loop.run_until_complete(get_list_page(loop))
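An alternative sketch using asyncio.gather, which keeps the results in the same order as task_list (asyncio.wait returns completed tasks in arbitrary order); it reuses download_page and task_list from above:

async def get_list_page_ordered():
    async with aiohttp.ClientSession() as session:
        pages = await asyncio.gather(*(download_page(session, url) for url in task_list))
        return pages   # pages[i] corresponds to task_list[i]

pages = asyncio.get_event_loop().run_until_complete(get_list_page_ordered())
print(len(pages))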
--------------------------------------------------------------------------------
/dev/iqiyi_debug/test_iqiyi_list_url.py:
--------------------------------------------------------------------------------
1 | task_list = ['http://list.iqiyi.com/www/10/1007-------------4-8-2--1-.html',
2 | 'http://list.iqiyi.com/www/24/-------------4-27-2-iqiyi--.html',
3 | 'http://list.iqiyi.com/www/28/-------------4-12-2-iqiyi-1-.html',
4 | 'http://list.iqiyi.com/www/17/-------------4-11-2-iqiyi--.html',
5 | 'http://list.iqiyi.com/www/10/1007-------------4-1-2--1-.html',
6 | 'http://list.iqiyi.com/www/28/-------------4-13-2-iqiyi-1-.html',
7 | 'http://list.iqiyi.com/www/24/-------------4-23-2-iqiyi--.html',
8 | 'http://list.iqiyi.com/www/10/1006-------------4-27-2--1-.html',
9 | 'http://list.iqiyi.com/www/10/1007-------------4-6-2--1-.html',
10 | 'http://list.iqiyi.com/www/10/1006-------------4-4-2--1-.html',
11 | 'http://list.iqiyi.com/www/28/-------------4-14-2-iqiyi-1-.html',
12 | 'http://list.iqiyi.com/www/24/-------------4-5-2-iqiyi--.html',
13 | 'http://list.iqiyi.com/www/8/-------------4-13-2-iqiyi--.html',
14 | 'http://list.iqiyi.com/www/10/1007-------------4-30-2--1-.html',
15 | 'http://list.iqiyi.com/www/8/-------------4-8-2-iqiyi--.html',
16 | 'http://list.iqiyi.com/www/8/-------------4-16-2-iqiyi--.html',
17 | 'http://list.iqiyi.com/www/28/-------------4-9-2-iqiyi-1-.html',
18 | 'http://list.iqiyi.com/www/7/-------------4-23-2-iqiyi-1-.html',
19 | 'http://list.iqiyi.com/www/5/-------------4-10-2-iqiyi--.html',
20 | 'http://list.iqiyi.com/www/17/-------------4-4-2-iqiyi--.html',
21 | 'http://list.iqiyi.com/www/8/-------------4-4-2-iqiyi--.html',
22 | 'http://list.iqiyi.com/www/25/-------------4-3-2-iqiyi-1-.html',
23 | 'http://list.iqiyi.com/www/5/-------------4-18-2-iqiyi--.html',
24 | 'http://list.iqiyi.com/www/3/-------------4-24-2-iqiyi--.html',
25 | 'http://list.iqiyi.com/www/3/-------------4-19-2-iqiyi--.html',
26 | 'http://list.iqiyi.com/www/8/-------------4-21-2-iqiyi--.html',
27 | 'http://list.iqiyi.com/www/5/-------------4-9-2-iqiyi--.html',
28 | 'http://list.iqiyi.com/www/10/1006-------------4-28-2--1-.html',
29 | 'http://list.iqiyi.com/www/5/-------------4-27-2-iqiyi--.html',
30 | 'http://list.iqiyi.com/www/7/-------------4-26-2-iqiyi-1-.html']
31 |
32 |
--------------------------------------------------------------------------------
/dev/post_dict.txt:
--------------------------------------------------------------------------------
1 | {"param":"
2 | vPCnptKLNQAoHKbY9AUySUt3US0zNbhxzqZyO4IlZhCUZu0jMdEoQOSfkaDq
3 | bHw6e3k2kHR1VZB/FL+mJqHkZKYM812xpBR5ZUErEl0if5UxXIj51Ena6OAC
4 | LYnl+BHzISJEi0Ne7+2xXr/lUHZM4qDuJz8fd3xYXN2Xtwf32yAprr8c24GO
5 | IrLeM4e3fi9phnrfke34CMw8uYUX9wbujIFFG0Ln3pG+V2iPG3iBf654QVeF
6 | 3obzc2sOSvD1uCuP6J7jVQbLDphXxeQaXSKHqL5kdScU3QYDCnbYTMiJXq+A
7 | EMJ8/V4s+eKwRiWHvmtqcHLiTi81gQyJTQWY4D7v9RMe/RcUJDgvoFMNLBHt
8 | 4GQ+ow8QY5t6JLiVH7LsXIjhjF/gYRwn0unLBz0g4J+mGdAdh4+Go8EaJBKV
9 | 7fkFn/QaIGsyhyqfNn/fJc/btvCqgvdDeLphjU2WeFaA74DgD4ElDfNflh0T
10 | 50y0/nF8Vc5PQB1zA6R6ipuM5XkEuPLLZ4362p6T+2pm1p8XUF3GTGWteW+/
11 | gwLoPxCOUQ6rZ4m+WIF8MQXJWht3JJRY+J5sy15017kKC5HQDxSemBbPokBD
12 | vX6UGHfqvVgZSRdC83fVp/WtcDIC36y5YPQc01XZ5dfrytfaOkjd/PWCsUnX
13 | pRkytYh+1OwPnD0j3/JsBlomf3V3STlnjdXY4qC/6isIFXya3D1fn/vrtbVF
14 | kXga30Jw4nPEnHuQCdwtwxm09ZOxy0/8RNTwov2oErg19Jcxhn6OUs0Ccm5z
15 | DxVmZtTJpmHTIeLdaYL5r2ONoYciHikj3z6NQ2DrQ3s4AVQTMWayA8V0ENto
16 | aSI="}
--------------------------------------------------------------------------------
/tasks/check_high_play_count_data_source_v_qq.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Wed Sep 5 17:52:53 2018
4 |
5 | @author: fangyucheng
6 | """
7 |
8 | from crawler_sys.site_crawler.crawler_v_qq import Crawler_v_qq
9 | from crawler_sys.utils.output_results import output_result
10 | from crawler_sys.utils import Metaorphosis as meta
11 | from crawler_sys.utils.output_log import output_log
12 |
13 | logging = output_log(page_category='video_page',
14 | program_info='tencent')
15 |
16 | def tran_input_data_to_lst(file_name, file_category='csv'):
17 | if file_category == 'csv':
18 | video_info_lst = meta.csv_to_lst_whth_headline(file_name)
19 | url_lst = []
20 | for line in video_info_lst:
21 | try:
22 | if line['data_provider'] == 'CCR':
23 | url_lst.append(line['url'])
24 | except:
25 | pass
26 | return url_lst
27 | elif file_category == 'file':
28 | url_lst = meta.str_file_to_lst(file_name)
29 | return url_lst
30 |
31 | url_lst = tran_input_data_to_lst(file_name='R:/CCR/数据需求/短期临时需求/TX', file_category='file')
32 |
33 | crawler = Crawler_v_qq()
34 | get_video_page = crawler.video_page
35 |
36 | def get_data_source(url_lst=url_lst,
37 | output_to_file=False,
38 | filepath=None,
39 | output_to_es_raw=False,
40 | output_to_es_register=False,
41 | push_to_redis=False,
42 | output_es_index=None,
43 | output_doc_type=None):
44 | result_lst = []
45 | for url in url_lst:
46 | video_info = get_video_page(url=url)
47 | result_lst.append(video_info)
48 | logging.info('get_data at page %s' % url)
49 | if len(result_lst) >= 100:
50 | if output_es_index is not None and output_doc_type is not None:
51 | output_result(result_lst,
52 | platform='腾讯视频',
53 | output_to_file=output_to_file,
54 | output_to_es_raw=output_to_es_raw,
55 | output_to_es_register=output_to_es_register,
56 | push_to_redis=push_to_redis,
57 | es_index=output_es_index,
58 | doc_type=output_doc_type)
59 | result_lst.clear()
60 | else:
61 | output_result(result_lst,
62 | platform='腾讯视频',
63 | output_to_file=output_to_file,
64 | output_to_es_raw=output_to_es_raw,
65 | output_to_es_register=output_to_es_register,
66 | push_to_redis=push_to_redis)
67 | result_lst.clear()
68 |     if len(result_lst) > 0:
69 | if output_es_index is not None and output_doc_type is not None:
70 | output_result(result_lst,
71 | platform='腾讯视频',
72 | output_to_file=output_to_file,
73 | output_to_es_raw=output_to_es_raw,
74 | output_to_es_register=output_to_es_register,
75 | push_to_redis=push_to_redis,
76 | es_index=output_es_index,
77 | doc_type=output_doc_type)
78 | result_lst.clear()
79 | else:
80 | output_result(result_lst,
81 | platform='腾讯视频',
82 | output_to_file=output_to_file,
83 | output_to_es_raw=output_to_es_raw,
84 | output_to_es_register=output_to_es_register,
85 | push_to_redis=push_to_redis)
86 | result_lst.clear()
87 |
88 | if __name__ == '__main__':
89 | get_data_source(output_to_es_raw=True,
90 | output_es_index='test2',
91 | output_doc_type='fyc')
--------------------------------------------------------------------------------
/tasks/crontab/192.168.18.11/etc/crontab:
--------------------------------------------------------------------------------
1 | SHELL=/bin/bash
2 | PATH=/sbin:/bin:/usr/sbin:/usr/bin
3 | MAILTO=hanye
4 | HOME=/
5 |
6 | # For details see man 4 crontabs
7 |
8 | # Example of job definition:
9 | # .---------------- minute (0 - 59)
10 | # | .------------- hour (0 - 23)
11 | # | | .---------- day of month (1 - 31)
12 | # | | | .------- month (1 - 12) OR jan,feb,mar,apr ...
13 | # | | | | .---- day of week (0 - 6) (Sunday=0 or 7) OR sun,mon,tue,wed,thu,fri,sat
14 | # | | | | |
15 | # * * * * * user-name command to be executed
16 |
17 | # 1 update video data in target release index daily
18 | 0 0 * * * hanye python3 /home/hanye/crawlers/crawler_sys/framework/update_data_in_target_releasers_multi_process.py -p toutiao -n 10 -s 10
19 | 0 0 * * * hanye python3 /home/hanye/crawlers/crawler_sys/framework/update_data_in_target_releasers_multi_process.py -p 腾讯视频 -n 30 -s 10
20 | 0 0 * * * hanye python3 /home/hanye/crawlers/crawler_sys/framework/update_data_in_target_releasers_multi_process.py -p iqiyi -n 30 -s 10
21 | 0 0 * * * hanye python3 /home/hanye/crawlers/crawler_sys/framework/update_data_in_target_releasers_multi_process.py -p new_tudou -n 10 -s 10
22 |
23 | # 2 write crawled data into short-video-production index daily
24 | 0 05 * * * hanye python3 /home/hanye/crawlers/tasks/update_DU_ATU_from_crawler_raw.py
25 |
26 | # 3 create redis url batch
27 | 0 18 * * * hanye python3 /home/hanye/crawlers/crawler_sys/scheduler/generate_redis_url_batch.py -p iqiyi -p 腾讯视频 -b 02 -d 30
28 | 0 10 * * * hanye python3 /home/hanye/crawlers/crawler_sys/scheduler/generate_redis_url_batch.py -p iqiyi -p 腾讯视频 -b 02 -d 30
29 |
30 | # 4 scrap redis url list
31 | 0 03 * * * hanye python3 /home/hanye/crawlers/crawler_sys/framework/scrap_redis_urls.py -p 腾讯视频 -b 02
32 | 0 03 * * * hanye python3 /home/hanye/crawlers/crawler_sys/framework/scrap_redis_urls.py -p iqiyi -b 02
33 | 0 03 * * * hanye python3 /home/hanye/crawlers/crawler_sys/framework/scrap_list_pages.py -p youku -n 30
34 |
35 | # 5 scrap list pages
36 | 0 10 * * * hanye python3 /home/hanye/crawlers/crawler_sys/framework/scrap_list_pages_multi_process.py -p iqiyi -n 30
37 | 0 10 * * * hanye python3 /home/hanye/crawlers/crawler_sys/framework/scrap_list_pages_multi_process.py -p youku -n 30
38 | 0 10 * * * hanye python3 /home/hanye/crawlers/crawler_sys/framework/scrap_list_pages_multi_process.py -p 腾讯视频 -n 30
39 | 0 7,10,15 * * * hanye python3 /home/hanye/crawlers/crawler_sys/framework/scrap_list_pages_multi_process.py -p toutiao -n 160 -s 10
40 |
41 | # update haokan target releaser
42 | 0 0,8,12,18 * * * hanye python3 /home/hanye/crawlers/crawler_sys/framework/update_data_in_target_releasers_single_thread.py -p haokan -n 20
43 |
44 | # get tencent news search page
45 | 0 0,4,8,10,14,16,20 * * * hanye python3 /home/hanye/crawlersNew/crawler/crawler_sys/framework/search_page_single_process.py -p 腾讯新闻
46 |
47 | # high-frequency releasers, crawler executes every hour; only for haokan currently
48 | 0 0,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22 * * * hanye python3 /home/hanye/crawlersNew/crawler/crawler_sys/framework/high_fre_releasers.py -p haokan
49 |
50 | # high-frequency releasers, execute at midnight, 1 pm and 6 pm
51 | 0 0,13,18 * * * hanye python3 /home/hanye/crawlersNew/crawler/crawler_sys/framework/update_data_in_target_releasers_multi_process.py -p toutiao -fre 3 -n 20 -s 15
52 | 0 0,13,18 * * * hanye python3 /home/hanye/crawlersNew/crawler/crawler_sys/framework/update_data_in_target_releasers_multi_process.py -p 腾讯视频 -fre 3 -n 20 -s 15
53 | 0 0,13,18 * * * hanye python3 /home/hanye/crawlersNew/crawler/crawler_sys/framework/update_data_in_target_releasers_multi_process.py -p new_tudou -fre 3 -n 20 -s 15
54 |
--------------------------------------------------------------------------------
/tasks/download_video_page.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Fri Dec 14 15:03:05 2018
4 |
5 | @author: fangyucheng
6 | """
7 |
8 | import time
9 | from multiprocessing import Process
10 | from crawler.crawler_sys.utils import connect_with_redis
11 | from crawler.crawler_sys.framework.platform_crawler_register import get_crawler
12 |
13 |
14 | """
15 | only for platform v_qq, iqiyi, and new_tudou
16 | """
17 |
18 | step = 'download_video_page'
19 | data_cate = 'video_url'
20 |
21 | def download_video_page(platform):
22 | crawler_initialization = get_crawler(platform)
23 | crawler = crawler_initialization()
24 | if platform == '腾讯视频':
25 | key = 'v_qq_url_dict'
26 | else:
27 | key= "%s_%s" % (platform, data_cate)
28 | while True:
29 | if connect_with_redis.length_of_set(key) > 0:
30 | # pid_num = connect_with_redis.length_of_set(key=key_set)
31 | # if pid_num < 20:
32 | # process_num = int(20-pid_num)
33 | crawler.download_video_page_async_multi_process()
34 | # else:
35 | # print("%s processes is working on %s" % (pid_num, platform))
36 | # time.sleep(20)
37 | else:
38 | print("no %s url [dict] in redis" % platform)
39 | time.sleep(300)
40 |
41 | v_qq = Process(target=download_video_page, args=('腾讯视频',))
42 | iqiyi = Process(target=download_video_page, args=('iqiyi',))
43 | new_tudou = Process(target=download_video_page, args=('new_tudou',))
44 |
45 | v_qq.start()
46 | iqiyi.start()
47 | new_tudou.start()
--------------------------------------------------------------------------------
/tasks/fix_album_play_count_by_getting_releaser_page.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Fri Nov 23 13:48:33 2018
4 |
5 | input video url, output video play_count gotten from releaser page
6 |
7 | @author: fangyucheng
8 | """
9 |
10 | from elasticsearch import Elasticsearch
11 | from crawler.crawler_sys.utils.trans_format import lst_to_csv
12 | from crawler.crawler_sys.utils import trans_format
13 | from crawler.crawler_sys.site_crawler import crawler_v_qq
14 |
15 | absolute_file_path = r"C:\Users\zhouyujiang\安徽第一周数据情况.csv"
16 | task_list = trans_format.str_file_to_lst(absolute_file_path)
17 |
18 | result_lst = []
19 | crawler = crawler_v_qq.Crawler_v_qq()
20 | for url in task_list:
21 | get_data = crawler.video_page(url)
22 | result_lst.append(get_data)
23 | print("get data at %s" % url)
24 |
25 | bug_releaser_list = []
26 | releaserUrl_lst = []
27 | revised_lst = []
28 | for line in result_lst:
29 | try:
30 | if line['releaserUrl'] is not None:
31 | releaserUrl = line['releaserUrl']
32 | if releaserUrl not in releaserUrl_lst:
33 | releaserUrl_lst.append(releaserUrl)
34 | try:
35 | crawler.releaser_page(releaserUrl, output_to_es_raw=True,
36 | es_index='test2', doc_type='12zjbfl',
37 | releaser_page_num_max=1000)
38 | print ("get releaser data at %s" % releaserUrl)
39 | except:
40 | bug_releaser_list.append(releaserUrl)
41 | else:
42 | pass
43 | else:
44 | print("this video %s can't find releaser" % line['url'])
45 | except:
46 | print("can't get releaser at %s" % url)
47 |
48 | hosts = '192.168.17.11'
49 | port = 80
50 | user_id = 'fangyucheng'
51 | password = 'VK0FkWf1fV8f'
52 | http_auth = (user_id, password)
53 | es_connection = Elasticsearch(hosts=hosts, port=port, http_auth=http_auth)
54 |
55 | unsolve_lst = []
56 | result_lst2 =[]
57 |
58 | for line in task_list:
59 | try:
60 | if type(line) == dict:
61 | url = line['url']
62 | elif type(line) == str:
63 | url = line
64 | search_body = {"query": {"bool": {"filter": [{"term": {"url.keyword": url}}]}}}
65 | search = es_connection.search(index="test2", doc_type="12zjbfl", body=search_body)
66 | if search["hits"]["total"] == 0:
67 | unsolve_lst.append(url)
68 | print("can not get video data at %s" % url)
69 | else:
70 | video_data = search["hits"]["hits"][0]["_source"]
71 | result_lst2.append(video_data)
72 | print("get playcount at %s" % url)
73 | except:
74 | pass
75 |
76 | lst_to_csv(listname=result_lst2,
77 | csvname=r"C:\Users\zhouyujiang\12121212121.csv")
--------------------------------------------------------------------------------
/tasks/follower_num.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/tasks/follower_num.csv
--------------------------------------------------------------------------------
/tasks/from_es_find_crawler_task.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # @Time : 2019/6/21 14:33
3 | # @Author : litao
4 |
--------------------------------------------------------------------------------
/tasks/parse_list_page.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Fri Dec 14 10:32:28 2018
4 |
5 | @author: fangyucheng
6 | """
7 |
8 | import time
9 | from multiprocessing import Process
10 | from crawler.crawler_sys.utils import connect_with_redis
11 | from crawler.crawler_sys.framework.platform_crawler_register import get_crawler
12 |
13 |
14 | """
15 | only for platform v_qq, iqiyi and youku
16 | """
17 |
18 | step = 'parse_list_page'
19 | data_cate = 'list_page_html'
20 |
21 | def parse_list_page(platform):
22 | crawler_initialization = get_crawler(platform)
23 | crawler = crawler_initialization()
24 | if platform == '腾讯视频':
25 | platform = 'v_qq'
26 | key_lst = "%s_%s" % (platform, data_cate)
27 | # key_set = "%s_%s" % (platform, step)
28 | while True:
29 | if connect_with_redis.length_of_lst(key=key_lst) > 0:
30 | crawler.parse_list_page_multi_process()
31 | else:
32 | print("no %s list page html in redis" % platform)
33 | time.sleep(300)
34 |
35 | v_qq = Process(target=parse_list_page, args=('腾讯视频',))
36 | iqiyi = Process(target=parse_list_page, args=('iqiyi',))
37 | youku = Process(target=parse_list_page, args=('youku',))
38 |
39 | v_qq.start()
40 | iqiyi.start()
41 | youku.start()
42 |
--------------------------------------------------------------------------------
/tasks/parse_video_page.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Fri Dec 14 15:23:29 2018
4 |
5 | @author: fangyucheng
6 | """
7 |
8 | import time
9 | from multiprocessing import Process
10 | from crawler.crawler_sys.utils import connect_with_redis
11 | from crawler.crawler_sys.framework.platform_crawler_register import get_crawler
12 |
13 | """
14 | for platform v_qq, iqiyi, and new_tudou
15 | """
16 |
17 | kwarg_dict = {'output_to_es_raw': True,
18 | 'es_index': 'crawler-data-raw',
19 | 'doc_type': 'doc',
20 | 'output_to_es_register': True}
21 |
22 | step = 'parse_video_page'
23 | data_cate = 'video_page_html'
24 |
25 | def parse_video_page(platform, para_dic):
26 | crawler_initialization = get_crawler(platform)
27 | crawler = crawler_initialization()
28 | if platform == '腾讯视频':
29 | platform = 'v_qq'
30 | key= "%s_%s" % (platform, data_cate)
31 | while True:
32 | if connect_with_redis.length_of_lst(key) > 0:
33 | crawler.parse_video_page_multi_process(para_dic)
34 | else:
35 | print("no %s video page html in redis" % platform)
36 | time.sleep(300)
37 |
38 | v_qq = Process(target=parse_video_page, args=('腾讯视频', kwarg_dict))
39 | iqiyi = Process(target=parse_video_page, args=('iqiyi', kwarg_dict))
40 | new_tudou = Process(target=parse_video_page, args=('new_tudou', kwarg_dict))
41 |
42 | v_qq.start()
43 | iqiyi.start()
44 | new_tudou.start()
--------------------------------------------------------------------------------
/tasks/produce_releaser_report_from_es.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # @Time : 2019/4/24 17:51
3 | # @Author : litao
4 | # Given a releaser account and a platform, generate a data report
5 | # Data dimensions: publish count, play count, follower count, comment count, like count, video url, duration, release time, releaser account
6 |
7 | # import time
8 | import json
9 | # import argparse
10 | import datetime
11 | from elasticsearch import Elasticsearch
12 | import pandas as pd
13 | from elasticsearch.helpers import scan
14 | from func_find_week_num import find_week_belongs_to
15 | from crawler.crawler_sys.framework.platform_crawler_register import get_crawler
16 | from crawler.crawler_sys.utils import trans_format
17 | from func_cal_doc_id import cal_doc_id
18 |
19 | hosts = '192.168.17.11'
20 | port = 80
21 | user = 'zhouyujiang'
22 | passwd = '8tM9JDN2LVxM'
23 | http_auth = (user, passwd)
24 |
25 | es = Elasticsearch(hosts=hosts, port=port, http_auth=http_auth)
26 |
27 |
28 | def func_search_reUrl_from_target_index(platform, releaser):
29 | search_body = {
30 | "query": {
31 | "bool": {
32 | "filter": [
33 | {"term": {"platform.keyword": platform}},
34 | {"term": {"releaser.keyword": releaser}}
35 | ]
36 | }
37 | }
38 | }
39 | search_re = es.search(index='target_releasers', doc_type='doc', body=search_body)
40 | if search_re['hits']['total'] > 0:
41 | return search_re['hits']['hits'][0]['_source']['releaserUrl']
42 | else:
43 |         print('Cannot find:', platform, releaser)
44 | return None
45 |
46 | data_dic = {}
47 | miaopai_list = []
48 | file = r'D:\work_file\无锡台内容数据需求.csv'
49 | with open(file, 'r')as f:
50 | header_Lst = f.readline().strip().split(',')
51 | for line in f:
52 | line_Lst = line.strip().split(',')
53 | line_dict = dict(zip(header_Lst, line_Lst))
54 | releaser = line_dict['releaser']
55 | platform = line_dict['platform']
56 | # releaserUrl = func_search_reUrl_from_target_index(platform, releaser)
57 | releaserUrl = 1
58 | if releaserUrl != None:
59 | re_list = []
60 | search_body = {
61 | "query": {
62 | "bool": {
63 | "filter": [
64 | {"term": {"platform.keyword": platform}}, {"term": {"releaser.keyword": releaser}},
65 | {"range": {"release_time": {"gte": 1546272000000, "lt": 1554048000000}}},
66 | {"range": {"fetch_time": {"gte": 1556150400000}}}
67 | ]
68 | }
69 | }
70 | }
71 |
72 | scan_re = scan(client=es, index='crawler-data-raw', doc_type='doc',
73 | query=search_body, scroll='3m')
74 | for one_scan in scan_re:
75 | "发布者,平台,标题,url,播放量,点赞量,评论量,时长,发布时间"
76 | data_dic[cal_doc_id(platform, url=one_scan["_source"]["url"], doc_id_type='all-time-url')]=[one_scan["_source"]["releaser"],one_scan["_source"]["platform"],one_scan["_source"]["title"],one_scan["_source"]["url"],one_scan["_source"]["play_count"],one_scan["_source"]["favorite_count"],one_scan["_source"]["comment_count"],one_scan["_source"]["duration"],datetime.datetime.fromtimestamp(one_scan["_source"]["release_time"]/1000).strftime('%Y-%m-%d %H:%M:%S')]
77 | data_lis = []
78 | print(len(data_dic))
79 | for d in data_dic:
80 | data_lis.append(data_dic[d])
81 |
82 | data = pd.DataFrame(data_lis)
83 | data.to_csv('./%s.csv' % "无锡台内容数据需求2", encoding="ansi")
84 |
85 |
86 |
--------------------------------------------------------------------------------
/tasks/sync_mayun_code_to_100.97.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # @Time : 2019/9/29 11:53
3 | # @Author : litao
4 | import subprocess,time
5 |
6 | f = open("/home/hanye/crawlersNew/crawler/tasks/log", "w", encoding="utf-8")
7 | while True:
8 | try:
9 | cmd0 = r'git pull origin master'
10 | cmd1 = r'git push gitLab master'
11 | cmdpath = r'/home/hanye/crawlersNew/v-scope2'
12 | process = subprocess.Popen(cmd0, shell=True, cwd=cmdpath)
13 | process.wait()
14 | result = process.returncode
15 | print(result,file=f)
16 | process = subprocess.Popen(cmd1, shell=True, cwd=cmdpath)
17 | process.wait()
18 | result = process.returncode
19 | print(result,file=f)
20 | #time.sleep(43200)
21 | time.sleep(70)
22 | except Exception as e:
23 | f.write("eroor",e)
24 | f.flush()
25 |
26 | f.close()
27 |
28 |
29 |
30 |
31 |
--------------------------------------------------------------------------------
/tasks/the_key_of_pid_in_redis.txt:
--------------------------------------------------------------------------------
1 | [parse_list_page]
2 | v_qq = v_qq_parse_list_page
3 | youku = youku_parse_list_page
--------------------------------------------------------------------------------
/tasks/yangjingshu.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/tasks/yangjingshu.csv
--------------------------------------------------------------------------------
/tasks/zhangminghui_dec.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/tasks/zhangminghui_dec.csv
--------------------------------------------------------------------------------
/tasks/zhangminghui_dec2.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/tasks/zhangminghui_dec2.csv
--------------------------------------------------------------------------------
/test/qq_news/remove_unrelated_parameter.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Mon Nov 26 13:14:03 2018
4 |
5 | @author: fangyucheng
6 | """
7 |
8 | import time
9 | import urllib
10 | import requests
11 |
12 | domain = 'http://r.inews.qq.com/searchMore?'
13 | #domain_simple = 'http://r.inews.qq.com/searchMore'
14 | headers = {"Host": "r.inews.qq.com",
15 | "Accept-Encoding": "gzip,deflate",
16 | "Referer": "http://inews.qq.com/inews/android/",
17 | "User-Agent": "%E8%85%BE%E8%AE%AF%E6%96%B0%E9%97%BB5410(android)",
18 | "Cookie": "lskey=;luin=;skey=;uin=; logintype=0; main_login=qq;",
19 | "Connection": "Keep-Alive"}
20 |
21 | url_dic = {'isoem':'0',
22 | 'mid': '74b9305504a047ab0a1901e2dfbf71f87f799819',
23 | 'dpi': '270',
24 | 'devid': '008796749793280',
25 | 'is_chinamobile_oem': '0',
26 | 'mac': 'mac%20unknown',
27 | 'real_device_width':'5.06',
28 | 'store': '17',
29 | 'screen_height': '1440',
30 | 'real_device_height': '9.0',
31 | 'apptype': 'android',
32 | 'origin_imei': '008796749793280',
33 | 'orig_store': '17',
34 | 'hw': 'etease_MuMu',
35 | 'appver': '23_android_5.4.10',
36 | 'uid': '54767d8bf41ac9a4',
37 | 'screen_width':'810',
38 | 'sceneid':'',
39 | 'omgid': '818b2ebf4abcec4bc1c8bf737a1c131dede60010213210',
40 | 'timeline':'1540802766',
41 | 'query':'espn',
42 | 'activefrom': 'icon',
43 | 'qqnetwork': 'wifi',
44 | 'rom_type': '',
45 | 'secId': '2',
46 | 'Cookie':'lskey=;luin=;skey=;uin=; logintype=0; main_login=qq;',
47 | 'network_type': 'wifi',
48 | 'id': '20181029A18Y3H00',
49 | 'global_info': '1|0|0|0|1|1|1|1|0|6|1|1|1|1|0|J060P000000000:B054P000011803|1402|0|0|-1|-1|0|0|0||-1|-1|0|0|1|1|0|0|-1|0|2|0|2|0|0|0|0|0|0|0|0|2|0|0|0|0',
50 | 'imsi_history':'0,460013199570862',
51 | 'omgbizid': 'a520b26ce7880445ab488481e3dd4949c74f0050213210',
52 | 'qn-rid': '40ff49fc-e6ee-4384-8a7c-9ee507d57e47',
53 | 'qn-sig': 'c022edce8ae72f053304412f13a9bb88',
54 | 'page': "2",
55 | 'type': "0",
56 | 'imsi': '460013199570862'}
57 |
58 | #sig = 'c022edce8ae72f053304412f13a9bb88'
59 |
60 | url = 'http://r.inews.qq.com/searchMore?%s' % urllib.parse.urlencode(url_dic)
61 |
62 | """
63 | raw-url
64 | 'http://r.inews.qq.com/searchMore?isoem=0&mid=74b9305504a047ab0a1901e2dfbf71f87f799819&dpi=270&devid=008796749793280&is_chinamobile_oem=0&mac=mac%2520unknown&real_device_width=5.06&store=17&screen_height=1440&real_device_height=9.0&apptype=android&origin_imei=008796749793280&orig_store=17&hw=etease_MuMu&appver=23_android_5.4.10&uid=54767d8bf41ac9a4&screen_width=810&sceneid=&omgid=818b2ebf4abcec4bc1c8bf737a1c131dede60010213210&timeline=1540802766&query=espn&activefrom=icon&qqnetwork=wifi&rom_type=&secId=2&Cookie=lskey%3D%3Bluin%3D%3Bskey%3D%3Buin%3D%3B+logintype%3D0%3B+main_login%3Dqq%3B&network_type=wifi&id=20181029A18Y3H00&global_info=1%7C0%7C0%7C0%7C1%7C1%7C1%7C1%7C0%7C6%7C1%7C1%7C1%7C1%7C0%7CJ060P000000000%3AB054P000011803%7C1402%7C0%7C0%7C-1%7C-1%7C0%7C0%7C0%7C%7C-1%7C-1%7C0%7C0%7C1%7C1%7C0%7C0%7C-1%7C0%7C2%7C0%7C2%7C0%7C0%7C0%7C0%7C0%7C0%7C0%7C0%7C2%7C0%7C0%7C0%7C0&imsi_history=0%2C460013199570862&omgbizid=a520b26ce7880445ab488481e3dd4949c74f0050213210&qn-rid=40ff49fc-e6ee-4384-8a7c-9ee507d57e47&qn-sig=c022edce8ae72f053304412f13a9bb88&page=2&type=0&imsi=460013199570862'
65 | """
66 |
67 | get_page = requests.get(url, headers=headers)
68 | page = get_page.text
69 |
70 | key_lst = []
71 | para_lst = []
72 |
73 | for key, value in url_dic.items():
74 | key_lst.append(key)
75 |
76 | for key in key_lst:
77 | value = url_dic[key]
78 | url_dic.pop(key)
79 | url = 'http://r.inews.qq.com/searchMore?%s' % urllib.parse.urlencode(url_dic)
80 | get_page = requests.get(url, headers=headers)
81 | page = get_page.text
82 | time.sleep(10)
83 | if len(page) > 5000:
84 | print("%s can be moved from url, length of page is %s" % (key, len(page)))
85 | continue
86 | else:
87 | url_dic[key] = value
88 | print("key %s, value %s can't be moved from url" % (key, value))
89 |
90 | # this is the result
91 | final_url_dic = {'devid': '008796749793280',
92 | 'appver': '23_android_5.4.10',
93 | 'query': 'espn',
94 | 'qn-rid': '3e3cb605-3a00-412d-8b2e-f81b32f5064c',
95 | 'qn-sig': '589a406e354aa3bfb6ddeaa778278ef1'}
96 | final_url = 'http://r.inews.qq.com/searchMore?%s' % urllib.parse.urlencode(final_url_dic)
97 |
98 | """
99 | final_url
100 | 'http://r.inews.qq.com/searchMore?devid=008796749793280&appver=23_android_5.4.10&query=espn&qn-rid=40ff49fc-e6ee-4384-8a7c-9ee507d57e47&qn-sig=c022edce8ae72f053304412f13a9bb88'
101 | """
102 |
103 | qn_sig = "01552dc75351b12d01b5310441a562e2"
104 | qn_rid = "8f14a809-db96-4762-905a-7c063dc84ac7"
105 |
--------------------------------------------------------------------------------
/test/test_read_config.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Tue Dec 4 16:23:48 2018
4 |
5 | @author: fangyucheng
6 | """
7 |
8 | import configparser
9 |
10 | config = configparser.ConfigParser()
11 | config.read('/home/hanye/crawlerNEW/crawler/crawler_sys/framework/config/search_keywords.ini')
12 |
--------------------------------------------------------------------------------