├── .gitignore ├── .idea ├── crawler.iml ├── misc.xml ├── modules.xml ├── vcs.xml └── workspace.xml ├── README.md ├── crawler_log └── log.txt ├── crawler_sys ├── __init__ .py ├── framework │ ├── __init__.py │ ├── config │ │ ├── high_fre.ini │ │ ├── key_customer.ini │ │ ├── legal_platforms.ini │ │ ├── list_page_urls.ini │ │ ├── redis_key_register.ini │ │ ├── search_keywords - 副本.ini │ │ └── search_keywords.ini │ ├── update_data_in_redis_multi_process_auto_task.py │ ├── update_data_in_target_releasers_single_thread.py │ ├── video_fields_std.py │ ├── write_releaserUrl_to_crawler_redis.py │ └── write_releasers_to_redis.py ├── hot_words_crawler │ ├── __init__.py │ ├── crawler_douyin.py │ ├── crawler_haokan.py │ ├── crawler_kwai.py │ ├── crawler_main_task.py │ ├── crawler_new_tudou.py │ ├── crawler_qq_news.py │ ├── crawler_toutiao.py │ ├── crawler_v_qq.py │ └── crawler_wangyi_news.py ├── proxy_pool │ └── func_get_proxy_form_kuaidaili.py ├── site_crawler │ ├── __init__.py │ ├── chromedriver │ ├── crawler.new_tudouAPP.py │ ├── crawler.pepper.py │ ├── crawler_Mango.py │ ├── crawler_baidu.py │ ├── crawler_baijiahao.py │ ├── crawler_bilibili.py │ ├── crawler_bing.py │ ├── crawler_douyin.py │ ├── crawler_haokan.py │ ├── crawler_iqiyi.py │ ├── crawler_kwai.py │ ├── crawler_mango.py │ ├── crawler_miaopai.py │ ├── crawler_pear.py │ ├── crawler_peoplevideo.py │ ├── crawler_shenma.py │ ├── crawler_sogou.py │ ├── crawler_tencent_news.py │ ├── crawler_toutiao.py │ ├── crawler_tudou.py │ ├── crawler_v_qq.py │ ├── crawler_v_qq_doc.py │ ├── crawler_wangyi_news.py │ ├── crawler_watermelon.py │ ├── crawler_weibo │ │ ├── cookie_pool │ │ ├── crawler_weibo.py │ │ ├── ghostdriver.log │ │ └── login.py │ ├── crawler_ximalaya.py │ ├── crawler_xinhua.py │ ├── crawler_youku.py │ ├── func_get_releaser_id.py │ └── toutiao_get_signature.py ├── site_crawler_by_redis │ ├── __init__.py │ ├── chromedriver │ ├── crawler_baijiahao.py │ ├── crawler_douyin.py │ ├── crawler_haokan.py │ ├── crawler_kwai.py │ ├── crawler_tencent_news.py │ ├── crawler_toutiao.py │ ├── crawler_tudou.py │ ├── crawler_v_qq.py │ ├── crawler_wangyi_news.py │ └── toutiao_article.py ├── site_crawler_test │ ├── __init__.py │ ├── crawler_baijiahao.py │ ├── crawler_douyin.py │ ├── crawler_haokan.py │ ├── crawler_kwai.py │ ├── crawler_tencent_news.py │ ├── crawler_toutiao.py │ ├── crawler_tudou.py │ ├── crawler_v_qq.py │ └── crawler_wangyi_news.py ├── special_propose │ ├── __init__.py │ ├── crawler.v_qq_comment.py │ ├── crawler.v_qq_danmu.py │ ├── eastnews.py │ ├── get_weibo_user_info.py │ ├── tocsv.py │ ├── watcheast.py │ └── whether_user_exists.py ├── tools │ ├── add_releasers_video_data.py │ ├── add_releasers_video_data_multi_process.py │ ├── add_search_keywords_to_es.py │ ├── add_target_releasers_by_file.py │ ├── count_releaser_publish_num_and_refresh_crawler_frequency.py │ ├── craw_data_and_write_into_alltime_index.py │ ├── craw_data_and_write_into_alltime_index_by_date.py │ ├── craw_data_and_write_into_monthly_index.py │ ├── craw_data_and_write_into_monthly_index_remove_duplication.py │ ├── craw_data_and_write_into_weekly_index.py │ ├── craw_data_and_write_into_weekly_index_remove_duplication.py │ ├── crawler_url_video_info.py │ ├── error.log │ ├── find_high_fre_releaser.py │ ├── match_play_count_by_video_title.py │ ├── match_play_count_by_video_url.py │ ├── ocr_by_aliyun.py │ ├── ocr_by_img.py │ ├── tudou_selenium.py │ ├── video_num_count.rar │ ├── video_num_count │ │ ├── .idea │ │ │ ├── .gitignore │ │ │ ├── misc.xml │ │ │ ├── modules.xml │ │ │ ├── vcs.xml 
│ │ │ └── video_num_count.iml │ │ ├── 2019-09-10.csv │ │ ├── count.csv │ │ ├── count_main.py │ │ ├── crawler │ │ │ ├── 2019-09-10.csv │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-37.pyc │ │ │ │ ├── crawler_tencent_news.cpython-37.pyc │ │ │ │ ├── crawler_wangyi_news.cpython-37.pyc │ │ │ │ ├── func_get_releaser_id.cpython-37.pyc │ │ │ │ ├── haokan_count.cpython-37.pyc │ │ │ │ ├── qq_video_count.cpython-37.pyc │ │ │ │ ├── toutiao_count.cpython-37.pyc │ │ │ │ ├── trans_duration_str_to_second.cpython-37.pyc │ │ │ │ ├── trans_strtime_to_timestamp.cpython-37.pyc │ │ │ │ └── tudou_count.cpython-37.pyc │ │ │ ├── crawler_tencent_news.py │ │ │ ├── crawler_wangyi_news.py │ │ │ ├── func_get_releaser_id.py │ │ │ ├── haokan_count.py │ │ │ ├── haokan_青春旅社.csv │ │ │ ├── qq_video_count.py │ │ │ ├── toutiao_count.py │ │ │ ├── trans_duration_str_to_second.py │ │ │ ├── trans_strtime_to_timestamp.py │ │ │ ├── tudou_count.py │ │ │ └── 首次运行.bat │ │ └── haokan_看看新闻Knews.csv │ ├── write_data_to_targert_doc.py │ ├── write_feihua_json_to_es.py │ └── 头条重复数据导出.py └── utils │ ├── __init__ .py │ ├── connect_with_es.py │ ├── connect_with_redis.py │ ├── date_calculator.py │ ├── delete_repeat_data.py │ ├── extract_data_from_es.py │ ├── fakechrome.py │ ├── fix_album_play_count_by_releaser_page.py │ ├── func_verification_code.py │ ├── get_toutiao_as_cp_signature.py │ ├── output_log.py │ ├── output_results.py │ ├── parse_bool_for_args.py │ ├── releaser_url_check.py │ ├── remove_unrelated_parameter_tudou.py │ ├── trans_duration_str_to_second.py │ ├── trans_format.py │ ├── trans_str_play_count_to_int.py │ ├── trans_strtime_to_timestamp.py │ ├── util_logging.py │ ├── write_into_database.py │ ├── write_into_file.py │ ├── write_lsturl_info_conf.py │ ├── write_releasers_into_ini_from_csv.py │ └── write_search_keyword_info_conf.py ├── dev ├── CCTV_live │ ├── bilibili.py │ ├── toutiao.py │ └── yangshipin.py ├── Documentary_project │ ├── bilibili.py │ ├── iqiyi.py │ ├── tengxun.py │ └── youku.py ├── Movies_rank_project │ ├── crawler_douban.py │ └── crawler_maoyan.py ├── iqiyi_debug │ ├── test_aiohttp.py │ └── test_iqiyi_list_url.py └── post_dict.txt ├── tasks ├── check_high_play_count_data_source_v_qq.py ├── crontab │ └── 192.168.18.11 │ │ └── etc │ │ └── crontab ├── download_video_page.py ├── fix_album_play_count_by_getting_releaser_page.py ├── follower_num.csv ├── from_es_find_crawler_task.py ├── parse_list_page.py ├── parse_video_page.py ├── produce_releaser_report_from_es.py ├── sync_mayun_code_to_100.97.py ├── the_key_of_pid_in_redis.txt ├── update_DU_ATU_from_crawler_raw.py ├── write_key_releaser_to_week_doc_weekly.py ├── yangjingshu.csv ├── zhangminghui_dec.csv └── zhangminghui_dec2.csv └── test ├── qq_news └── remove_unrelated_parameter.py └── test_read_config.py /.gitignore: -------------------------------------------------------------------------------- 1 | /crawler_sys/framework/check_wrong_url_daily.py 2 | *.pyc 3 | -------------------------------------------------------------------------------- /.idea/crawler.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 16 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /.idea/modules.xml: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # crawler 2 | 爬虫项目: 3 | 主要爬取抖音,好看,快手,头条,土豆,网易新闻,qq视频等短视频数据 4 | 5 | 爬虫主要逻辑代码在 6 | ~~~ 7 | crawler_sys/site_crawler_by_redis 8 | ~~~ 9 | 10 | 调度工具 11 | ~~~ 12 | crawler_sys/framework 13 | ~~~ 14 | -------------------------------------------------------------------------------- /crawler_log/log.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_log/log.txt -------------------------------------------------------------------------------- /crawler_sys/__init__ .py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue June 6 18:02:43 2018 4 | 5 | @author: hanye 6 | """ 7 | 8 | -------------------------------------------------------------------------------- /crawler_sys/framework/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue May 15 13:59:44 2018 4 | 5 | 6 | @author: hanye 7 | """ 8 | -------------------------------------------------------------------------------- /crawler_sys/framework/config/high_fre.ini: -------------------------------------------------------------------------------- 1 | [haokan] 2 | 看看新闻knews = https://haokan.baidu.com/haokan/wiseauthor?app_id=1565285080839434 3 | 一手video = https://haokan.baidu.com/haokan/wiseauthor?app_id=1546617034936582 4 | 生活欢乐汇 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1567462926329612 5 | 消息直升机 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1589022360094300 6 | 万物杂谈 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1589022359827373 7 | 横漂一姐 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1611821585605765 8 | 吃瓜少女萌 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1611822155649253 9 | 人人视频 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1563996947927117 10 | 追剧小师妹 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1611822302181315 11 | 新娱乐萌主 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1611553722549281 12 | 探剧全能王 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1610928916930156 13 | 青春影剧场 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1611822943891552 14 | 肥仔电影圈 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1601813907799169 15 | 芒果tv = https://haokan.baidu.com/haokan/wiseauthor?app_id=1549963812551792 16 | 科技观察猿 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1567462927568986 17 | 撩剧大师 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1607132398047134 18 | 欧阳华北 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1547961620896856 19 | 澎湃新闻 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1574072008111128 20 | 娱乐不晚点 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1589022346878545 21 | 看剧小资姐 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1594361771699213 22 | 热剧宅急送 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1594353698233619 23 | 毒舌影视街 = 
https://haokan.baidu.com/haokan/wiseauthor?app_id=1611195426989529 24 | 江西网络广播电视台 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1577139229517527 25 | 热剧乐翻天 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1594362729953997 26 | 全球视频大魔王 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1597149386529756 27 | 精彩剧集大放送 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1593533634618523 28 | 影视水煮鱼 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1594257498968349 29 | 财经新鲜事 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1555591483551002 30 | 酷哥撩大剧 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1593525911983865 31 | 咸鱼说片 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1594356024003023 32 | 安徽海豚播报 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1611201539330357 33 | 看剧大球球 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1593542564661281 34 | 长沙政法频道 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1552122607183011 35 | 体坛先锋报 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1589022347820284 36 | 综艺杂货铺 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1611195671591267 37 | 视频展览馆 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1611195775370668 38 | 钱江视频 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1602058994708441 39 | 动漫铲屎官 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1594353045050780 40 | 荔枝新闻 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1558731555412280 41 | 武侠超新世界 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1613833957799396 -------------------------------------------------------------------------------- /crawler_sys/framework/config/key_customer.ini: -------------------------------------------------------------------------------- 1 | [haokan] 2 | 经视大直播 = https://haokan.hao123.com/haokan/wiseauthor?app_id=1610033451422491 3 | 湖北卫视资讯站 = https://haokan.hao123.com/haokan/wiseauthor?app_id=3513 4 | 湖北经视 = https://haokan.hao123.com/haokan/wiseauthor?app_id=1573243419235544 5 | 湖北卫视长江新闻号 = https://haokan.hao123.com/haokan/wiseauthor?app_id=1588754674509461 6 | 湖北卫视非正式会谈 = https://haokan.hao123.com/haokan/wiseauthor?app_id=1609570087894225 7 | 非正式课堂 = https://haokan.hao123.com/haokan/wiseauthor?app_id=1596696077980564 8 | 大王小湖北卫视 = https://haokan.hao123.com/haokan/wiseauthor?app_id=1609563211193403 9 | 长江云 = https://haokan.hao123.com/haokan/wiseauthor?app_id=1604613511192710 10 | 调解面对面life = https://haokan.hao123.com/haokan/wiseauthor?app_id=1578861535741379 11 | 我为喜剧疯狂 = https://haokan.hao123.com/haokan/wiseauthor?app_id=1610026230407548 12 | 湖北调解现场 = https://haokan.hao123.com/haokan/wiseauthor?app_id=1609575047119026 -------------------------------------------------------------------------------- /crawler_sys/framework/config/legal_platforms.ini: -------------------------------------------------------------------------------- 1 | [legal_platforms_to_update_production] 2 | legal_platforms = 腾讯新闻,kwai,miaopai,new_tudou,toutiao,haokan,腾讯视频,网易新闻,pearvideo,央视新闻+,人民日报,看了吗,youtube,facebook,新华社,youku,iqiyi,,bilibili,抖音,toutiao_microheadlines,toutiao_article 3 | -------------------------------------------------------------------------------- /crawler_sys/framework/config/list_page_urls.ini: -------------------------------------------------------------------------------- 1 | [new_tudou] 2 | 推荐 = http://www.tudou.com/api/getfeeds?secCateId=10016&utdid=T8v9EQPOimUCAXL%2FAz0YrDOB&page_size=24 3 | 乐活 = http://www.tudou.com/api/getfeeds?secCateId=10195&utdid=XA2EFIGslWoCAWp4y3KXcZh7&page_size=24 4 | 搞笑 = 
http://www.tudou.com/api/getfeeds?secCateId=622736331&utdid=XA2EFIGslWoCAWp4y3KXcZh7&page_size=24 5 | 影视 = http://www.tudou.com/api/getfeeds?secCateId=622769673&utdid=XA2EFIGslWoCAWp4y3KXcZh7&page_size=24 6 | 动漫 = http://www.tudou.com/api/getfeeds?secCateId=10116&utdid=XA2EFIGslWoCAWp4y3KXcZh7&page_size=24 7 | 身边 = http://www.tudou.com/api/getfeeds?secCateId=622621940&utdid=XA2EFIGslWoCAWp4y3KXcZh7&page_size=24 8 | 综娱 = http://www.tudou.com/api/getfeeds?secCateId=10198&utdid=XA2EFIGslWoCAWp4y3KXcZh7&page_size=24 9 | 音乐 = http://www.tudou.com/api/getfeeds?secCateId=622336449&utdid=XA2EFIGslWoCAWp4y3KXcZh7&page_size=24 10 | 游戏 = http://www.tudou.com/api/getfeeds?secCateId=10051&utdid=XA2EFIGslWoCAWp4y3KXcZh7&page_size=24 11 | 12 | [腾讯视频] 13 | 音乐 = http://v.qq.com/x/list/music 14 | 新闻 = http://v.qq.com/x/list/news 15 | 军事 = http://v.qq.com/x/list/military 16 | 娱乐 = http://v.qq.com/x/list/ent 17 | 体育 = http://v.qq.com/x/list/sports 18 | 游戏 = http://v.qq.com/x/list/games 19 | 搞笑 = http://v.qq.com/x/list/fun 20 | 时尚 = http://v.qq.com/x/list/fashion 21 | 生活 = http://v.qq.com/x/list/life 22 | 母婴 = http://v.qq.com/x/list/baby 23 | 汽车 = http://v.qq.com/x/list/auto 24 | 科技 = http://v.qq.com/x/list/tech 25 | 教育 = http://v.qq.com/x/list/education 26 | 财经 = http://v.qq.com/x/list/finance 27 | 房产 = http://v.qq.com/x/list/house 28 | 旅游 = http://v.qq.com/x/list/travel 29 | 王者荣耀 = http://v.qq.com/x/list/kings 30 | 31 | [toutiao] 32 | #only one list page and don't rely on list page url 33 | list_url = toutiao 34 | 35 | [iqiyi] 36 | 纪录片 = http://list.iqiyi.com/www/3/-------------4-fangyucheng-2-iqiyi--.html 37 | 游戏 = http://list.iqiyi.com/www/8/-------------4-fangyucheng-2-iqiyi--.html 38 | 资讯 = http://list.iqiyi.com/www/25/-------------4-fangyucheng-2-iqiyi-1-.html 39 | 娱乐 = http://list.iqiyi.com/www/7/-------------4-fangyucheng-2-iqiyi-1-.html 40 | 财经 = http://list.iqiyi.com/www/24/-------------4-fangyucheng-2-iqiyi--.html 41 | 片花-电影 = http://list.iqiyi.com/www/10/1007-------------4-fangyucheng-2--1-.html 42 | 片花-电视剧 = http://list.iqiyi.com/www/10/1006-------------4-fangyucheng-2--1-.html 43 | 音乐 = http://list.iqiyi.com/www/5/-------------4-fangyucheng-2-iqiyi--.html 44 | 军事 = http://list.iqiyi.com/www/28/-------------4-fangyucheng-2-iqiyi-1-.html 45 | 教育 = http://list.iqiyi.com/www/12/-------------4-fangyucheng-2-iqiyi-1-.html 46 | 体育 = http://list.iqiyi.com/www/17/-------------4-fangyucheng-2-iqiyi--.html 47 | 48 | [youku] 49 | 全部 = http://list.youku.com/category/video/c_0_d_1_s_2_p_fangyucheng.html 50 | 音乐 = http://list.youku.com/category/show/c_95_s_5_d_1_p_fangyucheng.html 51 | 资讯 = http://list.youku.com/category/video/c_91_d_1_s_2_p_fangyucheng.html 52 | 搞笑 = http://list.youku.com/category/video/c_94_d_1_s_2_p_fangyucheng.html 53 | 生活 = http://list.youku.com/category/video/c_103_d_1_s_2_p_fangyucheng.html 54 | 汽车 = http://list.youku.com/category/video/c_104_d_1_s_2_p_fangyucheng.html 55 | 科技 = http://list.youku.com/category/video/c_105_d_1_s_2_p_fangyucheng.html 56 | 时尚 = http://list.youku.com/category/video/c_89_d_1_s_2_p_fangyucheng.html 57 | 亲子 = http://list.youku.com/category/video/c_90_d_1_s_2_p_fangyucheng.html 58 | 旅游 = http://list.youku.com/category/video/c_88_d_1_s_2_p_fangyucheng.html 59 | 微电影 = http://list.youku.com/category/video/c_171_d_1_s_2_p_2_fangyucheng.html 60 | 网剧 = http://list.youku.com/category/video/c_172_d_1_s_2_p_2_fangyucheng.html 61 | 拍客 = http://list.youku.com/category/video/c_174_d_1_s_2_p_2_fangyucheng.html 62 | 创意视频 = 
http://list.youku.com/category/video/c_175_d_1_s_2_p_2_fangyucheng.html 63 | 自拍 = http://list.youku.com/category/video/c_176_d_1_s_2_p_2_fangyucheng.html 64 | 广告 = http://list.youku.com/category/video/c_102_d_1_s_2_p_2_fangyucheng.html -------------------------------------------------------------------------------- /crawler_sys/framework/config/redis_key_register.ini: -------------------------------------------------------------------------------- 1 | [v_qq] 2 | list_page_html = v_qq_list_page_html 3 | -------------------------------------------------------------------------------- /crawler_sys/framework/config/search_keywords - 副本.ini: -------------------------------------------------------------------------------- 1 | # For every keyword, there should be an search_pages value, in the same order 2 | # There should be NO SPACE around comma, because in some cases, there will be space within a keyword, 3 | # space cannot be just stripped out. 4 | 5 | [腾讯新闻] 6 | keywords = 致敬中国英雄 7 | search_pages = 20 8 | 9 | [腾讯视频] 10 | keywords = 致敬中国英雄 11 | search_pages = 20 12 | 13 | [new_tudou] 14 | keywords = 致敬中国英雄 15 | search_pages = 20 16 | 17 | [toutiao] 18 | keywords = 致敬中国英雄 19 | search_pages = 20 20 | 21 | [youku] 22 | keywords = 致敬中国英雄 23 | search_pages = 2 24 | 25 | [pearvideo] 26 | keywords = 任正非 BBC 27 | search_pages = 2 28 | 29 | [bilibili] 30 | keywords = 任正非 BBC 31 | search_pages = 2 32 | -------------------------------------------------------------------------------- /crawler_sys/framework/config/search_keywords.ini: -------------------------------------------------------------------------------- 1 | # For every keyword, there should be an search_pages value, in the same order 2 | # There should be NO SPACE around comma, because in some cases, there will be space within a keyword, 3 | # space cannot be just stripped out. 
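# Illustrative example only (not a real section): each platform block pairs the
# two keys positionally, so the Nth value of search_pages is the search depth
# used for the Nth keyword.
# [some_platform]
# keywords = 关键词A,关键 词B,关键词C
# search_pages = 20,5,2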
4 | 5 | [腾讯新闻] 6 | keywords = 2019东方卫视春晚,2019东方卫视跨年演唱会,BesTV百视通,SMG摄界,SMG阳阳STUDIO,第一财经,第一财经《财经早班车》,东方卫视,东方卫视-东方新闻,东方卫视-看东方,东方午新闻,动感101TV,话匣子,酱紫娱乐,交叉点看,究竟视频,剧说有毒,看东方,看看新闻Knews,可凡倾听,青春旅社,上海电视台新闻综合频道,新闻坊,游戏风云gamefy,最美公路,北京时间,时间财经,时间测评,时间国际视频,时间新闻,时间新闻视频,时间直播,大揭秘,大王小王湖北卫视,非正式课堂,湖北电视台《钓鱼频道》,湖北广电纪录片部,湖北经视,湖北调解面对面,经视大直播,经视好吃佬,经视乐生活,经视人家,调解现场,问新闻,笑啦,长江新闻号,长江新闻号湖北卫视,长江云,《奇兵神犬》,《我是大侦探》,风影车都,湖南电视剧频道,湖南电视台《风影车都》,湖南电影频道,湖南都市频道,湖南广播电视台茶频道,湖南经视,湖南卫视,湖南卫视芒果云,湖南卫视天天向上,湖南娱乐,幻乐之城,金鹰卡通卫视,快乐垂钓频道,芒果V直播,芒果都市,平民英雄,亲爱的客栈,亲爱的客栈第2季,我家那小子,我是未来,我想和你唱,欲望都市show,中餐厅,中餐厅第二季,江苏卫视官方帐号,江苏卫视幸福剧场,江苏新闻,江苏新闻广播,金曲捞,荔枝新闻,南京零距离,无限歌谣季,新闻眼,缘来非诚勿扰,动历史,老板联播,梨北京,梨青岛,梨视频,梨视频ING直播,梨视频微视,梨视频游戏,一手Video,澎湃视频,澎湃新闻,第一现场,深圳卫视,深圳卫视《军情直播间》,深圳卫视《正午30分》,深圳卫视军情直播间,深圳卫视正午30分,深圳卫视直播港澳台,正午30分,直播港澳台,新京报,新京报动新闻,新京报经济新闻,新京报书评周刊,1818黄金眼,2019浙江卫视领跑演唱会,FM988浙江新闻广播,奔跑吧兄弟 第4季,大冰小将,范大姐帮忙,钱江视频,熟悉的味道 第2季,喜剧总动员,喜剧总动员 第2季,小强实验室,异口同声,浙江广播电视台,浙江经视新闻,浙江台车行天下,浙江卫视,浙江卫视《新闻深一度》,浙江新闻频道,浙江之声,中国蓝TV,中国蓝新闻,中国蓝新闻蓝媒视频,看看新闻Knews,任正非 BBC 7 | search_pages = 20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,30,30,2 8 | 9 | [腾讯视频] 10 | keywords = 任正非 BBC 11 | search_pages = 2 12 | 13 | [new_tudou] 14 | keywords = 任正非 BBC 15 | search_pages = 2 16 | 17 | [toutiao] 18 | keywords = 任正非 BBC 19 | search_pages = 2 20 | 21 | [youku] 22 | keywords = 任正非 BBC 23 | search_pages = 2 24 | 25 | [pearvideo] 26 | keywords = 任正非 BBC 27 | search_pages = 2 28 | 29 | [bilibili] 30 | keywords = 任正非 BBC 31 | search_pages = 2 32 | -------------------------------------------------------------------------------- /crawler_sys/framework/update_data_in_target_releasers_single_thread.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon May 14 17:52:02 2018 4 | 5 | Find urls in given releaser page, and write first batch data into es. 6 | Everytime this program runs, two things will happen: 7 | 1 All video urls in given releaser page will be fetched and put into redis url pool, 8 | 2 All data related to 1 will be fetched and stored into es. 9 | 10 | Data in es will be update when run this program once. 
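Illustrative invocation (hypothetical values, using only the argparse options
defined below):

    python update_data_in_target_releasers_single_thread.py -p toutiao -p 腾讯视频 -n 30

-p may be repeated to crawl several platforms in one run; -n caps how many pages
are scrolled on each releaser page.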
11 | 12 | @author: hanye 13 | """ 14 | 15 | import argparse 16 | from crawler_sys.framework.platform_crawler_register import get_crawler 17 | from crawler_sys.framework.es_target_releasers import get_releaserUrls_from_es 18 | from crawler_sys.utils.parse_bool_for_args import parse_bool_for_args 19 | 20 | parser = argparse.ArgumentParser(description='Specify a platform name.') 21 | parser.add_argument('-p', '--platform', default=[], action='append', 22 | help=('Pass platform names, they will be assembled in python list.')) 23 | parser.add_argument('-n', '--max_page', default=30, type=int, 24 | help=('The max page numbers to be scroll for each releaser url, ' 25 | 'must be an int value, default to 30.')) 26 | parser.add_argument('-f', '--output_file_path', default='', type=str, 27 | help=('Specify output file path, default None.')) 28 | parser.add_argument('-r', '--push_to_redis', default='False', type=str, 29 | help=('Write urls to redis or not, default to True')) 30 | parser.add_argument('-w', '--output_to_es_raw', default='True', type=str, 31 | help=('Write data into es or not, default to True')) 32 | parser.add_argument('-g', '--output_to_es_register', default='True', type=str, 33 | help=('Write data into es or not, default to True')) 34 | parser.add_argument('-l', '--releasers', default=[], action='append', 35 | help=('Write data into es or not, default to True')) 36 | parser.add_argument('-t', '--target_index', default="target_releasers", type=str, 37 | help=('target_releasers_org or target_releasers')) 38 | args = parser.parse_args() 39 | 40 | if args.platform != []: 41 | platforms = args.platform 42 | else: 43 | platforms = [ 44 | 'toutiao', 45 | '腾讯视频', 46 | 'iqiyi', 47 | 'youku', 48 | '腾讯新闻', 49 | 'haodkan', 50 | 'new_tudou', 51 | "kwai" 52 | ] 53 | releaser_page_num_max = args.max_page 54 | output_f_path = args.output_file_path 55 | if output_f_path == '': 56 | output_to_file = False 57 | else: 58 | output_to_file = True 59 | 60 | push_to_redis = parse_bool_for_args(args.push_to_redis) 61 | output_to_es_raw = parse_bool_for_args(args.output_to_es_raw) 62 | output_to_es_register = parse_bool_for_args(args.output_to_es_register) 63 | 64 | releaser_Lst = args.releasers 65 | 66 | for platform in platforms: 67 | # 2 get releaserUrl list on each platform from target-releasers index 68 | if not releaser_Lst: 69 | releaserUrl_Lst = get_releaserUrls_from_es(platform=platform,target_index=args.target_index) 70 | else: 71 | releaserUrl_Lst = [] 72 | for releaser in releaser_Lst: 73 | releaserUrl_Lst.extend(get_releaserUrls_from_es(platform=platform,target_index=args.target_index)) 74 | if releaserUrl_Lst == []: 75 | print('Get empty releaserUrl_Lst for platform %s' % platform) 76 | continue 77 | # 3 get crawler for this platform 78 | Platform_crawler = get_crawler(platform) 79 | if Platform_crawler != None: 80 | crawler_instant = Platform_crawler() 81 | else: 82 | print('Failed to get crawler for platform %s' % platform) 83 | continue 84 | # 4 for each releaserUrl, get data on the releaser page identified by this 85 | # releaser url 86 | for releaserUrl in releaserUrl_Lst: 87 | crawler_instant.releaser_page(releaserUrl[0], 88 | output_to_file=output_to_file, 89 | filepath=output_f_path, 90 | releaser_page_num_max=releaser_page_num_max, 91 | output_to_es_raw=output_to_es_raw, 92 | output_to_es_register=output_to_es_register, 93 | push_to_redis=push_to_redis 94 | ) 95 | -------------------------------------------------------------------------------- /crawler_sys/framework/video_fields_std.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Feb 26 17:57:38 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | class Std_fields_video: 9 | def __init__(self, data_provider=None): 10 | if data_provider==None: 11 | data_provider='BDD' 12 | self.video_data={ 13 | 'platform': None, 14 | 'channel': None, 15 | # 'channel_url': None, 16 | # 'channel_subdomain': None, 17 | 'describe': None, 18 | 'title': None, 19 | 'url': None, 20 | 'duration': 0, 21 | 'releaser': None, 22 | 'play_count': None, 23 | 'favorite_count': 0, 24 | 'comment_count': 0, 25 | # 'dislike_count': None, 26 | 'repost_count': None, 27 | 'isOriginal': None, 28 | 'data_provider': data_provider, 29 | 'video_id': None, 30 | 31 | 'releaserUrl': None, 32 | 'release_time': 0, 33 | 'fetch_time': 0, 34 | } 35 | 36 | def field_type_correct(self): 37 | def none_is_allowed(field, field_type): 38 | if field not in self.video_data: 39 | return 'field "%s" is absent' % field 40 | is_correct=(isinstance(self.video_data[field], field_type) 41 | or self.video_data[field]==None) 42 | if is_correct: 43 | return True 44 | else: 45 | return 'field "%s" should be of [%s] type or None' % (field, field_type.__name__) 46 | 47 | def none_is_not_allowed(field, field_type): 48 | if field not in self.video_data: 49 | return 'FATAL: field "%s" is NOT FOUND!' % field 50 | else: 51 | is_correct=isinstance(self.video_data[field], field_type) 52 | if is_correct: 53 | return True 54 | else: 55 | return 'field "%s" should be of [%s] type' % (field, field_type.__name__) 56 | 57 | platform_chk=none_is_not_allowed('platform', str) 58 | duration_chk=none_is_not_allowed('duration', int) 59 | play_count_chk=none_is_not_allowed('play_count', int) 60 | favorite_count_chk=none_is_allowed('favorite_count', int) 61 | comment_count_chk=none_is_allowed('comment_count', int) 62 | # dislike_count_chk=none_is_allowed('dislike_count', int) 63 | repost_count_chk=none_is_allowed('repost_count', int) 64 | isOriginal_chk=none_is_allowed('isOriginal', bool) 65 | release_time_chk=none_is_allowed('release_time', int) 66 | fetch_time_chk=none_is_not_allowed('fetch_time', int) 67 | 68 | type_chk={ 69 | 'platform': platform_chk, 70 | 'duration': duration_chk, 71 | 'play_count': play_count_chk, 72 | 'favorite_count': favorite_count_chk, 73 | 'comment_count': comment_count_chk, 74 | # 'dislike_count': dislike_count_chk, 75 | 'repost_count': repost_count_chk, 76 | 'isOriginal': isOriginal_chk, 77 | 'release_time': release_time_chk, 78 | 'fetch_time': fetch_time_chk, 79 | } 80 | 81 | false_time_warning=[] 82 | for field in type_chk: 83 | if type_chk[field]!=True: 84 | false_time_warning.append(type_chk[field]) 85 | 86 | if false_time_warning==[]: 87 | return True 88 | else: 89 | warn_msg='WARNING: ' + '\n'.join(false_time_warning) 90 | return warn_msg 91 | 92 | 93 | 94 | if __name__=='__main__': 95 | a=Std_fields_video() 96 | a.video_data['platform']='腾讯视频' 97 | a.video_data['duration']=62 98 | a.video_data['title']='This is test video' 99 | 100 | # a.video_data={'duration': 62} 101 | print(a.field_type_correct()) 102 | -------------------------------------------------------------------------------- /crawler_sys/framework/write_releaserUrl_to_crawler_redis.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # @Time : 2019/9/11 11:48 3 | # @Author : litao 4 | import redis 5 | import json 6 | import datetime 7 | from redis.sentinel import 
Sentinel 8 | 9 | sentinel = Sentinel([('192.168.17.65', 26379), 10 | ('192.168.17.66', 26379), 11 | ('192.168.17.67', 26379) 12 | ], socket_timeout=0.5) 13 | # 查看master节点 14 | master = sentinel.discover_master('ida_redis_master') 15 | # 查看slave 节点 16 | slave = sentinel.discover_slaves('ida_redis_master') 17 | # 连接数据库 18 | rds = sentinel.master_for('ida_redis_master', socket_timeout=0.5, db=2, decode_responses=True) 19 | 20 | 21 | def write_project_to_redis(project): 22 | rds.rpush("project", project) 23 | 24 | 25 | def write_releaserUrl_to_redis(project, data,email_dic): 26 | project_name, duration = project.split("/") 27 | data_dict_for_redis = {"duration": duration, "data": json.dumps(data),"email":json.dumps(email_dic)} 28 | rds.hmset(project, data_dict_for_redis) 29 | write_project_to_redis(project) 30 | 31 | 32 | def read_csv_write_into_redis(project_name, csv_file, crawler_days,email_dic=None): 33 | try: 34 | crawler_lis = [] 35 | with open(csv_file, 'r', encoding="gb18030")as f: 36 | header_Lst = f.readline().strip().split(',') 37 | for line in f: 38 | line_Lst = line.strip().split(',') 39 | line_dict = dict(zip(header_Lst, line_Lst)) 40 | platform = line_dict['platform'] 41 | releaserUrl = line_dict['releaserUrl'] 42 | crawler_lis.append(platform+"&"+releaserUrl) 43 | write_releaserUrl_to_redis("{0}/{1}".format(project_name, str(crawler_days)), crawler_lis,email_dic=email_dic) 44 | return True 45 | except: 46 | return False 47 | 48 | 49 | def down_task(): 50 | res = rds.hgetall("task_down") 51 | if res: 52 | rds.delete("task_down") 53 | return res 54 | else: 55 | return None 56 | 57 | 58 | if __name__ == "__main__": 59 | # 传入的email_dic 格式如下 60 | mapping_dic = { 61 | "taskname": "127869453", 62 | "file_path": None, 63 | "data_str": None, 64 | "email_group": ["litao@csm.com.cn"], 65 | "email_msg_body_str": "任务已完成", 66 | "title_str": "任务已完成", 67 | "cc_group": [], 68 | "sender": "litao@csm.com.cn" 69 | } 70 | crawler_lis = [] 71 | file = r'D:\work_file\发布者账号\一次性需求附件\大客户总台垂类及主持人增加账号.csv' 72 | read_csv_write_into_redis("task15", file, "1577808000000",email_dic=mapping_dic) 73 | print(down_task()) 74 | # with open(file, 'r')as f: 75 | # header_Lst = f.readline().strip().split(',') 76 | # for line in f: 77 | # line_Lst = line.strip().split(',') 78 | # line_dict = dict(zip(header_Lst,line_Lst)) 79 | # platform = line_dict['platform'] 80 | # releaserUrl = line_dict['releaserUrl'] 81 | # crawler_lis.append(platform+"_"+releaserUrl) 82 | # write_releaserUrl_to_redis("project_name/1564588800000/3",crawler_lis) 83 | -------------------------------------------------------------------------------- /crawler_sys/framework/write_releasers_to_redis.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # @Time : 2020/4/24 14:15 3 | # @Author : litao 4 | # -*- coding: utf-8 -*- 5 | """ 6 | Created on Mon May 14 17:52:02 2018 7 | 8 | Find urls in given releaser page, and write first batch data into es. 9 | Everytime this program runs, two things will happen: 10 | 1 All video urls in given releaser page will be fetched and put into redis url pool, 11 | 2 All data related to 1 will be fetched and stored into es. 12 | 13 | Data in es will be update when run this program once. 
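Illustrative invocation (hypothetical values, using only the argparse options
defined below):

    python write_releasers_to_redis.py -p toutiao -fre 3 -s 8 -d 3

-p may be repeated; when -fre is 3 or higher the task dicts are pushed with a
non-zero proxies_num so that downstream crawlers fetch through the proxy pool.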
14 | 15 | @author: hanye 16 | """ 17 | 18 | import sys 19 | import argparse,copy 20 | from crawler.crawler_sys.framework.es_target_releasers import get_releaserUrls_from_es 21 | from crawler.crawler_sys.framework.platform_crawler_register import platform_crawler_reg 22 | import redis,json 23 | 24 | 25 | from redis.sentinel import Sentinel 26 | sentinel = Sentinel([('192.168.17.65', 26379), 27 | ('192.168.17.66', 26379), 28 | ('192.168.17.67', 26379) 29 | ],socket_timeout=0.5) 30 | # 查看master节点 31 | master = sentinel.discover_master('ida_redis_master') 32 | # 查看slave 节点 33 | slave = sentinel.discover_slaves('ida_redis_master') 34 | # 连接数据库 35 | rds = sentinel.master_for('ida_redis_master', socket_timeout=0.5, db=1, decode_responses=True) 36 | # rds = redis.StrictRedis(host='192.168.17.60', port=6379, db=1, decode_responses=True) 37 | 38 | parser = argparse.ArgumentParser(description='Specify a platform name.') 39 | parser.add_argument('-p', '--platform', default=[], action='append', 40 | help=('Pass platform names, they will be assembled in python list.')) 41 | parser.add_argument('-pj', '--project_tags', default=[], action='append', 42 | help=('Pass platform names, they will be assembled in python list.')) 43 | parser.add_argument('-n', '--max_page', default=2, type=int, 44 | help=('The max page numbers to be scroll for each releaser url, ' 45 | 'must be an int value, default to 30.')) 46 | parser.add_argument('-fre', '--frequency', default=1, type=int, 47 | help=('choose a frequency to retrieve releaserUrl,' 48 | '1, 3 or 9 is legal number, default 1')) 49 | parser.add_argument('-proxies', '--proxies', default=0, type=int, 50 | help=('Crawler proxies_num')) 51 | parser.add_argument('-d', '--date', default=3, type=int, 52 | help=('Crawler backtracking data time')) 53 | parser.add_argument('-s', '--processes_num', default=5, type=int, 54 | help=('Processes number to be used in multiprocessing')) 55 | parser.add_argument('-article', '--article', default=0, type=int, 56 | help=('is article page')) 57 | args = parser.parse_args() 58 | 59 | 60 | if args.platform != []: 61 | platforms = args.platform 62 | else: 63 | print('platform must be input') 64 | sys.exit(0) 65 | 66 | 67 | releaser_page_num_max = args.max_page 68 | frequency = args.frequency 69 | if frequency == '': 70 | frequency = None 71 | 72 | processes_num = args.processes_num 73 | frequency = args.frequency 74 | print(frequency) 75 | if frequency == 0: 76 | frequency = None 77 | 78 | 79 | kwargs_dict = { 80 | "proxies_num": 0, 81 | "date":args.date, 82 | } 83 | if frequency: 84 | if frequency >= 3: 85 | kwargs_dict["proxies_num"] = 3 86 | if args.proxies: 87 | kwargs_dict["proxies_num"] = args.proxies 88 | is_article = args.article 89 | 90 | def write_project_to_redis(platform, data): 91 | rds.rpush(platform, data) 92 | 93 | 94 | def write_releaserUrl_to_redis(data_dic): 95 | write_project_to_redis(data_dic["platform"], json.dumps(data_dic)) 96 | 97 | 98 | for platform in platforms: 99 | # 2 get releaserUrl list on each platform from target-releasers index 100 | releaserUrl_Lst = get_releaserUrls_from_es(platform=platform, frequency=frequency,target_index="target_releasers",project_tags=args.project_tags) 101 | if is_article: 102 | platform = platform + "_article" 103 | rds.hset("process_num",platform,processes_num) 104 | if releaserUrl_Lst == []: 105 | 106 | print('Get empty releaserUrl_Lst for platform %s' % platform) 107 | continue 108 | # 3 get crawler for this platform 109 | for releaserUrl,releaser in releaserUrl_Lst: 110 | 
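# Each (releaserUrl, releaser) pair becomes one task dict. write_releaserUrl_to_redis()
# below serializes it to JSON and rpush-es it onto a redis list keyed by platform name,
# from which (presumably) the site_crawler_by_redis workers pop tasks for that platform.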
push_dic = { 111 | "releaserUrl":releaserUrl, 112 | "releaser":releaser, 113 | "platform":platform, 114 | } 115 | push_dic.update(kwargs_dict) 116 | write_releaserUrl_to_redis(push_dic) 117 | -------------------------------------------------------------------------------- /crawler_sys/hot_words_crawler/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # @Time : 2020/2/25 15:20 3 | # @Author : litao 4 | -------------------------------------------------------------------------------- /crawler_sys/hot_words_crawler/crawler_main_task.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # @Time : 2020/3/5 15:34 3 | # @Author : litao 4 | import datetime 5 | from crawler.crawler_sys.hot_words_crawler.crawler_wangyi_news import Crawler_WangYi_News 6 | from crawler.crawler_sys.hot_words_crawler.crawler_v_qq import Crawler_v_qq 7 | from crawler.crawler_sys.hot_words_crawler.crawler_haokan import CrawlerHaoKan 8 | from crawler.crawler_sys.hot_words_crawler.crawler_douyin import Crawler_douyin 9 | from crawler.crawler_sys.hot_words_crawler.crawler_qq_news import Crawler_Qq_News 10 | from crawler.crawler_sys.hot_words_crawler.crawler_new_tudou import CrawlerNewTudou 11 | from crawler.crawler_sys.hot_words_crawler.crawler_toutiao import Crawler_toutiao 12 | from crawler.crawler_sys.hot_words_crawler.crawler_kwai import Crawler_kwai 13 | 14 | from elasticsearch import Elasticsearch 15 | from elasticsearch.helpers import scan 16 | 17 | es = Elasticsearch(hosts='192.168.17.11', port=80, 18 | http_auth=('crawler', 'XBcasfo8dgfs')) 19 | 20 | now = int(datetime.datetime.now().timestamp() * 1e3) - 86400000 21 | platform_dic = { 22 | "kwai": Crawler_kwai(), 23 | "toutiao": Crawler_toutiao(), 24 | "haokan": CrawlerHaoKan(), 25 | "抖音": Crawler_douyin(), 26 | "腾讯视频": Crawler_v_qq(), 27 | "腾讯新闻": Crawler_Qq_News(), 28 | "new_tudou": CrawlerNewTudou(), 29 | "网易新闻": Crawler_WangYi_News() 30 | } 31 | 32 | for platform in platform_dic: 33 | res = platform_dic[platform].get_hot_words() 34 | print(platform, res) 35 | 36 | search_body = { 37 | "query": { 38 | "bool": { 39 | "filter": [ 40 | # {"term":{"platform.keyword":"腾讯视频"}}, 41 | {"range": {"fetch_time": {"gte": now}}} 42 | ] 43 | } 44 | } 45 | } 46 | keyword_scan = scan(client=es, index='short-video-hotwords', query=search_body, doc_type="doc") 47 | res_list = [] 48 | for res in keyword_scan: 49 | res_list.append(res["_source"]) 50 | 51 | for res in res_list: 52 | try: 53 | platform_dic[res["platform"]].get_hot_videos(**res) 54 | except Exception as e: 55 | print(res["platform"], e) 56 | continue 57 | -------------------------------------------------------------------------------- /crawler_sys/hot_words_crawler/crawler_new_tudou.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # @Time : 2020/2/25 15:23 3 | # @Author : litao 4 | import requests 5 | import json, re, datetime,urllib 6 | from crawler.crawler_sys.utils.output_results import retry_get_url 7 | from crawler.crawler_sys.utils.output_results import hot_words_output_result 8 | 9 | 10 | class CrawlerNewTudou(object): 11 | def __init__(self): 12 | self.platform = "new_tudou" 13 | self.headers = { 14 | "Cookie": "", 15 | "User-Agent": "Tudou;6.39.1;Android;5.1.1;OPPO R11", 16 | "Accept-Encoding": "gzip,deflate", 17 | "Connection": "close", 18 | "Host": "apis.tudou.com", 19 | } 20 | def get_hot_words(self): 21 | 
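# Fetch the current hot-search keywords from Tudou's search/v1/hot endpoint, wrap each
# one as {platform, title, fetch_time}, and write the batch via hot_words_output_result().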
bulk_list = [] 22 | 23 | url = "https://apis.tudou.com/search/v1/hot?_t_={0}&e=md5&_s_=9a4abf3a92efad0605f8e31481327014&operator=CHINA+MOBILE_46007&network=WIFI".format( 24 | int(datetime.datetime.now().timestamp())) 25 | res = retry_get_url(url,proxies=3,headers=self.headers) 26 | res_json = res.json() 27 | for title in res_json["result"]["search"]["data"]: 28 | dic = { 29 | "platform": self.platform, 30 | "title": title["keyword"], 31 | "fetch_time":int(datetime.datetime.now().timestamp()*1e3) 32 | } 33 | bulk_list.append(dic) 34 | hot_words_output_result(bulk_list) 35 | return True 36 | 37 | def get_hot_videos(self,title=None,max_page=10,**kwargs): 38 | page = 1 39 | while page <= max_page: 40 | get_dic = { 41 | "keyword": title, 42 | # "pid": "6c23a6957198fad2", 43 | # "guid": "2139ff131a8a7d9ef7d3014cc8b97010", 44 | "mac": "", 45 | "imei": "null", 46 | "ver": "6.39.1", 47 | "_t_": int(datetime.datetime.now().timestamp()), 48 | "e": "md5", 49 | # "_s_": "b905d3a9738d7d2f815687428563d8f7", 50 | "operator": "CHINA+MOBILE_46007", 51 | "network": "WIFI", 52 | "ftype": "0", 53 | "cateId": "0", 54 | "seconds": "0", 55 | "seconds_end": "0", 56 | "ob": "", 57 | "pg": str(page), 58 | "pz": "30", 59 | # "aaid": "1.58259884569785E+20", 60 | "brand": "OPPO", 61 | "btype": "OPPO+R11", 62 | "sdkver": "2", 63 | "apad": "0", 64 | # "utdid": "XkjV9GsfBysDACyQ2%2BiF8MOw", 65 | "srid": "1", 66 | "userType": "guest", 67 | } 68 | requests_res = retry_get_url("https://apis.tudou.com/search/v2/integration?%s"%urllib.parse.urlencode(get_dic),headers=self.headers,proxies=3) 69 | requests_json = requests_res.json() 70 | page += 1 71 | print(requests_json) 72 | for data in requests_json["results"]["ugc"]["data"]: 73 | print(data) 74 | 75 | 76 | if __name__ == "__main__": 77 | crawler = CrawlerNewTudou() 78 | # crawler.get_hot_words() 79 | crawler.get_hot_videos("范冰冰蛋糕裙") -------------------------------------------------------------------------------- /crawler_sys/hot_words_crawler/crawler_wangyi_news.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # @Time : 2020/3/2 11:07 3 | # @Author : litao 4 | # -*- coding:utf-8 -*- 5 | # @Time : 2020/2/28 12:09 6 | # @Author : litao 7 | 8 | 9 | import requests 10 | import json, re, datetime, urllib 11 | from crawler.crawler_sys.utils.output_results import retry_get_url 12 | from crawler.crawler_sys.utils.output_results import hot_words_output_result, output_result 13 | from crawler.crawler_sys.utils.trans_strtime_to_timestamp import trans_strtime_to_timestamp 14 | from write_data_into_es.func_cal_doc_id import * 15 | import base64 16 | from crawler.crawler_sys.site_crawler.crawler_wangyi_news import Crawler_wangyi_news as Crawler_wy 17 | crawler_qq_video_page = Crawler_wy().video_page 18 | 19 | 20 | class Crawler_WangYi_News(object): 21 | def __init__(self): 22 | self.platform = "网易新闻" 23 | timestamp = int(datetime.datetime.now().timestamp() * 1e3) 24 | self.headers = { 25 | "data4-Sent-Millis": str(timestamp), 26 | "Add-To-Queue-Millis": str(timestamp), 27 | "User-D": "2zx5YfHmoBb72ayxYpQVUg==", 28 | "User-N": "HPcUw15+Yla9nvIP1c9vbqrHfvh/PCmpfK2DVDjsFFGhp4IV17bdU7hTwNc3Kfe3gVZiNSrnIe+bsZBFoMsbZQ==", 29 | "httpDNSIP": "101.71.145.130", 30 | "User-C": "5aS05p2h", 31 | "User-Agent": "NewsApp/32.1 Android/5.1.1 (OPPO/OPPO R11)", 32 | "X-NR-Trace-Id": "%s_355730601_866174725888628" % timestamp, 33 | "Host": "c.m.163.com", 34 | "Connection": "Keep-Alive", 35 | "Accept-Encoding": "gzip", 36 | } 37 | 38 | def 
get_hot_words(self): 39 | bulk_list = [] 40 | url = "http://c.m.163.com/nc/search/hotWord.html" 41 | page_res = retry_get_url(url, headers=self.headers, proxies=3, timeout=5) 42 | page_json = page_res.json() 43 | for data in page_json["hotWordList"]: 44 | title = data["searchWord"] 45 | if title: 46 | dic = { 47 | "platform": self.platform, 48 | "title": title, 49 | "fetch_time": int(datetime.datetime.now().timestamp() * 1e3), 50 | } 51 | bulk_list.append(dic) 52 | hot_words_output_result(bulk_list) 53 | return True 54 | 55 | 56 | def search_page(self,title): 57 | data_list = [] 58 | encodestr = base64.b64encode(title.encode('utf-8')) 59 | encodestr = str(encodestr, 'utf-8') 60 | url = "http://c.m.163.com/search/comp2/Kg%3D%3D/20/{0}.html?".format(encodestr) 61 | para = "deviceId=2zx5YfHmoBb72ayxYpQVUg%3D%3D&version=newsclient.32.1.android&channel=VDEzNDg2NDc5MDkxMDc%3D&canal=bmV3c19sZl9jcGFfMg%3D%3D&dtype=0&tabname=shipin&position=5YiX6KGo6aG26YOo&ts={0}&sign=Di3opZw%2FFIPDdgreSK4VCKlnMSpm6FPoel5LeY88RgZ48ErR02zJ6%2FKXOnxX046I&spever=FALSE&open=scheme_%E9%BB%98%E8%AE%A4&openpath=/video/VT5O1KVCO".format(str(int(datetime.datetime.now().timestamp()))) 62 | res = retry_get_url(url+para, headers=self.headers, timeout=5, proxies=3) 63 | page_text = res.json() 64 | for data in page_text["doc"]["result"]: 65 | print(data) 66 | data_list.append(data) 67 | output_result(result_Lst=data_list, 68 | platform=self.platform, 69 | output_to_es_raw=True, 70 | ) 71 | data_list.clear() 72 | ## sign和ts为加密字段 无法解决 73 | 74 | def get_hot_videos(self, max_page=10,**kwargs): 75 | pass 76 | 77 | 78 | if __name__ == "__main__": 79 | crawler = Crawler_WangYi_News() 80 | crawler.get_hot_words() 81 | crawler.search_page("患者私自出院散步") 82 | # crawler.get_hot_videos("https://v.qq.com/x/search/?q=%E6%95%99%E8%82%B2%E9%83%A8%E5%9B%9E%E5%BA%94%E6%89%A9%E5%A4%A7%E7%A1%95%E5%A3%AB%E5%92%8C%E4%B8%93%E5%8D%87%E6%9C%AC%E6%8B%9B%E7%94%9F&stag=12",channel="教育部回应扩大硕士和专升本招生") -------------------------------------------------------------------------------- /crawler_sys/proxy_pool/func_get_proxy_form_kuaidaili.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # @Time : 2019/9/12 10:19 3 | # @Author : litao 4 | 5 | """ 6 | 私密代理使用示例 7 | 接口鉴权说明: 8 | 目前支持的鉴权方式有 "simple" 和 "hmacsha1" 两种,默认使用 "simple"鉴权。 9 | 所有方法均可添加关键字参数sign_type修改鉴权方式。 10 | """ 11 | import redis,random 12 | import kdl,requests 13 | 14 | from redis.sentinel import Sentinel 15 | 16 | sentinel = Sentinel([('192.168.17.65', 26379), 17 | ('192.168.17.66', 26379), 18 | ('192.168.17.67', 26379) 19 | ], socket_timeout=0.5) 20 | # 查看master节点 21 | master = sentinel.discover_master('ida_redis_master') 22 | # 查看slave 节点 23 | slave = sentinel.discover_slaves('ida_redis_master') 24 | # 连接数据库 25 | rds = sentinel.master_for('ida_redis_master', socket_timeout=0.5, db=7, decode_responses=True) 26 | # rds = redis.StrictRedis(host='192.168.17.60', port=6378, db=7, decode_responses=True) 27 | def get_proxy_from_redis(): 28 | try: 29 | one_proxy = rds.randomkey() 30 | username = "hanye" 31 | password = "i9mmu0a3" 32 | 33 | proxies = { 34 | "http": "http://%(user)s:%(pwd)s@%(ip)s/" % {'user': username, 'pwd': password, 'ip': one_proxy}, 35 | "https": "http://%(user)s:%(pwd)s@%(ip)s/" % {'user': username, 'pwd': password, 'ip': one_proxy} 36 | } 37 | return proxies 38 | except Exception as e: 39 | print(e) 40 | return None 41 | 42 | def func_get_proxy_to_redis(): 43 | # chance = random.random() 44 | auth = 
kdl.Auth("997803479675913", "er2siw6i58c61s387sqljvovoz8zybaq") 45 | client = kdl.Client(auth) 46 | 47 | # 获取订单到期时间, 返回时间字符串 48 | # expire_time = client.get_order_expire_time() 49 | # print("expire time", expire_time) 50 | 51 | # 获取ip白名单, 返回ip列表 52 | # ip_whitelist = client.get_ip_whitelist() 53 | # print("ip whitelist", ip_whitelist) 54 | 55 | # 设置ip白名单,参数类型为字符串或列表或元组 56 | # 成功则返回True, 否则抛出异常 57 | # client.set_ip_whitelist([]) 58 | # client.set_ip_whitelist("127.0.0.1, 192.168.0.139") 59 | # print(client.get_ip_whitelist()) 60 | # client.set_ip_whitelist(tuple()) 61 | 62 | # 提取私密代理ip, 第一个参数为提取的数量, 其他参数以关键字参数的形式传入(不需要传入signature和timestamp) 63 | # 具体有哪些参数请参考帮助中心: "https://help.kuaidaili.com/api/getdps/" 64 | # 返回ip列表 65 | # 注意:若您使用的是python2, 且在终端调用,或在文件中调用且没有加 "# -*- coding: utf-8 -*-" 的话 66 | # 传入area参数时,请传入unicode类型,如 area=u'北京,上海' 67 | # ips = client.get_dps(1, sign_type='simple', format='json', pt=2, area='北京,上海,广东') 68 | # print("dps proxy: ", ips) 69 | 70 | 71 | # 检测私密代理有效性: 返回 ip: true/false 组成的dict 72 | #ips = client.get_dps(1, sign_type='simple', format='json') 73 | # valids = client.check_dps_valid(ips) 74 | # print("valids: ", valids) 75 | 76 | # 获取私密代理剩余时间: 返回 ip: seconds(剩余秒数) 组成的dict 77 | ips = client.get_dps(1, format='json',dedup=1) 78 | seconds = client.get_dps_valid_time(ips) 79 | # print("seconds: ", seconds) 80 | for key in seconds: 81 | rds.set(key, key, ex=int(seconds[key]) - 3) 82 | 83 | # 获取计数版ip余额(仅私密代理计数版) 84 | # balance = client.get_ip_balance(sign_type='hmacsha1') 85 | # print("balance: ", balance) 86 | def proxy_test(proxies): 87 | page_url = "http://dev.kdlapi.com/testproxy/" 88 | headers = { 89 | "Accept-Encoding": "Gzip", # 使用gzip压缩传输数据让访问更快 90 | } 91 | 92 | res = requests.get(url=page_url, proxies=proxies, headers=headers) 93 | # print(res.status_code) # 获取Reponse的返回码 94 | if res.status_code == 200: 95 | print(res.content.decode('utf-8')) # 获取页面内容 96 | 97 | def get_proxy_dic(max_proxies=None): 98 | if not max_proxies: 99 | max_proxies = 8 100 | try: 101 | res = rds.dbsize() 102 | except Exception as e: 103 | print("redis error") 104 | return None 105 | if res is None: 106 | return None 107 | if res < max_proxies: 108 | func_get_proxy_to_redis() 109 | return get_proxy_from_redis() 110 | else: 111 | return get_proxy_from_redis() 112 | 113 | def get_proxy(proxies_num=None): 114 | if proxies_num: 115 | proxies = get_proxy_dic(max_proxies=proxies_num) 116 | # print("get a IP %s" % str(proxies)) 117 | return proxies 118 | else: 119 | return None 120 | 121 | if __name__ == "__main__": 122 | proxy_pool_dic = get_proxy(11) 123 | print(proxy_pool_dic) 124 | proxy_test(proxy_pool_dic) 125 | print(get_proxy_from_redis()) -------------------------------------------------------------------------------- /crawler_sys/site_crawler/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue May 15 13:59:43 2018 4 | 5 | @author: hanye 6 | """ 7 | -------------------------------------------------------------------------------- /crawler_sys/site_crawler/chromedriver: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_sys/site_crawler/chromedriver -------------------------------------------------------------------------------- /crawler_sys/site_crawler/crawler.new_tudouAPP.py: -------------------------------------------------------------------------------- 1 | # 
-*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Apr 10 17:52:08 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | 9 | import datetime 10 | import json 11 | import requests 12 | from bs4 import BeautifulSoup 13 | 14 | def get_video(target): 15 | result = [] 16 | count = 0 17 | while len(result) < target and count < 100: 18 | listurl = 'http://apis.tudou.com/homepage/v2/index/get_push.json?secCateId=622736331' 19 | get_page = requests.get(listurl) 20 | get_page.encoding = 'utf-8' 21 | page = get_page.text 22 | print('get one page') 23 | page = page.replace('true', 'True') 24 | page = page.replace('false', 'False') 25 | page_dic = json.loads(page)['entity'] 26 | for line in page_dic: 27 | midstep = line['detail'] 28 | title = midstep['base_detail']['title'] 29 | playcount = midstep['video_detail']['vv_desc'] 30 | releaser = midstep['user_detail']['name'] 31 | releaserid = midstep['user_detail']['id'] 32 | videoid = midstep['video_detail']['video_id'] 33 | duration = midstep['video_detail']['duration'] 34 | url = 'http://new-play.tudou.com/v/'+videoid 35 | get_page = requests.get(url) 36 | page = get_page.text 37 | soup = BeautifulSoup(page, 'html.parser') 38 | rt_step1 = soup.find('div', {'class':'td-play__videoinfo__details-box__time'}) 39 | rt_step2 = rt_step1.text[:-2] 40 | release_time = int(datetime.datetime.strptime(rt_step2, 41 | '%Y-%m-%d %H:%M:%S').timestamp()*1e3) 42 | D0 = {"title":title, "releaser":releaser, "release_time":release_time, 43 | "duration":duration, 'releaserid':releaserid, 'playcount':playcount} 44 | if D0 not in result: 45 | result.append(D0) 46 | print('added one video') 47 | else: 48 | count += 1 49 | print('repetition') 50 | return result 51 | 52 | 53 | 54 | if __name__=='__main__': 55 | try1 = get_video(target=200) 56 | #{'旅行':'http://apis.tudou.com/homepage/v2/index/get_push.json?secCateId=10293', 57 | #'科技':'http://apis.tudou.com/homepage/v2/index/get_push.json?secCateId=10199', 58 | #'娱乐':'http://apis.tudou.com/homepage/v2/index/get_push.json?secCateId=622726317', 59 | #'萌物':'http://apis.tudou.com/homepage/v2/index/get_push.json?secCateId=622485153'} 60 | -------------------------------------------------------------------------------- /crawler_sys/site_crawler/crawler.pepper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Apr 10 11:06:53 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | import requests 9 | import datetime 10 | 11 | 12 | 13 | class crawler_pepper(): 14 | 15 | def except_followers(self): 16 | rank=[] 17 | urls=['http://webh.huajiao.com/rank/recv?&type=day','http://webh.huajiao.com/rank/recv?&type=week','http://webh.huajiao.com/rank/recv?&type=all','http://webh.huajiao.com/rank/sun?&type=day','http://webh.huajiao.com/rank/sun?&type=week','http://webh.huajiao.com/rank/sun?&type=all','http://webh.huajiao.com/rank/user?&type=day','http://webh.huajiao.com/rank/user?&type=all','http://webh.huajiao.com/rank/send?&type=day','http://webh.huajiao.com/rank/send?&type=week','http://webh.huajiao.com/rank/send?&type=all'] 18 | for url in urls: 19 | get_page=requests.get(url) 20 | get_page.encoding='utf-8' 21 | page = get_page.text 22 | print(url) 23 | page_dic = eval(page)['data']['rank'] 24 | one_rank=[] 25 | for one_video in page_dic: 26 | category=url[29:33] 27 | timespan=url[-3:] 28 | name=one_video['nickname'] 29 | authorlevel=one_video['authorlevel'] 30 | level=one_video['level'] 31 | try: 32 | score=one_video['score'] 33 | except KeyError: 34 | score=None 35 | 
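# 'category' and 'timespan' are sliced straight out of the request URL above; 'acttime'
# below records the fetch time as a millisecond timestamp.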
uid=one_video['uid'] 36 | currenttime=datetime.datetime.timestamp(datetime.datetime.now())*1e3 37 | D0={'name':name,'level':level,'authorlevel':authorlevel,'score':score,'uid':uid,'category':category,'timespan':timespan,'acttime':currenttime} 38 | one_rank.append(D0) 39 | rank.append(one_rank) 40 | return rank 41 | 42 | def get_followers(self): 43 | rank=[] 44 | urls=['http://webh.huajiao.com/rank/followers?&type=day','http://webh.huajiao.com/rank/followers?&type=all'] 45 | for url in urls: 46 | get_page=requests.get(url) 47 | get_page.encoding='utf-8' 48 | page = get_page.text 49 | print(url) 50 | midstep = eval(page) 51 | page_dic = midstep['data']['rank'] 52 | one_rank=[] 53 | for one_video in page_dic: 54 | category='followers' 55 | timespan=url[-3:] 56 | name=one_video['nickname'] 57 | authorlevel=one_video['authorlevel'] 58 | level=one_video['level'] 59 | followers=one_video['followers'] 60 | uid=one_video['uid'] 61 | currenttime=datetime.datetime.timestamp(datetime.datetime.now())*1e3 62 | D0={'name':name,'level':level,'authorlevel':authorlevel,'followers':followers,'uid':uid,'category':category,'timespan':timespan,'acttime':currenttime} 63 | one_rank.append(D0) 64 | rank.append(one_rank) 65 | return rank 66 | 67 | 68 | if __name__=='__main__': 69 | ttt =crawler_pepper() 70 | followers=ttt.get_followers() 71 | others=ttt.except_followers() -------------------------------------------------------------------------------- /crawler_sys/site_crawler/crawler_baidu.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Aug 21 16:58:37 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | import time 9 | import requests 10 | from bs4 import BeautifulSoup 11 | from crawler.crawler_sys.utils.trans_strtime_to_timestamp import trans_strtime_to_timestamp 12 | 13 | 14 | headers = {'Host': 'www.baidu.com', 15 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0', 16 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 17 | 'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2', 18 | 'Accept-Encoding': 'gzip, deflate, br', 19 | 'Cookie': 'BAIDUID=5EBFCC8E193341115A4A3C71960B63E7:FG=1; BIDUPSID=BD339F6B0442001D2528C4BFBCE098DB; PSTM=1500974423; BDUSS=RCY0lFRmJ4MDlMMU5xfkp4NWU3bUlTckJOZU03ZTB4UHdJbUpUeWlVZmhlT3haSVFBQUFBJCQAAAAAAAAAAAEAAABM1D8MZmFuZzExMDExNAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAOHrxFnh68RZc3; MCITY=-131%3A; BD_UPN=1352; H_PS_PSSID=; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; delPer=1; BD_CK_SAM=1; PSINO=2; BDRCVFR[gltLrB7qNCt]=mk3SLVN4HKm; pgv_pvi=525595648; pgv_si=s2288931840; Hm_lvt_9f14aaa038bbba8b12ec2a4a3e51d254=1534841172; Hm_lpvt_9f14aaa038bbba8b12ec2a4a3e51d254=1534841172; BD_HOME=1; sug=3; sugstore=0; ORIGIN=0; bdime=21110; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; BDSVRTM=225; BDRCVFR[C0p6oIjvx-c]=I67x6TjHwwYf0; BAIDUPH=tn=§rn=§ct=0', 20 | 'Connection': 'keep-alive', 21 | 'Upgrade-Insecure-Requests': '1', 22 | 'Cache-Control': 'max-age=0'} 23 | 24 | 25 | def baidu_info_page(keyword, max_page_num): 26 | result_lst = [] 27 | for page_num in range(0, max_page_num): 28 | search_url = ('https://www.baidu.com/s?tn=news&rtt=4&bsst=1&cl=2&wd='+keyword+ 29 | '&x_bfe_rqs=03E80&tngroupname=organic_news&pn='+str(page_num*10)) 30 | get_page = requests.get(search_url, headers=headers) 31 | get_page.encoding = 'utf-8' 32 | page = get_page.text 33 | soup = BeautifulSoup(page, 'html.parser') 34 
        info_lst = soup.find_all('div', {'class': 'result'})
        print_page_num = page_num+1
        for line in info_lst:
            title = line.h3.a.text
            title = title.replace('\n', '')
            url = line.h3.a['href']
            source_and_release_time = line.find('p', {'class': 'c-author'}).text
            source_and_release_time_lst = source_and_release_time.split('\xa0')
            source = source_and_release_time_lst[0]
            release_time_str = source_and_release_time_lst[-1]
            release_time = trans_strtime_to_timestamp(release_time_str)
            midstep_content = line.find('div', {'class': 'c-summary'}).text
            content = midstep_content.replace(source, '').replace(' ', '')
            content = content.replace('\xa0', '')
            source = source.replace('\n', '').replace('\t', '')
            content = content.replace('\n', '').replace('\t', '')
            whole_page = line.find('a', {'class': 'c-cache'})['href']
            fast_open_whole_page = whole_page + '&fast=y'
            get_whole_page = requests.get(fast_open_whole_page, headers=headers)
            get_whole_page.encoding = 'gb18030'
            whole_page_html = get_whole_page.text
            fetch_time = int(time.time()*1000)
            info_dic = {'title': title,
                        'url': url,
                        'source': source,
                        'content': content,
                        'release_time': release_time,
                        'keyword': keyword,
                        'whole_page_html': whole_page_html,
                        'fetch_time': fetch_time}
            result_lst.append(info_dic)
        print('get data at page %s' % print_page_num)
    return result_lst

if __name__ == '__main__':
    keyword = '中超'
    test_data = baidu_info_page(keyword, max_page_num=10)
--------------------------------------------------------------------------------
/crawler_sys/site_crawler/crawler_bing.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 22 09:30:20 2018

@author: fangyucheng
"""


import time
import requests
from bs4 import BeautifulSoup
from crawler_sys.utils.trans_strtime_to_timestamp import trans_strtime_to_timestamp


def bing_page(keyword, max_page_num):
    result_lst = []
    for page_num in range(0, max_page_num):
        # the result offset must advance with page_num; using max_page_num here
        # would request the same offset on every iteration
        search_url = ('https://cn.bing.com/search?q=' + keyword + '&pc=MOZI&first='
                      + str(page_num*10) + '&FORM=PERE1')
        get_page = requests.get(search_url)
        get_page.encoding = 'utf-8'
        page = get_page.text
        soup = BeautifulSoup(page, 'html.parser')
        info_lst = soup.find_all('li', {'class': 'b_algo'})
        print_page_num = page_num+1
        for line in info_lst:
            title = line.h2.a.text
            title = title.replace('\n', '')
            url = line.h2.a['href']
            release_time_and_content = line.find('p').text
            release_time_and_content_lst = release_time_and_content.split('\u2002·\u2002')
            content = release_time_and_content_lst[-1]
            release_time_str = release_time_and_content_lst[0]
            release_time = trans_strtime_to_timestamp(release_time_str)
            get_whole_page_str = line.find('div', {'class': 'b_attribution'})['u']
            get_whole_page_lst = get_whole_page_str.split('|')
            d_number = get_whole_page_lst[2]
            w_number = get_whole_page_lst[3]
            get_whole_page_url = ('http://cncc.bingj.com/cache.aspx?q=' + keyword +
                                  '&d=' + d_number + '&mkt=zh-CN&setlang=zh-CN&w='
                                  + w_number)
            get_whole_page = requests.get(get_whole_page_url)
            whole_page_html = get_whole_page.text
            fetch_time = int(time.time()*1000)
            info_dic = {'title': title,
                        'url': url,
                        'content': content,
'release_time': release_time, 49 | 'keyword': keyword, 50 | 'whole_page_html': whole_page_html, 51 | 'fetch_time': fetch_time} 52 | result_lst.append(info_dic) 53 | print('get data at page %s' % print_page_num) 54 | return result_lst 55 | 56 | 57 | if __name__ == '__main__': 58 | keyword = '中超' 59 | test_data = bing_page(keyword, max_page_num=10) -------------------------------------------------------------------------------- /crawler_sys/site_crawler/crawler_miaopai.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Sep 28 10:28:45 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | 9 | import urllib 10 | import requests 11 | import json 12 | from bs4 import BeautifulSoup 13 | from crawler_sys.utils.output_results import retry_get_url 14 | from crawler_sys.utils.trans_str_play_count_to_int import trans_play_count 15 | from crawler.crawler_sys.utils.util_logging import logged 16 | try: 17 | from .func_get_releaser_id import * 18 | except: 19 | from func_get_releaser_id import * 20 | 21 | class Crawler_miaopai(): 22 | 23 | def get_releaser_follower_num(self, releaserUrl): 24 | if "www.yixia.com" in releaserUrl: 25 | get_page = retry_get_url(releaserUrl) 26 | get_page.encoding = 'utf-8' 27 | page = get_page.text 28 | soup = BeautifulSoup(page, 'html.parser') 29 | try: 30 | midstep_1 = soup.find('ul', {'class': 'bottomInfor'}) 31 | midstep_2 = midstep_1.find_all('li') 32 | for line in midstep_2: 33 | line_text = line.text 34 | if '粉丝' in line_text: 35 | follower_str = line_text.replace('粉丝', '') 36 | follower_num = trans_play_count(follower_str) 37 | print('%s follower number is %s' % (releaserUrl, follower_num)) 38 | return follower_num 39 | except: 40 | print("can't can followers") 41 | elif "n.miaopai.com" in releaserUrl: 42 | try: 43 | split_url = releaserUrl.split("personal/") 44 | suid = split_url[-1].replace('.htm', '').replace('.html', '').replace('htm', '') 45 | url = "https://n.miaopai.com/api/aj_user/space.json?suid=%s" % suid 46 | get_page = urllib.request.urlopen(url) 47 | page_bytes = get_page.read() 48 | page_str = page_bytes.decode("utf-8") 49 | page_dic = json.loads(page_str) 50 | follower_num = page_dic['data']['followers_count'] 51 | return follower_num 52 | except: 53 | print("can't can followers") 54 | 55 | 56 | def get_releaser_id(self, releaserUrl): 57 | return get_releaser_id(platform=self.platform, releaserUrl=releaserUrl) 58 | 59 | 60 | #encoding method 61 | @logged 62 | def releaser_page(self, releaserUrl, releaser_page_num_max=30): 63 | headers = {'Host': 'n.miaopai.com', 64 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0', 65 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 66 | 'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2', 67 | 'Accept-Encoding': 'gzip, deflate, br', 68 | 'Connection': 'keep-alive', 69 | 'Cookie': 'aliyungf_tc=AQAAAIVvfVl0CgQAysVBfBViNUJYGG5C; Hm_lvt_e8fa5926bca558076246d7fb7ca12071=1545124849; Hm_lpvt_e8fa5926bca558076246d7fb7ca12071=1545124849', 70 | 'Upgrade-Insecure-Requests': '1', 71 | 'Cache-Control': 'max-age=0'} 72 | releaser_id = self.get_releaser_id(releaserUrl) 73 | page_num = 1 74 | while page_num <= releaser_page_num_max: 75 | url = ('https://n.miaopai.com/api/aj_user/medias.json?suid=%s&page=%s' 76 | % (releaser_id, page_num)) 77 | get_page = requests.get(url, headers=headers) 78 | get_page.encoding = 'utf-8' 79 | page = get_page.text 
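# releaser_page above builds the paged medias.json URL but, as shown here, never advances
# page_num or yields anything, so the while loop cannot make progress. A hedged sketch of how
# the pagination presumably continues -- the 'data'/'medias' field names are assumptions about
# the JSON layout, not confirmed by this file:
import requests

def miaopai_media_pages_sketch(suid, headers=None, max_pages=30):
    page_num = 1
    while page_num <= max_pages:
        url = ('https://n.miaopai.com/api/aj_user/medias.json?suid=%s&page=%s'
               % (suid, page_num))
        page_dic = requests.get(url, headers=headers).json()
        medias = page_dic.get('data', {}).get('medias', [])   # assumed field names
        if not medias:
            break                                              # stop on an empty page
        for media in medias:
            yield media
        page_num += 1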
80 | page_dic = get_page.json() 81 | 82 | if __name__ == "__main__": 83 | releaserUrl = 'http://n.miaopai.com/personal/h~NjA~vSfoYLz1pchtm' 84 | test = Crawler_miaopai() 85 | p = test.get_releaser_follower_num(releaserUrl) 86 | -------------------------------------------------------------------------------- /crawler_sys/site_crawler/crawler_peoplevideo.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Mar 19 10:32:52 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | 9 | import requests 10 | from bs4 import BeautifulSoup 11 | import json 12 | 13 | 14 | #两会:lh/_cl/0/30/ 15 | #访谈:ft/_cl/6/30/ 16 | #资讯:zx/_cl/29/20/ 17 | #视点:sd/_cl/29/20/ 18 | #全球:qq/_cl/7/30/ 19 | 20 | class people_video(): 21 | 22 | def video_page(self,url): 23 | get_page=requests.get(url) 24 | get_page.encoding='utf-8' 25 | page = get_page.text 26 | midstep1=json.loads(page) 27 | midstep2=midstep1['data']['article'] 28 | midstep3=midstep2['publish'] 29 | title=midstep2['title'] 30 | author=midstep2['author'] 31 | release_time=midstep2['publishTime'] 32 | playcount=midstep2['playNum'] 33 | dura=midstep2['duration'] 34 | duration_str=dura 35 | dl=duration_str.split(':') 36 | dl_int=[] 37 | for v in dl: 38 | v=int(v) 39 | dl_int.append(v) 40 | if len(dl_int) == 2: 41 | duration=dl_int[0]*60+dl_int[1] 42 | else: 43 | duration=dl_int[0]*3660+dl_int[1]*60+dl_int[2] 44 | releaser=midstep3['name'] 45 | D0={'title':title,'playcount':playcount,'releaser':releaser,'release_time':release_time,'duration':duration,'author':author,'url':url} 46 | return D0 47 | 48 | def list_page(self,partofurl,totalpage): 49 | urls=['http://mobilevideo.people.com.cn/movie_pub/News/publishfile/'+partofurl+'list_{}.json'.format(str(i)) for i in range(1,totalpage)] 50 | list_page=[] 51 | for url in urls: 52 | get_page=requests.get(url) 53 | get_page.encoding='utf-8' 54 | page=get_page.text 55 | soup=BeautifulSoup(page,'html.parser') 56 | try: 57 | selection=soup.html.head.title.text 58 | print ('no more page') 59 | except AttributeError: 60 | print(url) 61 | midstep1=json.loads(page) 62 | midstep2=midstep1['data']['newsList'] 63 | for one_line in midstep2: 64 | url=one_line['articleLink'] 65 | one_video_dic=self.video_page(url) 66 | list_page.append(one_video_dic) 67 | return list_page 68 | 69 | 70 | 71 | if __name__=='__main__': 72 | people_crawler = people_video() 73 | #video_page=people_video.video_page(url='http://mobilevideo.people.com.cn/movie_pub/News/publishfile/spk/_cd/10/18/4154954.json') 74 | list_page2=people_crawler.list_page(partofurl="qq/_cl/7/30/",totalpage=20) 75 | #search_page=iqiyi_crawler.search_page(keyword="国家相册") 76 | -------------------------------------------------------------------------------- /crawler_sys/site_crawler/crawler_sogou.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Aug 22 09:30:20 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | 9 | import time 10 | import requests 11 | from bs4 import BeautifulSoup 12 | from crawler.crawler_sys.utils.trans_strtime_to_timestamp import trans_strtime_to_timestamp 13 | 14 | 15 | cookie = ('YYID=2FFBDAA6D4FBA37438F4067C8123E98B; IMEVER=8.5.0.1322;' 16 | 'SUID=3D03FF723865860A59795A5F000BB71F;' 17 | 'SUV=00C039A172FF033D5993ADBD770E7410; usid=lF0F7il0yWbXF5c9;' 18 | 'IPLOC=CN1100; sct=11; SMYUV=1512954490386200;' 19 | 'ad=19fxxkllll2zKxvnlllllVHr6$UllllltsDRlyllll9llllljgDll5@@@@@@@@@@;' 20 | 
'SNUID=D0DE5A671A1E68C31FB628911B8277A5; wuid=AAGPcSphIAAAAAqLE2OSTQgAGwY=;' 21 | 'UM_distinctid=16449b02797449-0c5d9293f4a833-143f7040-1fa400-16449b02799881;' 22 | 'CXID=794EC592A14CE76F5DF3F3A3BDDDD787;' 23 | 'ld=Kyllllllll2bWX10QTIdJOHDsvSbWX1uK94Vhkllll9lllllVklll5@@@@@@@@@@;' 24 | 'cd=1534754086&17502a3f56c02f72dfd43a17cbb19663;' 25 | 'rd=Vyllllllll2bBEqoQLWCNCHfKv2bWX1uzX0atkllllwllllRVllll5@@@@@@@@@@;' 26 | 'LSTMV=173%2C72; LCLKINT=1570') 27 | 28 | 29 | headers = {'Host': 'news.sogou.com', 30 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0', 31 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 32 | 'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2', 33 | 'Accept-Encoding': 'gzip, deflate', 34 | 'Cookie': cookie, 35 | 'Connection': 'keep-alive', 36 | 'Upgrade-Insecure-Requests': '1', 37 | 'Cache-Control': 'max-age=0'} 38 | 39 | 40 | def sogou_info_page(keyword): 41 | result_lst = [] 42 | for page_num in range(1,11): 43 | search_url = 'http://news.sogou.com/news?&query='+keyword+'&page='+str(page_num) 44 | get_page = requests.get(search_url, headers=headers) 45 | page = get_page.text 46 | soup = BeautifulSoup(page, 'html.parser') 47 | news_lst = soup.find_all('div', {'class': 'vrwrap'}) 48 | for line in news_lst: 49 | try: 50 | title = line.div.h3.a.text 51 | url = line.div.h3.a['href'] 52 | source_and_release_time = line.find('p', {'class': 'news-from'}).text 53 | source_and_release_time_lst = source_and_release_time.split('\xa0') 54 | source = source_and_release_time_lst[0] 55 | release_time_str = source_and_release_time_lst[-1] 56 | release_time = trans_strtime_to_timestamp(release_time_str) 57 | try: 58 | content = line.find('span').text 59 | except: 60 | print('no content at %s' % title) 61 | content = 'missing' 62 | fetch_time = int(time.time()*1000) 63 | try: 64 | similar_news = line.find('a', {'id': 'news_similar'}).text 65 | except: 66 | print('no similar news at %s' % title) 67 | similar_news = 'missing' 68 | news_info = {'title': title, 69 | 'url': url, 70 | 'source': source, 71 | 'release_time': release_time, 72 | 'fetch_time': fetch_time, 73 | 'content': content, 74 | 'similar_news': similar_news, 75 | 'keyword': keyword} 76 | result_lst.append(news_info) 77 | print('get data at page %s' % page_num) 78 | except: 79 | ('the error occured at position %s' % news_lst.index(line)) 80 | return result_lst 81 | 82 | 83 | if __name__=='__main__': 84 | keyword = '中超' 85 | test_sogou = sogou_info_page(keyword) 86 | 87 | -------------------------------------------------------------------------------- /crawler_sys/site_crawler/crawler_watermelon.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon May 28 10:29:57 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | 9 | 10 | 11 | import requests 12 | import json 13 | import datetime 14 | import re 15 | from framework.video_fields_std import Std_fields_video 16 | #from . 
import bulk_write_into_es 17 | import js2py 18 | import hashlib 19 | import time 20 | from selenium import webdriver 21 | 22 | 23 | class Crawler_Watermelon(Std_fields_video): 24 | 25 | def write_into_file(self, data_dict, file_obj): 26 | json_str=json.dumps(data_dict) 27 | file_obj.write(json_str) 28 | file_obj.write('\n') 29 | file_obj.flush() 30 | 31 | 32 | def feed_url_into_redis(self, dict_Lst): 33 | pass 34 | 35 | 36 | def output_result(self, result_Lst, output_to_file=False, filepath=None): 37 | # write data into es crawler-raw index 38 | #bulk_write_into_es(result_Lst) 39 | 40 | # feed url into redis 41 | self.feed_url_into_redis(result_Lst) 42 | 43 | # output into file according to passed in parameters 44 | if output_to_file==True and filepath!=None: 45 | output_fn='crawler_watermelon_%s_json' % datetime.datetime.now().isoformat()[:10] 46 | output_f=open(filepath+'/'+output_fn, 'a', encoding='utf-8') 47 | self.write_into_file(result_Lst, output_f) 48 | else: 49 | pass 50 | 51 | 52 | def get_list_video(self,output_to_file=False, filepath=None): 53 | result_Lst = [] 54 | max_behot_time = 0 55 | count = 0 56 | 57 | headers = {'Host': 'ic.snssdk.com', 58 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0', 59 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 60 | 'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2', 61 | 'Accept-Encoding': 'gzip, deflate', 62 | 'Cookie': 'odin_tt=5b54e47f71b1963502fe03c4028f5672c887a0b739ce2302481beda2a4388a0a538ade820b54b4589da13d18dde9d245', 63 | 'Connection': 'keep-alive', 64 | 'Upgrade-Insecure-Requests': '1', 65 | 'Cache-Control': 'max-age=0'} 66 | 67 | 68 | while count <= 0: 69 | time_now = int(time.time()) 70 | listurl = 'http://ic.snssdk.com/video/app/stream/v51/?category=subv_xg_society&refer=1&count=20&max_behot_time='+str(max_behot_time)+'&list_entrance=main_tab&last_refresh_sub_entrance_interval='+str(time_now) 71 | #http://ic.snssdk.com/video/app/stream/v51/?category=subv_xg_society&refer=1&count=20&list_entrance=main_tab&last_refresh_sub_entrance_interval=1527473360&loc_mode=5&tt_from=refresh_auto&play_param=codec_type%3A0&iid=33815381012&device_id=52965120460&ac=wifi&channel=wandoujia&aid=32&app_name=video_article&version_code=653&version_name=6.5.3&device_platform=android&ab_version=359940%2C344692%2C353539%2C356329%2C361439%2C324397%2C361311%2C358091%2C358364%2C356602%2C350431%2C354439%2C325211%2C346575%2C342302%2C361530%2C320651%2C361551&ssmix=a&device_type=MuMu&device_brand=Android&language=zh&os_api=19&os_version=4.4.4&uuid=008796749793280&openudid=54767d8bf41ac9a4&manifest_version_code=253&resolution=1280*720&dpi=240&update_version_code=65307&_rticket=1527473360674&rom_version=cancro-eng+4.4.4+V417IR+eng.root.20180201.174500+release-keys&fp=i2T_FYmuPzL5Fl4ZcrU1FYFeL2FW 72 | 73 | get_page = requests.get(listurl,headers=headers) 74 | page = get_page.text 75 | page = page.replace('true','True') 76 | page = page.replace('false','False') 77 | page = page.replace('null','"Null"') 78 | page_dic = eval(page) 79 | video_agg = page_dic['data'] 80 | count += 1 81 | for line in video_agg: 82 | try: 83 | video_str=line['content'] 84 | video_dic=eval(video_str) 85 | if video_dic['has_video']==True: 86 | title = video_dic['title'] 87 | url = video_dic['display_url'] 88 | browser = webdriver.Chrome() 89 | browser.get(url) 90 | pc_midstep = browser.find_element_by_class_name('num').text 91 | play_count = ' '.join(re.findall('\d+',pc_midstep)) 92 | 
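# The feed response above is parsed by rewriting 'true'/'false'/'null' and calling eval(),
# a pattern that recurs across these crawlers and executes whatever string the server returns.
# A small sketch of the safer equivalent with json.loads, which already maps true/false/null
# to True/False/None (json is imported at the top of this module):
import json

def parse_feed_page(page_text):
    page_dic = json.loads(page_text)          # no string rewriting, no eval()
    video_agg = page_dic.get('data', [])
    # each entry's 'content' field is itself a JSON string in the feed handled above
    return [json.loads(line['content']) for line in video_agg if 'content' in line]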
93 | release_time = int(video_dic['publish_time']*1e3) 94 | play_count2 = video_dic['read_count'] 95 | releaser = video_dic['media_name'] 96 | max_behot_time = video_dic['behot_time'] 97 | video_id = video_dic['item_id'] 98 | releaser_id = video_dic['user_info']['user_id'] 99 | fetch_time = int(datetime.datetime.now().timestamp()*1e3) 100 | 101 | D0={'title':title,'url':url,'release_time':release_time,'releaser':releaser,'play_count':play_count, 102 | 'video_id':video_id,'releaser_id':releaser_id,'fetch_time':fetch_time,'play_count2':play_count2} 103 | 104 | result_Lst.append(D0) 105 | print ('get one video') 106 | except: 107 | pass 108 | browser.close() 109 | self.output_result(result_Lst,output_to_file=output_to_file,filepath=filepath) 110 | return result_Lst 111 | #result_Lst.clear() 112 | 113 | if __name__=='__main__': 114 | test=Crawler_Watermelon() 115 | output_to_file = True 116 | filepath = 'D:/CSM3.0/爬虫结果/watermelon' 117 | gogogo = test.get_list_video(output_to_file,filepath) -------------------------------------------------------------------------------- /crawler_sys/site_crawler/crawler_weibo/cookie_pool: -------------------------------------------------------------------------------- 1 | ['_T_WM=9ff1a76df3fb08baf991c83c0ecfdc93', 'SSOLoginState=1565062981', 'SUHB=0S7yXlcJlzydaX', 'SCF=AhdzjhXNYWCBvPKwVWrXOk-g-JBS9ojbzjFzZ-ppbfmgNLRAwcuir4WiJVxmdGuC4EPxXqTVzYsP2sVEbUHXojg.', 'SUB=_2A25wTIcVDeRhGeFM7lcY8ivEwzyIHXVTzildrDV6PUJbkdANLUjikW1NQN3cyJOFrdgYFfLOWkaG5qHFYk_I9rk1'] 2 | -------------------------------------------------------------------------------- /crawler_sys/site_crawler/crawler_weibo/ghostdriver.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_sys/site_crawler/crawler_weibo/ghostdriver.log -------------------------------------------------------------------------------- /crawler_sys/site_crawler/crawler_weibo/login.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # @Time : 2019/8/6 10:47 3 | # @Author : litao 4 | from selenium import webdriver 5 | from selenium.webdriver.common.by import By 6 | from selenium.webdriver.support.ui import WebDriverWait 7 | from selenium.webdriver.support import expected_conditions as EC 8 | import sys,os 9 | 10 | class WeiboLogin(): 11 | def __init__(self, username, password): 12 | # os.system('pkill -f phantom') 13 | chrome_options = webdriver.ChromeOptions() 14 | # chrome_options.add_argument('--headless') 15 | # chrome_options.add_argument('--disable-gpu') 16 | # driver = webdriver.Remote(command_executor='http://192.168.18.11:4444/wd/hub', 17 | # desired_capabilities=DesiredCapabilities.CHROME) 18 | self.url = 'https://passport.weibo.cn/signin/login?entry=mweibo&r=https://weibo.cn/' 19 | self.browser = webdriver.Chrome(r'../chromedriver.exe', options=chrome_options) 20 | #self.browser.set_window_size(1050, 840) 21 | self.wait = WebDriverWait(self.browser, 20) 22 | self.username = username 23 | self.password = password 24 | 25 | def open(self): 26 | """ 27 | 打开网页输入用户名密码并点击 28 | :return: None 29 | """ 30 | self.browser.get(self.url) 31 | username = self.wait.until(EC.presence_of_element_located((By.ID, 'loginName'))) 32 | password = self.wait.until(EC.presence_of_element_located((By.ID, 'loginPassword'))) 33 | submit = self.wait.until(EC.element_to_be_clickable((By.ID, 'loginAction'))) 34 | username.send_keys(self.username) 35 | 
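# WeiboLogin.run() further down returns a '; '-joined cookie string and __main__ appends it
# to ./cookie_pool (the file shown above). A minimal sketch of how a crawler might pull one
# of those saved cookies into its request headers -- the path and the random-rotation policy
# are assumptions, not part of this module:
import random

def pick_weibo_cookie(pool_path="./cookie_pool"):
    with open(pool_path, "r", encoding="utf-8") as f:
        cookies = [line.strip() for line in f if line.strip()]
    return random.choice(cookies) if cookies else ""

# example: headers = {"Cookie": pick_weibo_cookie(), "User-Agent": "Mozilla/5.0 ..."}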
password.send_keys(self.password) 36 | submit.click() 37 | 38 | def run(self): 39 | """ 40 | 破解入口 41 | :return: 42 | """ 43 | self.open() 44 | WebDriverWait(self.browser, 30).until( 45 | EC.title_is('我的首页') 46 | ) 47 | cookies = self.browser.get_cookies() 48 | cookie = [item["name"] + "=" + item["value"] for item in cookies] 49 | cookie_str = '; '.join(item for item in cookie) 50 | self.browser.quit() 51 | return str(cookie_str) 52 | 53 | if __name__ == '__main__': 54 | user_name = '13910233534' 55 | password = 'Lemo1995' 56 | cookie_str = WeiboLogin(user_name, password).run() 57 | print(cookie_str,type(cookie_str)) 58 | with open("./cookie_pool","a",encoding="utf-8") as f: 59 | f.write(cookie_str+"\n") 60 | -------------------------------------------------------------------------------- /crawler_sys/site_crawler/crawler_xinhua.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Dec 6 21:50:08 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | 9 | import requests 10 | 11 | url = 'https://xhpfmapi.zhongguowangshi.com/v500/core/indexlist' 12 | 13 | headers = {"Accept-Encoding": "*", 14 | "Content-Type": "application/json; charset=UTF-8", 15 | "Content-Length": "870", 16 | "Host": "xhpfmapi.zhongguowangshi.com", 17 | "Connection": "Keep-Alive", 18 | "User-Agent": "okhttp/3.10.0"} 19 | 20 | 21 | post_dict = ('{"param":"vPCnptKLNQB9Bldt193BqMWKnVNcVmaQ4pcJ6k8iV5wmESJSidV85U3rbdOK' 22 | 'e+jmLQUDdj8Ca9QuYHizMT6l7Vq3CKOBgtNieEbRQihk10GoWKkS+L9BWI9M ' 23 | 'd7Iq3IT/HL4saSsu6fkEe3PYjuA6EXKGPF6EThfdaQ3CyGPP+KhVFGp3C1Fk ' 24 | '9U50pbC0HflUGTK1iH7U7A9ZKNjhNd07/U49uddfqZ8OFbZjBw4mwRij0tGP ' 25 | 'F5jXpIBKoNsSnpWNTQnL86VCLd7+9jmQ+PzCkdmEiYNqRPYzXv+ihAhvBNY9 ' 26 | 'yg/18dVE1+zwHR685iDwabVLWppSxacQJr7iz1uuc6O0hoIWiUBkhDcUsD8S ' 27 | 'O3/DXZ8PgLMEvagO4TfS4AqMy/n5rW5UwzYr+x8jKDpr0kNjrUUH2vvXlQr6 ' 28 | '+TfqwS5qHm6+nGCLoyv7HyMoOmvSraDmg3OD66rGTol/Ri9NYlSrNpyYemGG ' 29 | 'fAuUuZ1pIzNeTzpF02TvzF2OQ8T2iROkmUK8iSXAuViE4I+KNKq959APFJ6Y ' 30 | 'Fs+i7nfZ0d2/5jonslonNYfHqAtotAuciwEW+fqy9aZpHMRQaU5XjUeVV0Sk ' 31 | 'dKiY2SveT59VTiOBgi8fy+q096BYfC+vPBp780AE0A8UnpgWz6LdyyP4GR0J ' 32 | 'JTVN6F4TiPnD5mA7Lr4fpUPlVtNt8BfGKS/AkIGD+BaLVxArmlbC/6A9/caI ' 33 | 'ldYWwg4yIu+CLjkDtif2NUwsqgfbgfbj/pbhPZGVzC+KjCP382OmQHKM4HoO ' 34 | 'tnJueIoPIb14EwBSz98qmJ6tMBJa2BxsSVbKV076QBE7qNiJF6ZFBimSwob8 ' 35 | 'upM="}') 36 | 37 | get_page = requests.post(url, data=post_dict, headers=headers, verify=False) 38 | 39 | page = get_page.json() -------------------------------------------------------------------------------- /crawler_sys/site_crawler/toutiao_get_signature.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # @Time : 2019/4/10 12:05 3 | # @Author : litao 4 | import time 5 | import hashlib 6 | # import execjs 7 | 8 | # def get_js(id): 9 | # # f = open(r"D:\work_file\test\index.js", 'r', encoding='UTF-8') 10 | # f = open(r"D:\work_file\work_file\crawler\crawler_sys\site_crawler\toutiao_test_js.js", 'r', encoding='UTF-8') 11 | # line = f.readline() 12 | # htmlstr = '' 13 | # while line: 14 | # htmlstr = htmlstr + line 15 | # line = f.readline() 16 | # ctx = execjs.compile(str(htmlstr)) 17 | # return ctx.call('get_as_cp_signature',id) 18 | 19 | def getHoney(): # 根据JS脚本破解as ,cp 20 | t = int(time.time()) # 获取当前时间 21 | # t=1534389637 22 | # print(t) 23 | e = str('%X' % t) # 格式化时间 24 | # print(e) 25 | m1 = hashlib.md5() # MD5加密 26 | m1.update(str(t).encode(encoding='utf-8')) # 转化格式 27 | i = 
str(m1.hexdigest()).upper() # 转化大写 28 | # print(i) 29 | n = i[0:5] # 获取前5位字符 30 | a = i[-5:] # 获取后5位字符 31 | s = '' 32 | r = '' 33 | for x in range(0, 5): # 交叉组合字符 34 | s += n[x] + e[x] 35 | r += e[x + 3] + a[x] 36 | eas = 'A1' + s + e[-3:] 37 | ecp = e[0:3] + r + 'E1' 38 | # print(eas) 39 | # print(ecp) 40 | return eas, ecp 41 | 42 | 43 | -------------------------------------------------------------------------------- /crawler_sys/site_crawler_by_redis/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # @Time : 2019/6/20 13:52 3 | # @Author : litao 4 | -------------------------------------------------------------------------------- /crawler_sys/site_crawler_by_redis/chromedriver: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_sys/site_crawler_by_redis/chromedriver -------------------------------------------------------------------------------- /crawler_sys/site_crawler_by_redis/toutiao_article.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # @Time : 2020/5/15 15:11 3 | # @Author : litao 4 | from crawler.crawler_sys.site_crawler_by_redis.crawler_toutiao import Crawler_toutiao 5 | 6 | 7 | class Crawler_toutiao_article(Crawler_toutiao): 8 | def __init__(self): 9 | super().__init__() 10 | 11 | def releaser_page_by_time(self, start_time=None, end_time=None, url=None, allow=None, **kwargs): 12 | count_false = 0 13 | for res in self.article_page(url, proxies_num=kwargs.get("proxies_num")): 14 | video_time = res["release_time"] 15 | # print(res) 16 | if video_time: 17 | if start_time < video_time: 18 | if video_time < end_time: 19 | yield res 20 | else: 21 | count_false += 1 22 | if count_false > allow: 23 | break 24 | else: 25 | yield res 26 | count_false = 0 27 | for res in self.microheadlines_page(url, proxies_num=kwargs.get("proxies_num")): 28 | video_time = res["release_time"] 29 | print(video_time) 30 | if video_time: 31 | if start_time < video_time: 32 | if video_time < end_time: 33 | yield res 34 | else: 35 | count_false += 1 36 | if count_false > allow: 37 | break 38 | else: 39 | yield res 40 | 41 | if __name__ == "__main__": 42 | test = Crawler_toutiao_article() 43 | for a in test.article_page("https://www.toutiao.com/c/user/5821222208/#mid=5821222208"): 44 | print(a) -------------------------------------------------------------------------------- /crawler_sys/site_crawler_test/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # @Time : 2019/6/20 13:52 3 | # @Author : litao 4 | -------------------------------------------------------------------------------- /crawler_sys/special_propose/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue May 15 13:59:43 2018 4 | 5 | @author: hanye 6 | """ 7 | 8 | -------------------------------------------------------------------------------- /crawler_sys/special_propose/crawler.v_qq_comment.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Mar 27 15:09:29 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | import urllib.request 9 | 10 | def get_comment(last): 11 | comment_lst=[] 12 | while (type(last)!=bool): 13 | url = 
"http://coral.qq.com/article/"+str(targetid)+"/comment/v2?callback=_article"+str(targetid)+"commentv2&oriorder=o&pageflag=1&cursor="+str(last) 14 | headers = ('User-Agent','Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11') 15 | opener = urllib.request.build_opener() 16 | opener.addheaders = [headers] 17 | data = opener.open(url).read() 18 | page=str(data,encoding='utf-8') 19 | useful_infor=page.split('commentv2')[1] 20 | str_to_dic=useful_infor[1:-1] 21 | str_to_dic=str_to_dic.replace('true','True') 22 | str_to_dic=str_to_dic.replace('false','False') 23 | dic_all_infor=eval(str_to_dic) 24 | ttt=dic_all_infor['data'] 25 | last=ttt['last'] 26 | print(last) 27 | repcomment=ttt['repCommList'] 28 | if type(repcomment)==list: 29 | repcomment_lst=repcomment 30 | print('repcomment') 31 | elif type(repcomment)==dict: 32 | list(repcomment.values()) 33 | print('repcomment') 34 | else: 35 | repcomment=None 36 | if repcomment!=None: 37 | for yyy in repcomment_lst: 38 | if type(yyy)==list: 39 | for uu in yyy: 40 | content=uu['content'] 41 | parent=uu['parent'] 42 | publishdate=uu['time'] 43 | userid=uu['userid'] 44 | upcount=uu['up'] 45 | contentid=uu['id'] 46 | dadorson=2 47 | D2={'content':content,'userid':userid,'upcount':upcount,'publishdate':publishdate,'parent':parent,'contentid':contentid,'dadorson':dadorson} 48 | comment_lst.append(D2) 49 | else: 50 | content=uu['content'] 51 | parent=uu['parent'] 52 | publishdate=uu['time'] 53 | userid=uu['userid'] 54 | upcount=uu['up'] 55 | contentid=uu['id'] 56 | dadorson=2 57 | D2={'content':content,'userid':userid,'upcount':upcount,'publishdate':publishdate,'parent':parent,'contentid':contentid,'dadorson':dadorson} 58 | comment_lst.append(D2) 59 | else: 60 | print('no repcomment') 61 | comment=ttt['oriCommList'] 62 | for zzz in comment: 63 | content=zzz['content'] 64 | contentid=zzz['id'] 65 | upcount=zzz['up'] 66 | publishdate=zzz['time'] 67 | userid=zzz['userid'] 68 | parent=contentid 69 | dadorson=1 70 | D0={'content':content,'userid':userid,'upcount':upcount,'publishdate':publishdate,'contentid':contentid,'parent':parent,'dadorson':dadorson} 71 | comment_lst.append(D0) 72 | print('get all comment') 73 | return comment_lst 74 | 75 | if __name__=='__main__': 76 | last=0 77 | targetid=2426229062 78 | comment=get_comment(last) -------------------------------------------------------------------------------- /crawler_sys/special_propose/crawler.v_qq_danmu.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Mar 26 13:50:00 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | import requests 9 | 10 | 11 | def danmu(x,jyid,targetid): 12 | danmu_lst=[] 13 | pagenum_lst=[] 14 | for i in range(0,x): 15 | ttt=15*(1+i) 16 | pagenum_lst.append(ttt) 17 | for pagenum in pagenum_lst: 18 | try: 19 | url='https://mfm.video.qq.com/danmu?otype=json&callback=jQuery'+jyid+'×tamp='+str(pagenum)+'&target_id='+targetid+'&count=500&second_count=6&session_key=0%2C0%2C0' 20 | get_page=requests.get(url) 21 | get_page.encoding='utf-8' 22 | page=get_page.text 23 | length=len(jyid)+7 24 | prepage=page[length:-1] 25 | prepage=prepage.replace('\r','') 26 | prepage=prepage.replace('/n','') 27 | dicdicdic=eval(prepage) 28 | danmu_count=dicdicdic['count'] 29 | if danmu_count>1000: 30 | print(danmu_count) 31 | print(pagenum) 32 | print('get one page') 33 | get_danmu_lst=dicdicdic['comments'] 34 | for danmu in get_danmu_lst: 35 | 
commentid=danmu['commentid'] 36 | content=danmu['content'] 37 | timepoint=danmu['timepoint'] 38 | upcount=danmu['upcount'] 39 | opername=danmu['opername'] 40 | D0={'commentid':commentid,'content':content,'timepoint':timepoint,'upcount':upcount,'opername':opername} 41 | danmu_lst.append(D0) 42 | except SyntaxError: 43 | print(str(pagenum)+'there is sth wrong') 44 | return danmu_lst 45 | 46 | if __name__=='__main__': 47 | x=int((1*3600+31*60+28)/15) 48 | jyid='19103025125001255282_1522399545358' 49 | targetid='2434347230' 50 | video_data777=danmu(x,jyid,targetid) 51 | #one 52 | #1 33 13 53 | #https://mfm.video.qq.com/danmu?otype=json&callback=jQuery19104770781474841782_1522044823510×tamp=15&target_id=2431410170&count=80&second_count=6&session_key=0%2C0%2C0&_=1522044823541 54 | 55 | #two 56 | #1 31 28 57 | #https://mfm.video.qq.com/danmu?otype=json&callback=jQuery1910006379066561103097_1522048580301×tamp=15&target_id=2432862868&count=80&second_count=6&session_key=0%2C0%2C0&_=1522048580319 58 | 59 | #three 60 | #1 31 28 61 | #https://mfm.video.qq.com/danmu?otype=json&callback=jQuery19104977942731832877_1522048936005×tamp=45&target_id=2434347230&count=80&second_count=6&session_key=178328%2C326%2C1522048940&_=1522048936017 62 | 63 | #four 64 | #1 31 12 65 | #https://mfm.video.qq.com/danmu?otype=json&callback=jQuery191007914957214696439_1522048988303×tamp=15&target_id=2464055709&count=80&second_count=6&session_key=0%2C0%2C0&_=1522048988318 66 | 67 | #five 68 | #1 34 47 69 | #https://mfm.video.qq.com/danmu?otype=json&callback=jQuery19109570751887462264_1522049153524×tamp=45&target_id=2479936974&count=80&second_count=6&session_key=95212%2C150%2C1522049156&_=1522049153539 70 | 71 | #six 72 | #1 31 27 73 | #https://mfm.video.qq.com/danmu?otype=json&callback=jQuery1910025632186610303198_1522050281547×tamp=135&target_id=2497027899&count=80&second_count=6&session_key=135654%2C180%2C1522050285&_=1522050281568 74 | 75 | #seven 76 | #1 31 22 77 | #https://mfm.video.qq.com/danmu?otype=json&callback=jQuery191022889623011170301_1522050238126×tamp=15&target_id=2515637880&count=80&second_count=6&session_key=0%2C0%2C0&_=1522050238141 78 | 79 | 80 | #https://mfm.video.qq.com/danmu?otype=json&callback=jQuery19105157512767429676_1522391911686×tamp=15&target_id=2515637880&count=80&second_count=6&session_key=0%2C0%2C0&_=1522391911700 81 | 82 | #https://mfm.video.qq.com/danmu?otype=json&callback=jQuery19103025125001255282_1522399545358×tamp=105&target_id=2434347230&count=80&second_count=6&session_key=186386%2C332%2C1522399582&_=1522399545371 83 | -------------------------------------------------------------------------------- /crawler_sys/special_propose/eastnews.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Mar 13 15:21:47 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | import requests 9 | from bs4 import BeautifulSoup 10 | import re 11 | import datetime 12 | import pickle 13 | import pandas as pd 14 | 15 | class Crawler_v_qq_eastnews: 16 | 17 | def video_page(self, url): 18 | get_page=requests.get(url) 19 | get_page.encoding='utf-8' 20 | page = get_page.text 21 | soup = BeautifulSoup(page,'html.parser') 22 | try: 23 | title=soup.find('h1',{'class':'video_title _video_title'}).text 24 | title=title.replace('\n','') 25 | title=title.replace('\t','') 26 | except AttributeError: 27 | title=None 28 | try: 29 | releaser=soup.find('span',{'class':'user_name'}).text 30 | except: 31 | releaser=None 32 | try: 33 | 
releaserUrl=soup.find('a',{'class':'user_info'})['href'] 34 | except TypeError: 35 | releaserUrl=None 36 | try: 37 | video_intro=soup.find('meta',{'itemprop':'description'})['content'] 38 | except TypeError: 39 | video_intro=None 40 | try: 41 | midstep = soup.find("script",{"r-notemplate":"true"}).text 42 | try: 43 | duration = re.findall(r'"duration":[0-9]{1,10}', ','.join(re.findall(r'VIDEO_INFO.*"duration":[0-9]{1,10}', midstep)))[0].split(':')[1] 44 | except IndexError: 45 | duration = re.findall(r'"duration":"[0-9]{1,10}"', ','.join(re.findall(r'VIDEO_INFO.*"duration":"[0-9]{1,10}"', midstep)))[0].split(':')[1] 46 | duration=duration.replace('"','') 47 | duration=int(duration) 48 | except: 49 | print('Catched exception, didn\'t find duartion in var VIDEO_INFO') 50 | duration=0 51 | try: 52 | playcount = re.findall(r'"view_all_count":[0-9]{1,10}', ','.join(re.findall(r'VIDEO_INFO.*"view_all_count":[0-9]{1,10}', midstep)))[0].split(':')[1] 53 | except: 54 | print('Catched exception, didn\'t find view_all_count in var VIDEO_INFO') 55 | playcount=0 56 | retime=re.findall(r'"video_checkup_time":"\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d"', ','.join(re.findall(r'VIDEO_INFO.*"video_checkup_time":"\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d"', midstep)))[0].split('":"')[1].split(' ')[0] 57 | try: 58 | release_time=int(datetime.datetime.strptime(retime,'%Y-%m-%d').timestamp()*1e3) 59 | except ValueError: 60 | release_time=0 61 | except: 62 | duration=None 63 | playcount=None 64 | release_time=None 65 | D0={'title':title,'playcount':playcount,'releaser':releaser,'video_intro':video_intro,'release_time':release_time,'duration':duration,'releaserUrl':releaserUrl} 66 | return D0 67 | 68 | def search_page(self,totalpage): 69 | video_Lst=[] 70 | url_Lst=[] 71 | page_Lst=['https://v.qq.com/x/search/?ses=qid%3D_5hveCy5oWKS_b5d4GuLquXTO29F8LJnLcmNDpNkXFkeEr8UDB0g9g%26last_query%3D%E4%B8%9C%E6%96%B9%E6%96%B0%E9%97%BB%26tabid_list%3D0%7C11%7C8%7C7%26tabname_list%3D%E5%85%A8%E9%83%A8%7C%E6%96%B0%E9%97%BB%7C%E5%8E%9F%E5%88%9B%7C%E5%85%B6%E4%BB%96&q=%E4%B8%9C%E6%96%B9%E6%96%B0%E9%97%BB&stag=3&cur={}&cxt=tabid%3D0%26sort%3D1%26pubfilter%3D0%26duration%3D3'.format(str(i)) for i in range(1,totalpage)] 72 | for page_url in page_Lst: 73 | get_page=requests.get(page_url) 74 | print (page_url) 75 | get_page.encoding='utf-8' 76 | page = get_page.text 77 | soup = BeautifulSoup(page,'html.parser') 78 | tencent = soup.find_all("div", { "class" : "result_item result_item_h _quickopen" }) 79 | for data_line in tencent: 80 | try: 81 | ttt=data_line.find('span',{'title':'东方新闻'}).text 82 | except AttributeError: 83 | ttt=None 84 | if ttt==None: 85 | urls=None 86 | else: 87 | urls=data_line.h2.a['href'] 88 | get_page=requests.get(urls) 89 | print (urls) 90 | get_page.encoding='utf-8' 91 | page = get_page.text 92 | soup = BeautifulSoup(page,'html.parser') 93 | fff=soup.find_all('a',{'class':'figure_detail'}) 94 | for zzz in fff: 95 | urls1=zzz['href'] 96 | urls2='https://v.qq.com'+urls1 97 | url_Lst.append(urls2) 98 | for url in url_Lst: 99 | dicdic = self.video_page(url) 100 | dicdic['url']=url 101 | print(url) 102 | video_Lst.append(dicdic) 103 | return video_Lst 104 | 105 | 106 | if __name__=='__main__': 107 | v_qq_crawler = Crawler_v_qq_eastnews() 108 | search_page2=v_qq_crawler.search_page(totalpage=20) -------------------------------------------------------------------------------- /crawler_sys/special_propose/get_weibo_user_info.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 
-*- 2 | """ 3 | Created on Thu May 31 09:20:25 2018 4 | 5 | 龟速爬取 需要提速 6 | 7 | @author: fangyucheng 8 | """ 9 | 10 | 11 | from selenium import webdriver 12 | import re 13 | import json 14 | import time 15 | 16 | 17 | 18 | class Crawler_Get_Weibo_User_Info(): 19 | 20 | 21 | def from_file_to_list(self,filename): 22 | openfile = open(filename) 23 | task = [] 24 | for line in openfile: 25 | line_dic = line.replace('\n','') 26 | task.append(line_dic) 27 | return task 28 | 29 | 30 | 31 | def get_user_info(self,filename,resultname): 32 | result = open(resultname,'a') 33 | result_lst = [] 34 | task_lst = self.from_file_to_list(filename) 35 | browser = webdriver.Chrome() 36 | for url in task_lst: 37 | try: 38 | browser.get(url) 39 | browser.maximize_window() 40 | time.sleep(8) 41 | user_name = browser.find_element_by_class_name('username').text 42 | print('get user_name') 43 | user_info = browser.find_element_by_class_name('info').text 44 | print('get user_info') 45 | try: 46 | industry_detail_intro = browser.find_element_by_class_name('ul_detail').text 47 | pattern1 = '行业类别' 48 | pattern2 = '简介' 49 | pattern3 = '毕业于' 50 | try: 51 | industry = ' '.join(re.findall('行业类别.*',industry_detail_intro)).replace(pattern1,'').replace(' ','') 52 | print('get industry') 53 | except: 54 | industry = None 55 | try: 56 | detail_intro = ' '.join(re.findall('简介.*',industry_detail_intro)).replace(pattern2,'').replace(' ','').replace(':','') 57 | print('get detail_intro') 58 | except: 59 | detail_intro = None 60 | try: 61 | graduated_from = ' '.join(re.findall('毕业于.*',industry_detail_intro)).replace(pattern3,'').replace(' ','') 62 | print('get graduated_from') 63 | except: 64 | graduated_from = None 65 | except: 66 | pass 67 | followers_fans = browser.find_element_by_class_name('tb_counter').text.split('\n') 68 | followers = followers_fans[0] 69 | print('get followers') 70 | fans = followers_fans[2] 71 | print('get fans') 72 | weibo_num = followers_fans[4] 73 | print('get weibo_num') 74 | D0 = {'user_name':user_name, 75 | 'user_info':user_info, 76 | 'industry':industry, 77 | 'detail_intro':detail_intro, 78 | 'followers':followers, 79 | 'fans':fans, 80 | 'weibo_num':weibo_num, 81 | 'url':url, 82 | 'graduated_from':graduated_from} 83 | print('get one user') 84 | result_lst.append(D0) 85 | json_D0 = json.dumps(D0) 86 | result.write(json_D0) 87 | result.write('\n') 88 | result.flush() 89 | except: 90 | pass 91 | return result_lst 92 | 93 | 94 | 95 | if __name__=='__main__': 96 | test = Crawler_Get_Weibo_User_Info() 97 | filename='D:\CSM3.0\爬虫结果\weibo_user/weibo_user_list_try.txt' 98 | resultname='D:\CSM3.0\爬虫结果\weibo_user/weibo_user_info_20180531_2' 99 | firstV3 = test.get_user_info(filename,resultname) -------------------------------------------------------------------------------- /crawler_sys/special_propose/tocsv.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Mar 28 01:12:09 2018 4 | 5 | @author: Administrator 6 | """ 7 | 8 | 9 | import pandas as pd 10 | ttt=pd.DataFrame(comment) 11 | ttt['heiheihei']="'" 12 | ttt['id']=ttt['heiheihei']+ttt['contentid'] 13 | ttt['real_time']=pd.to_datetime(ttt['publishdate'],unit='s') 14 | try: 15 | ttt.to_csv('wuwuwu.csv',encoding='utf-8',index=False) 16 | except UnicodeEncodeError: 17 | pass 18 | -------------------------------------------------------------------------------- /crawler_sys/special_propose/whether_user_exists.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Jun 15 09:49:15 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | from selenium import webdriver 9 | from crawler_sys.utils.Metaorphosis import Meta 10 | import time 11 | import json 12 | 13 | 14 | def open_url(filename,resultname): 15 | result_file = open(resultname,'a') 16 | result_lst = [] 17 | browser = webdriver.Chrome() 18 | browser.maximize_window() 19 | time.sleep(60) 20 | meta = Meta() 21 | url_lst = meta.str_file_to_lst(filename) 22 | for url in url_lst: 23 | browser.get(url) 24 | time.sleep(6) 25 | try: 26 | user_name = browser.find_element_by_class_name('username').text 27 | except: 28 | user_name = None 29 | D0 = {'url':url,'user_name':user_name} 30 | json_D0 = json.dumps(D0) 31 | result_file.write(json_D0) 32 | result_file.write('\n') 33 | result_file.flush() 34 | result_lst.append(D0) 35 | return result_lst 36 | 37 | 38 | if __name__=='__main__': 39 | filename = 'D:/CSM3.0/打杂/whether_exists.txt' 40 | resultname = 'D:/CSM3.0/打杂/whether_exists' 41 | result=open_url(filename,resultname) 42 | -------------------------------------------------------------------------------- /crawler_sys/tools/add_releasers_video_data.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Sep 6 09:22:24 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | import time 9 | from crawler.crawler_sys.framework.platform_crawler_register import get_crawler 10 | from crawler.crawler_sys.utils import trans_format 11 | 12 | def get_target_releaser_video_info(file_name, 13 | output_to_es_raw=True, 14 | es_index=None, 15 | doc_type=None, 16 | releaser_page_num_max=10000): 17 | start_time = int(time.time()*1e3) 18 | task_lst = trans_format.csv_to_lst_with_headline(file_name) 19 | for line in task_lst: 20 | releaserUrl = line['releaserUrl'] 21 | platform = line['platform'] 22 | crawler = get_crawler(platform=platform) 23 | crawler_initialization = crawler() 24 | if platform == 'haokan': 25 | try: 26 | crawler_initialization.releaser_page(releaserUrl=releaserUrl, 27 | releaser_page_num_max=releaser_page_num_max, 28 | output_to_es_raw=True, 29 | es_index=es_index, 30 | doc_type=doc_type, 31 | fetchFavoriteCommnt=False) 32 | except: 33 | print(releaserUrl) 34 | else: 35 | try: 36 | crawler_initialization.releaser_page(releaserUrl=releaserUrl, 37 | releaser_page_num_max=releaser_page_num_max, 38 | output_to_es_raw=True, 39 | es_index=es_index, 40 | doc_type=doc_type) 41 | except: 42 | print(releaserUrl) 43 | end_time = int(time.time()*1e3) 44 | time_info = [start_time, end_time] 45 | return time_info 46 | 47 | if __name__ =='__main__': 48 | get_time = get_target_releaser_video_info(file_name=r'/home/zhouyujiang/cuowu3.csv', 49 | releaser_page_num_max=1000, 50 | es_index='crawler-data-raw', 51 | doc_type='doc' 52 | ) -------------------------------------------------------------------------------- /crawler_sys/tools/add_releasers_video_data_multi_process.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Sep 6 09:22:24 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | import time 9 | from multiprocessing import Pool 10 | from crawler.crawler_sys.framework.platform_crawler_register import get_crawler 11 | from crawler.crawler_sys.utils import trans_format 12 | from elasticsearch import Elasticsearch 13 | from 
elasticsearch.helpers import scan 14 | 15 | hosts = '192.168.17.11' 16 | port = 80 17 | user = 'zhouyujiang' 18 | passwd = '8tM9JDN2LVxM' 19 | http_auth = (user, passwd) 20 | 21 | es = Elasticsearch(hosts=hosts, port=port, http_auth=http_auth) 22 | 23 | def func_search_reUrl_from_target_index(platform, releaser): 24 | search_body = { 25 | "query": { 26 | "bool": { 27 | "filter": [ 28 | {"term": {"platform.keyword": platform}}, 29 | {"term": {"releaser.keyword": releaser}} 30 | ] 31 | } 32 | } 33 | } 34 | search_re = es.search(index='target_releasers', doc_type='doc', body=search_body) 35 | if search_re['hits']['total'] > 0: 36 | return search_re['hits']['hits'][0]['_source']['releaserUrl'] 37 | else: 38 | print('Can not found:', platform, releaser) 39 | return None 40 | 41 | def get_target_releaser_video_info(file_name, 42 | output_to_es_raw=True, 43 | es_index=None, 44 | doc_type=None, 45 | releaser_page_num_max=10000): 46 | start_time = int(time.time()*1e3) 47 | task_lst = trans_format.csv_to_lst_with_headline(file_name) 48 | pool = Pool(10) 49 | arg_dict = {"releaser_page_num_max": releaser_page_num_max, 50 | "output_to_es_raw": True, 51 | "es_index": es_index, 52 | "doc_type": doc_type} 53 | for line in task_lst: 54 | platform = line['platform'] 55 | releaser = line['releaser'] 56 | try: 57 | releaserUrl = line["releaserUrl"] 58 | except: 59 | releaserUrl = func_search_reUrl_from_target_index(platform, releaser) 60 | print("releaserUrl",releaserUrl) 61 | crawler_initialization = get_crawler(platform=platform) 62 | try: 63 | crawler = crawler_initialization().search_page 64 | pool.apply_async(crawler, args=(releaserUrl, ), kwds=arg_dict) 65 | except: 66 | continue 67 | pool.close() 68 | pool.join() 69 | end_time = int(time.time()*1e3) 70 | time_info = [start_time, end_time] 71 | return time_info 72 | 73 | if __name__ =='__main__': 74 | get_time = get_target_releaser_video_info(file_name=r'C:\Users\litao\Desktop\target_releasers - key_custom.csv', 75 | releaser_page_num_max=300, 76 | es_index='crawler-data-raw', 77 | doc_type='doc') -------------------------------------------------------------------------------- /crawler_sys/tools/add_search_keywords_to_es.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # @Time : 2019/7/19 11:29 3 | # @Author : litao 4 | 5 | import json 6 | import datetime 7 | import elasticsearch 8 | import hashlib 9 | import csv 10 | hosts = '192.168.17.11' 11 | port = 80 12 | user = 'zhouyujiang' 13 | passwd = '8tM9JDN2LVxM' 14 | http_auth = (user, passwd) 15 | es = elasticsearch.Elasticsearch(hosts=hosts, port=port, http_auth=http_auth) 16 | 17 | 18 | def write_es(Lst): 19 | count = 0 20 | bulk_all_body = "" 21 | doc_id_type = "all-time-url" 22 | header_Lst = Lst[0] 23 | linec = 1 24 | sha1 = hashlib.sha1() 25 | for line in Lst: 26 | if linec == 1: 27 | linec += 1 28 | continue 29 | linec += 1 30 | print(linec) 31 | line_dict = dict(zip(header_Lst, line)) 32 | dic = { 33 | "title": line_dict["title"], 34 | "timestamp": int(datetime.datetime.now().timestamp() * 1e3), 35 | "platform": line_dict["platform"], 36 | "page": line_dict["page"], 37 | } 38 | 39 | sha1.update((line_dict["title"]+line_dict["platform"]).encode("utf8")) 40 | bulk_head = '{"index": {"_id":"%s"}}' % sha1.hexdigest() 41 | data_str = json.dumps(dic, ensure_ascii=False) 42 | bulk_one_body = bulk_head + '\n' + data_str + '\n' 43 | bulk_all_body += bulk_one_body 44 | count += 1 45 | if count % 500 == 0: 46 | eror_dic = 
es.bulk(index=target_index, doc_type=target_type, 47 | body=bulk_all_body, request_timeout=500) 48 | bulk_all_body = '' 49 | if eror_dic['errors'] is True: 50 | print(eror_dic['items']) 51 | print(bulk_all_body) 52 | print(count) 53 | 54 | if bulk_all_body != '': 55 | eror_dic = es.bulk(body=bulk_all_body, 56 | index=target_index, 57 | doc_type=target_type, 58 | request_timeout=500) 59 | if eror_dic['errors'] is True: 60 | print(eror_dic) 61 | bulk_all_body = '' 62 | # print(platform, releaser, 'end_have:', len(wirte_set), 'add:', len(set_url)) 63 | 64 | 65 | if __name__ == '__main__': 66 | target_index = 'search_keywords' 67 | target_type = 'doc' 68 | 69 | m3 = open(r"D:\work_file\发布者账号\一次性需求附件\keywords.csv", "r", encoding="gb18030") 70 | file = csv.reader(m3) 71 | data = list(file) 72 | write_es(data) 73 | -------------------------------------------------------------------------------- /crawler_sys/tools/add_target_releasers_by_file.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Jun 13 11:57:40 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | import elasticsearch 9 | import json 10 | import time 11 | from crawler_sys.utils.releaser_url_check import test_releaserUrl 12 | from crawler_sys.utils import trans_format 13 | 14 | 15 | hosts = '192.168.17.11' 16 | port = 80 17 | user_id = 'fangyucheng' 18 | password = 'VK0FkWf1fV8f' 19 | http_auth = (user_id, password) 20 | lose_re_url = [] 21 | es = elasticsearch.Elasticsearch(hosts=hosts, port=port, http_auth=http_auth) 22 | 23 | test_lst = trans_format.csv_to_lst_with_headline('F:/add_target_releaser/album_playcnt/album_playcnt_002.csv') 24 | task_lst = [] 25 | 26 | for line in test_lst: 27 | if line['releaserUrl'] is not None: 28 | task_lst.append(line) 29 | 30 | bulk_all_body = '' 31 | 32 | poster = 'fangyucheng' 33 | test_re = test_releaserUrl(task_lst) 34 | 35 | for one_re in test_re: 36 | if one_re['True_or_False'] == 1: 37 | line_dic = {} 38 | post_by = poster 39 | post_time = int(time.time() * 1000) 40 | timestamp = int(time.time() * 1000) 41 | releaserUrl = one_re['releaserUrl'] 42 | platform = one_re['platform'] 43 | releaser = one_re['releaser'] 44 | try: 45 | album_play_count = one_re['album_play_count'] 46 | except: 47 | album_play_count = None 48 | _id = platform + '_' + releaser 49 | 50 | bulk_head = '{"index": {"_id":"%s"}}' % _id 51 | line_dic['is_valid'] = True 52 | line_dic['platform'] = platform 53 | line_dic['post_by'] = post_by 54 | if album_play_count is not None: 55 | line_dic['album_play_count'] = album_play_count 56 | line_dic['post_time'] = post_time 57 | line_dic['releaser'] = releaser 58 | line_dic['releaserUrl'] = releaserUrl 59 | line_dic['timestamp'] = timestamp 60 | data_str=json.dumps(line_dic, ensure_ascii=False) 61 | bulk_one_body = bulk_head + '\n' + data_str + '\n' 62 | bulk_all_body += bulk_one_body 63 | es.bulk(index='target_releasers', doc_type='doc', 64 | body=bulk_all_body, request_timeout=200) 65 | bulk_all_body = '' 66 | print('success') 67 | 68 | -------------------------------------------------------------------------------- /crawler_sys/tools/count_releaser_publish_num_and_refresh_crawler_frequency.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Dec 11 11:59:55 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | import json 9 | import time 10 | import elasticsearch 11 | import elasticsearch.helpers 12 | 13 | hosts = 
'192.168.17.11'
port = 80
user_id = 'fangyucheng'
password = 'VK0FkWf1fV8f'
http_auth = (user_id, password)

es = elasticsearch.Elasticsearch(hosts=hosts, port=port, http_auth=http_auth)

task_list = []
result_list = []

es_scan = elasticsearch.helpers.scan(es, index='target_releasers')

for line in es_scan:
    task_list.append(line)
print('the number of releasers is %s' % len(task_list))

bulk_all_body = ''
count = 0
# iterate over a copy so that task_list.remove() below cannot skip elements
for line in list(task_list):
    releaser_info = line['_source']
    platform = releaser_info['platform']
    releaser = releaser_info['releaser']
    search_body = {"query":{"bool":{"filter":[{"term":{"platform.keyword":platform}},
                                              {"term":{"releaser.keyword":releaser}},
                                              {"term":{"data_month":11}},
                                              {"term":{"data_year":2018}},
                                              {"term":{"stats_type.keyword":"new_released"}}]}}}

    es_search = es.search(index='releaser', doc_type='releasers',
                          body=search_body)
    if es_search['hits']['total'] != 0:
        hits = es_search['hits']['hits'][0]['_source']['video_num']
        releaser_info['Nov_2018'] = int(hits)
        print("releaser %s hit %s video in es" % (releaser, hits))
    else:
        releaser_info['Nov_2018'] = 0
        task_list.remove(line)
    total = releaser_info['Nov_2018']
    if total >= 900:
        releaser_info['frequency'] = 9
        print("%s frequency is 9" % releaser_info['releaser'])
    elif total >= 300:
        releaser_info['frequency'] = 3
        print("%s frequency is 3" % releaser_info['releaser'])
        count += 1
    else:
        releaser_info['frequency'] = 1
    _id = platform + '_' + releaser
    bulk_head = '{"index": {"_id":"%s"}}' % _id
    releaser_info['timestamp'] = int(time.time() * 1e3)
    data_str = json.dumps(releaser_info, ensure_ascii=False)
    bulk_one_body = bulk_head+'\n'+data_str+'\n'
    bulk_all_body += bulk_one_body
    es.bulk(index='target_releasers', doc_type='doc',
            body=bulk_all_body)
    bulk_all_body = ''
    print('write %s into es' % releaser)
--------------------------------------------------------------------------------
/crawler_sys/tools/crawler_url_video_info.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Fetch the play count and other stats of a video page by its url

@author: zhouyujiang
"""

import time
from crawler.crawler_sys.framework.platform_crawler_register import get_crawler
from crawler.crawler_sys.utils import trans_format


def get_target_video_info(platform, url):
    crawler = get_crawler(platform=platform)
    crawler_initialization = crawler()
    new_playcount = crawler_initialization.check_play_count_by_video_page(url=url)
    return new_playcount


#if __name__ =='__main__':
#    get_time = get_target_releaser_video_info(file_name=r'/home/zhouyujiang/cuowu3.csv',
#                                              releaser_page_num_max=1000,
#                                              es_index='crawler-data-raw',
#                                              doc_type='doc'
#                                              )
--------------------------------------------------------------------------------
/crawler_sys/tools/error.log:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_sys/tools/error.log
--------------------------------------------------------------------------------
/crawler_sys/tools/find_high_fre_releaser.py:
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Oct 23 10:03:53 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | import configparser 9 | from elasticsearch import Elasticsearch 10 | 11 | hosts = '192.168.17.11' 12 | port = 80 13 | user_id = 'fangyucheng' 14 | password = 'VK0FkWf1fV8f' 15 | http_auth = (user_id, password) 16 | es_connection = Elasticsearch(hosts=hosts, port=port, http_auth=http_auth) 17 | 18 | search_body = {"query":{"bool":{"filter":[{"term":{"platform.keyword":"haokan"}}]}}, 19 | "sort":[{"Nov_2018":{"order":"desc"}}]} 20 | 21 | es_search = es_connection.search(index='target_releasers', 22 | doc_type='doc', 23 | body=search_body, size=1000) 24 | 25 | es_data_lst = es_search['hits']['hits'] 26 | 27 | result_list = [] 28 | 29 | for line in es_data_lst: 30 | data_dic = line['_source'] 31 | result_list.append(data_dic) 32 | 33 | new_list = result_list[:40] 34 | 35 | result_list = [] 36 | 37 | releaser_dic = {} 38 | for line in new_list: 39 | releaser_dic[line['releaser']] = line['releaserUrl'] 40 | 41 | 42 | config = configparser.ConfigParser() 43 | config['haokan'] = releaser_dic 44 | 45 | with open ('high_fre.ini', 'w', encoding='utf-8') as ini: 46 | config.write(ini) 47 | 48 | 49 | 50 | #special task 51 | #for line in source_lst: 52 | # detail_lst = line['detail'] 53 | # csm_mdu = detail_lst[0]['csm_mdu'] 54 | # for detail_dic in detail_lst: 55 | # detail_dic.pop('csm_mdu') 56 | # line['csm_mdu'] = csm_mdu -------------------------------------------------------------------------------- /crawler_sys/tools/match_play_count_by_video_title.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Nov 23 13:48:33 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | from elasticsearch import Elasticsearch 9 | from crawler.crawler_sys.utils.trans_format import lst_to_csv 10 | 11 | hosts = '192.168.17.11' 12 | port = 80 13 | user_id = 'fangyucheng' 14 | password = 'VK0FkWf1fV8f' 15 | http_auth = (user_id, password) 16 | es_connection = Elasticsearch(hosts=hosts, port=port, http_auth=http_auth) 17 | 18 | unsolve_lst = [] 19 | result_lst2 =[] 20 | 21 | for line in task_list: 22 | url = line['url'] 23 | title = line['title'] 24 | search_body = {"query": {"bool": {"filter": [{"term": {"title.keyword": title}}]}}} 25 | search = es_connection.search(index="test2", doc_type="fyc1210", body=search_body) 26 | if search["hits"]["total"] == 0: 27 | unsolve_lst.append(url) 28 | print("can not get video data at %s" % url) 29 | else: 30 | video_data = search["hits"]["hits"][0]["_source"] 31 | result_lst2.append(video_data) 32 | print("get playcount at %s" % url) 33 | 34 | lst_to_csv(listname=result_lst2, 35 | csvname="F:/add_target_releaser/Nov/Sep2.csv") -------------------------------------------------------------------------------- /crawler_sys/tools/match_play_count_by_video_url.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Nov 23 14:44:07 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | import elasticsearch.helpers 9 | from elasticsearch import Elasticsearch 10 | from crawler.crawler_sys.utils.trans_format import lst_to_csv 11 | from crawler.crawler_sys.utils.trans_format import str_file_to_lst 12 | from crawler.crawler_sys.utils.trans_format import str_lst_to_file 13 | #from crawler.crawler_sys.utils.trans_format import 
csv_to_lst_with_headline 14 | 15 | hosts = '192.168.17.11' 16 | port = 80 17 | user_id = 'fangyucheng' 18 | password = 'VK0FkWf1fV8f' 19 | http_auth = (user_id, password) 20 | es_connection = Elasticsearch(hosts=hosts, port=port, http_auth=http_auth) 21 | 22 | 23 | def init_task_list(file_path=None): 24 | task_list = [] 25 | if file_path is None: 26 | es_scan = elasticsearch.helpers.scan(es_connection, index='album-play-count') 27 | for line in es_scan: 28 | video_dict = line['_source'] 29 | task_list.append(video_dict) 30 | return task_list 31 | else: 32 | task_list = str_file_to_lst(file_path) 33 | return task_list 34 | 35 | unsolve_lst = [] 36 | result_lst2 =[] 37 | 38 | 39 | task_list = str_file_to_lst('F:/add_target_releaser/album_play_count/dec') 40 | #task_list = init_task_list() 41 | 42 | 43 | for line in task_list: 44 | try: 45 | if type(line) == dict: 46 | url = line['url'] 47 | elif type(line) == str: 48 | url = line 49 | search_body = {"query": {"bool": {"filter": [{"term": {"url.keyword": url}}]}}} 50 | search = es_connection.search(index="test2", doc_type="dec", body=search_body) 51 | if search["hits"]["total"] == 0: 52 | unsolve_lst.append(url) 53 | print("can not get video data at %s" % url) 54 | else: 55 | video_data = search["hits"]["hits"][0]["_source"] 56 | result_lst2.append(video_data) 57 | print("get playcount at %s" % url) 58 | except: 59 | pass 60 | 61 | lst_to_csv(listname=result_lst2, 62 | csvname="F:/add_target_releaser/last_month/fix_play_count12242.csv") 63 | str_lst_to_file(unsolve_lst, 64 | filename="F:/add_target_releaser/last_month/unsolved") 65 | -------------------------------------------------------------------------------- /crawler_sys/tools/ocr_by_aliyun.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # @Time : 2020/5/29 15:52 3 | # @Author : litao 4 | 5 | 6 | import os,re 7 | from aliyunsdkcore.client import AcsClient 8 | from aliyunsdkcore.acs_exception.exceptions import ClientException 9 | from aliyunsdkcore.acs_exception.exceptions import ServerException 10 | from aliyunsdkocr.request.v20191230.RecognizeCharacterRequest import RecognizeCharacterRequest 11 | from crawler.crawler_sys.framework.config.oss_keyword import AccessKeyId,AccessKeySecret 12 | import oss2 13 | from viapi.fileutils import FileUtils 14 | 15 | 16 | endpoint = "oss-cn-beijing.aliyuncs.com" 17 | # region = "v-plus-scope.oss-cn-beijing.aliyuncs.com" 18 | region = "v-plus-scope" 19 | oss_url_expries = 3600 20 | access_key_id = AccessKeyId 21 | access_key_secret = AccessKeySecret 22 | bucket_name = "v-plus-scope" 23 | auth = oss2.Auth(access_key_id, access_key_secret) 24 | bucket = oss2.Bucket(auth, endpoint, bucket_name) 25 | client = AcsClient(AccessKeyId, AccessKeySecret, 'cn-shanghai') 26 | 27 | 28 | def put_obj(access_key_id, access_key_secret, region, bucket_name, object_name): 29 | fileobj_content = open("ocr_img/") 30 | # oss2.set_file_logger(log_file_path, 'oss2', logging.ERROR) 31 | auth = oss2.Auth(access_key_id, access_key_secret) 32 | bucket = oss2.Bucket(auth, region, bucket_name) 33 | bucket.put_object(object_name, fileobj_content) 34 | file_utils = FileUtils(AccessKeyId, AccessKeySecret) 35 | oss_url = file_utils.get_oss_url("http://xxx.jpeg", "jpg", False) 36 | 37 | def ocr_from_aliyun(file_name=""): 38 | request = RecognizeCharacterRequest() 39 | request.set_accept_format('json') 40 | path_name = "ocr_img/%s"%file_name 41 | put_obj(AccessKeyId, AccessKeySecret, endpoint, region, path_name,) 42 | 
request.set_ImageURL("http://explorer-image.oss-cn-shanghai.aliyuncs.com/270450672578492833/2020-05-01+200210.png?OSSAccessKeyId=LTAI4Fk9FstqSEYnqKJ5Dpeo&Expires=1590740750&Signature=ZggX6U2%2F3WvpSUpR9P8EYrD0vbQ%3D") 43 | request.set_MinHeight(15) 44 | request.set_OutputProbability(True) 45 | 46 | response = client.do_action_with_exception(request) 47 | # python2: print(response) 48 | print(str(response, encoding='utf-8')) 49 | 50 | 51 | def file_path_scan(file_path): 52 | for filename in os.listdir(file_path): 53 | path = os.path.join(file_path, filename) 54 | if not os.path.isfile(path): 55 | continue 56 | title = img_to_str(path, lang=Languages.CHS) 57 | print(title) 58 | try: 59 | play_count = re.findall("\d+",title)[0] 60 | #print(play_count) 61 | except: 62 | #print(title) 63 | play_count= 0 64 | yield filename,play_count 65 | 66 | 67 | file_path = r'D:\work_file\word_file_new\litao\num' 68 | for filename,play_count in file_path_scan(file_path): 69 | time_str = filename.replace(".png","") 70 | time_str = time_str[0:13] +":"+ time_str[13:15]+":"+ time_str[15:] 71 | # print(time_str) 72 | print(time_str,play_count) -------------------------------------------------------------------------------- /crawler_sys/tools/ocr_by_img.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # @Time : 2020/5/29 10:11 3 | # @Author : litao 4 | 5 | from PIL import Image 6 | import pytesseract,os,re 7 | import cv2 8 | import argparse 9 | import cv2 10 | import os 11 | # construct the argument parse and parse the arguments 12 | ap = argparse.ArgumentParser() 13 | ap.add_argument("-p", "--preprocess", type=str, default="thresh", 14 | help="type of preprocessing to be done") 15 | args = vars(ap.parse_args()) 16 | class Languages: 17 | CHS = 'chi_sim' 18 | ENG = 'eng' 19 | 20 | def img_to_str(image_path, lang=Languages.CHS): 21 | # img = Image.open(image_path) 22 | # width, height = img.size 23 | # img.show() 24 | # mode = img.mode 25 | 26 | # print(img.size) 27 | # thumb = img.crop((10,42,160,150)) 28 | # img.grab(0,0,250,200) 29 | # thumb.save("thumb.jpg") 30 | # image = cv2.imread(image_path) 31 | # gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) 32 | # # check to see if we should apply thresholding to preprocess the 33 | # # image 34 | # if args["preprocess"] == "thresh": 35 | # gray = cv2.threshold(gray, 0, 255, 36 | # cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1] 37 | # # make a check to see if median blurring should be done to remove 38 | # # noise 39 | # elif args["preprocess"] == "blur": 40 | # gray = cv2.medianBlur(gray, 3) 41 | # # write the grayscale image to disk as a temporary file so we can 42 | # # apply OCR to it 43 | # filename = "thumb.png" 44 | # cv2.imwrite(filename, gray) 45 | # thumb = img.crop((40, 30, 100, 70)) 46 | #img.grab((30, 30, 150, 80)) 47 | # thumb.save("thumb.jpg") 48 | # ,config="-psm 7 digits" 49 | img = Image.open(image_path) 50 | # thumb = img.crop((10,42,160,150)) 51 | # thumb = img.crop((40, 30, 100, 70)) 52 | thumb = img.crop((490, 0, 560, 60)) 53 | thumb.save("thumb.jpg") 54 | return pytesseract.image_to_string(thumb, lang,config="-psm 7 digits") 55 | 56 | def file_path_scan(file_path): 57 | for filename in os.listdir(file_path): 58 | path = os.path.join(file_path, filename) 59 | if not os.path.isfile(path): 60 | continue 61 | title = img_to_str(path, lang=Languages.CHS) 62 | print(title) 63 | try: 64 | play_count = re.findall("\d+",title)[0] 65 | #print(play_count) 66 | except: 67 | #print(title) 68 | 
play_count= 0 69 | yield filename,play_count 70 | 71 | 72 | file_path = r'D:\work_file\word_file_new\litao\num' 73 | for filename,play_count in file_path_scan(file_path): 74 | time_str = filename.replace(".png","") 75 | time_str = time_str[0:13] +":"+ time_str[13:15]+":"+ time_str[15:] 76 | # print(time_str) 77 | print(time_str,play_count) 78 | 79 | # print(img_to_str(r'D:\work_file\word_file_new\litao\screen\2020-04-16 202632.png', lang=Languages.CHS)) -------------------------------------------------------------------------------- /crawler_sys/tools/tudou_selenium.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | import datetime 3 | import re 4 | import pandas as pd 5 | 6 | 7 | class Craler_tudou(object): 8 | def __init__(self): 9 | chrome_options = webdriver.ChromeOptions() 10 | prefs = {"profile.managed_default_content_settings.images": 2} 11 | chrome_options.add_experimental_option("prefs", prefs) 12 | self.driver = webdriver.Chrome(chrome_options=chrome_options) 13 | 14 | 15 | @staticmethod 16 | def video_time(time_str): 17 | now = datetime.datetime.now() 18 | if "分钟前" in time_str: 19 | min_str = re.findall(r"(\d+)分钟前",time_str)[0] 20 | videotime = now - datetime.timedelta(minutes=int(min_str)) 21 | elif "小时前" in time_str: 22 | hour_str = re.findall(r"(\d+)小时前", time_str)[0] 23 | videotime = now - datetime.timedelta(hours=int(hour_str)) 24 | elif "昨天" in time_str: 25 | date_lis = time_str.split(" ") 26 | hours, mins = date_lis[1].split(":") 27 | last_day = now - datetime.timedelta(days=1) 28 | videotime = datetime.datetime(year=int(last_day.year), month=int(last_day.month), day=int(last_day.day), hour=int(hours), minute=int(mins)) 29 | elif "前天" in time_str: 30 | date_lis = time_str.split(" ") 31 | hours, mins = date_lis[1].split(":") 32 | last_day = now - datetime.timedelta(days=2) 33 | videotime = datetime.datetime(year=int(last_day.year), month=int(last_day.month), day=int(last_day.day), hour=int(hours), minute=int(mins)) 34 | elif "天前" in time_str: 35 | day_str = re.findall(r"(\d+)天前", time_str)[0] 36 | videotime = now - datetime.timedelta(days=int(day_str)) 37 | elif "刚刚" in time_str: 38 | videotime = now 39 | else: 40 | if str(now.year) in time_str: 41 | pass 42 | else: 43 | date_lis = time_str.split(" ") 44 | month,days = date_lis[0].split("-") 45 | hours,mins = date_lis[1].split(":") 46 | videotime = datetime.datetime(year=int(now.year),month=int(month),day=int(days),hour=int(hours),minute=int(mins)) 47 | 48 | # print(videotime.strftime("%Y-%m-%d %H:%M:%S")) 49 | return videotime 50 | 51 | def time_range_video_num(self,start_time,end_time,url_list): 52 | data_lis = [] 53 | info_lis = [] 54 | columns = [""] 55 | for dic in url_list: 56 | for res in self.get_page(dic["url"]): 57 | title,link,video_time = res 58 | print(res) 59 | if start_time < video_time < end_time: 60 | data_lis.append((title,link,video_time,dic["url"])) 61 | else: 62 | break 63 | csv_save = pd.DataFrame(data_lis) 64 | csv_save.to_csv("%s.csv" % (dic["platform"] + "_" + dic["releaser"]),encoding="GBK") 65 | info_lis.append([dic["platform"],dic["releaser"],len(data_lis)]) 66 | data_lis = [] 67 | csv_save = pd.DataFrame(info_lis) 68 | csv_save.to_csv("%s.csv" % (datetime.datetime.now().strftime("%Y-%m-%d %H-%M-%S")), encoding="GBK") 69 | self.driver.quit() 70 | 71 | def get_page(self,url): 72 | #video_page = self.driver.get(url) 73 | # js = 'window.open("%s");' % url 74 | # self.driver.execute_script(js) 75 | page_num = -1 76 | try: 77 | 
video_page = self.driver.get(url) 78 | while True: 79 | page_num += 1 80 | if page_num != 0: 81 | self.driver.find_element_by_class_name("next").click() 82 | video_lis = self.driver.find_elements_by_xpath("/html/body/div[2]/div/div[3]/div/div/div/div[2]/div/div/div/div[1]/div") 83 | for v in video_lis: 84 | v_a = v.find_element_by_xpath("./div[2]/a") 85 | title = v_a.get_attribute("title") 86 | link = v_a.get_attribute("href") 87 | video_time = self.video_time(v.find_element_by_class_name("v-publishtime").text) 88 | 89 | yield (title,link,video_time) 90 | 91 | except Exception as e: 92 | raise e 93 | print(e) 94 | print("page %s has no more data" % page_num) 95 | 96 | if __name__ == "__main__": 97 | test = Craler_tudou() 98 | url_lis = [ 99 | {"platform":"new_tudou", 100 | "url":"https://id.tudou.com/i/UNTk2NjE0MDM4NA==/videos?", 101 | "releaser":"酷娱文化先锋" 102 | }, 103 | {"platform": "new_tudou", 104 | "url": "https://id.tudou.com/i/UMTQ3MDM0MjAw/videos?", 105 | "releaser": "酷娱文化先锋" 106 | }] 107 | start_time = datetime.datetime(year=2019,month=6,day=6) 108 | end = datetime.datetime.now() 109 | test.time_range_video_num(start_time,end,url_lis) -------------------------------------------------------------------------------- /crawler_sys/tools/video_num_count.rar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_sys/tools/video_num_count.rar -------------------------------------------------------------------------------- /crawler_sys/tools/video_num_count/.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /workspace.xml -------------------------------------------------------------------------------- /crawler_sys/tools/video_num_count/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /crawler_sys/tools/video_num_count/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /crawler_sys/tools/video_num_count/.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /crawler_sys/tools/video_num_count/.idea/video_num_count.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 14 | -------------------------------------------------------------------------------- /crawler_sys/tools/video_num_count/2019-09-10.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_sys/tools/video_num_count/2019-09-10.csv -------------------------------------------------------------------------------- /crawler_sys/tools/video_num_count/count.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_sys/tools/video_num_count/count.csv -------------------------------------------------------------------------------- 
/crawler_sys/tools/video_num_count/count_main.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # @Time : 2019/6/11 15:26 3 | # @Author : litao 4 | import datetime 5 | 6 | from crawler.haokan_count import * 7 | from crawler.qq_video_count import * 8 | from crawler.toutiao_count import * 9 | from crawler.tudou_count import * 10 | from crawler.crawler_wangyi_news import * 11 | from crawler.crawler_tencent_news import * 12 | from concurrent.futures import ProcessPoolExecutor 13 | 14 | craler_site = { 15 | "haokan": Craler_haokan(), 16 | "toutiao": Craler_toutiao(), 17 | "腾讯视频": Craler_qq(), 18 | "new_tudou": Craler_tudou(), 19 | "网易新闻": Crawler_wangyi_news(), 20 | "腾讯新闻": Crawler_Tencent_News() 21 | } 22 | 23 | 24 | def start_count(releaser, platform, releaserUrl, re_s_t, re_e_t): 25 | craler = craler_site.get(platform) 26 | if craler: 27 | url_lis = [ 28 | { 29 | "platform": platform, 30 | "url": releaserUrl, 31 | "releaser": releaser 32 | } 33 | ] 34 | craler.time_range_video_num(re_s_t, re_e_t, url_lis) 35 | 36 | 37 | if __name__ == "__main__": 38 | miaopai_list = [] 39 | platform_dic = { 40 | "haokan": [], 41 | "toutiao": [], 42 | "腾讯视频": [], 43 | "new_tudou": [], 44 | "网易新闻": [], 45 | "腾讯新闻":[] 46 | } 47 | file = r'count.csv' 48 | #file = r'D:\wxfile\WeChat Files\litaolemo\FileStorage\File\2019-07\count(3).csv' 49 | now = int(datetime.datetime.now().timestamp() * 1e3) 50 | executor = ProcessPoolExecutor(max_workers=6) 51 | futures = [] 52 | with open(file, 'r', encoding="gb18030")as f: 53 | header_Lst = f.readline().strip().split(',') 54 | for line in f: 55 | line_Lst = line.strip().split(',') 56 | line_dict = dict(zip(header_Lst, line_Lst)) 57 | releaser = line_dict['releaser'] 58 | platform = line_dict['platform'] 59 | releaserUrl = line_dict['releaserUrl'] 60 | re_s_t = line_dict['开始时间'] 61 | re_e_t = line_dict['结束时间'] 62 | start_time_lis = re_s_t.split("/") 63 | end_time_lis = re_e_t.split("/") 64 | start_time_stamp = int(datetime.datetime(year=int(start_time_lis[0]), month=int(start_time_lis[1]), 65 | day=int(start_time_lis[2])).timestamp() * 1e3) 66 | end_time__stamp = int(datetime.datetime(year=int(end_time_lis[0]), month=int(end_time_lis[1]), 67 | day=int(end_time_lis[2])).timestamp() * 1e3) 68 | # future = executor.submit(start_count, releaser, platform, releaserUrl, start_time_stamp, end_time__stamp) 69 | # futures.append(future) 70 | start_count(releaser, platform, releaserUrl, start_time_stamp, end_time__stamp) 71 | executor.shutdown(True) 72 | print('+++>完成') 73 | # for future in futures: 74 | # print(future.result()) 75 | -------------------------------------------------------------------------------- /crawler_sys/tools/video_num_count/crawler/2019-09-10.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_sys/tools/video_num_count/crawler/2019-09-10.csv -------------------------------------------------------------------------------- /crawler_sys/tools/video_num_count/crawler/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # @Time : 2019/6/11 15:59 3 | # @Author : litao 4 | -------------------------------------------------------------------------------- /crawler_sys/tools/video_num_count/crawler/__pycache__/__init__.cpython-37.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_sys/tools/video_num_count/crawler/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /crawler_sys/tools/video_num_count/crawler/__pycache__/crawler_tencent_news.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_sys/tools/video_num_count/crawler/__pycache__/crawler_tencent_news.cpython-37.pyc -------------------------------------------------------------------------------- /crawler_sys/tools/video_num_count/crawler/__pycache__/crawler_wangyi_news.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_sys/tools/video_num_count/crawler/__pycache__/crawler_wangyi_news.cpython-37.pyc -------------------------------------------------------------------------------- /crawler_sys/tools/video_num_count/crawler/__pycache__/func_get_releaser_id.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_sys/tools/video_num_count/crawler/__pycache__/func_get_releaser_id.cpython-37.pyc -------------------------------------------------------------------------------- /crawler_sys/tools/video_num_count/crawler/__pycache__/haokan_count.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_sys/tools/video_num_count/crawler/__pycache__/haokan_count.cpython-37.pyc -------------------------------------------------------------------------------- /crawler_sys/tools/video_num_count/crawler/__pycache__/qq_video_count.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_sys/tools/video_num_count/crawler/__pycache__/qq_video_count.cpython-37.pyc -------------------------------------------------------------------------------- /crawler_sys/tools/video_num_count/crawler/__pycache__/toutiao_count.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_sys/tools/video_num_count/crawler/__pycache__/toutiao_count.cpython-37.pyc -------------------------------------------------------------------------------- /crawler_sys/tools/video_num_count/crawler/__pycache__/trans_duration_str_to_second.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_sys/tools/video_num_count/crawler/__pycache__/trans_duration_str_to_second.cpython-37.pyc -------------------------------------------------------------------------------- /crawler_sys/tools/video_num_count/crawler/__pycache__/trans_strtime_to_timestamp.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_sys/tools/video_num_count/crawler/__pycache__/trans_strtime_to_timestamp.cpython-37.pyc -------------------------------------------------------------------------------- /crawler_sys/tools/video_num_count/crawler/__pycache__/tudou_count.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_sys/tools/video_num_count/crawler/__pycache__/tudou_count.cpython-37.pyc -------------------------------------------------------------------------------- /crawler_sys/tools/video_num_count/crawler/haokan_青春旅社.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_sys/tools/video_num_count/crawler/haokan_青春旅社.csv -------------------------------------------------------------------------------- /crawler_sys/tools/video_num_count/crawler/trans_duration_str_to_second.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Sep 10 15:04:04 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | def trans_duration(duration_str): 9 | """suitable for 20:20, 20:20:10""" 10 | duration_lst = duration_str.split(':') 11 | if len(duration_lst) == 3: 12 | duration = int(int(duration_lst[0]) * 3600 + int(duration_lst[1]) * 60 + int(duration_lst[2])) 13 | return duration 14 | elif len(duration_lst) == 2: 15 | duration = int(int(duration_lst[0]) * 60 + int(duration_lst[1])) 16 | return duration 17 | -------------------------------------------------------------------------------- /crawler_sys/tools/video_num_count/crawler/首次运行.bat: -------------------------------------------------------------------------------- 1 | python -m pip install --upgrade pip 2 | python -m pip install xlutils 3 | python -m pip install requests 4 | python -m pip install bs4 5 | python -m pip install pandas -------------------------------------------------------------------------------- /crawler_sys/tools/video_num_count/haokan_看看新闻Knews.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/crawler_sys/tools/video_num_count/haokan_看看新闻Knews.csv -------------------------------------------------------------------------------- /crawler_sys/tools/write_feihua_json_to_es.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import datetime 4 | import elasticsearch 5 | from write_data_into_es.func_cal_doc_id import * 6 | hosts = '192.168.17.11' 7 | port = 80 8 | user = 'zhouyujiang' 9 | passwd = '8tM9JDN2LVxM' 10 | http_auth = (user, passwd) 11 | es = elasticsearch.Elasticsearch(hosts=hosts, port=port, http_auth=http_auth) 12 | from write_data_into_es.func_get_releaser_id import get_releaser_id 13 | 14 | def dic(): 15 | url_data = [] 16 | format_data = {} 17 | f = open("E:\M_2019-05-22_U_3", "r") 18 | for i in f.readlines(): 19 | url_data.append(json.loads(i)) 20 | for i in url_data: 21 | format_data[i["url"]] = "" 22 | print(len(format_data)) 23 | 24 | def get_data(): 25 | url_data = [] 26 | format_data = [] 27 | # url = 
"https://enlightent-backup.oss-cn-beijing.aliyuncs.com/csm/20190318-20190324/csm_douyin_20190318_20190324.json?Expires=1554243367&OSSAccessKeyId=TMP.AQG2JUd3g4Gv66npoCNJPVnH-r9yRqhMGwqJtilxiBCDsbRJJ4kTuiE_T17CMC4CFQC8gXq7WHE73SSE9s2DjpWzF7Y2TwIVAIeJz9r0QHkaPi8FGyzN1TXmsjvn&Signature=XsHnMu%2B4agHS6Z6tq%2B55WWaZjDk%3D" 28 | # res = requests.get(url) 29 | # with open("./url_json.json","w") as f: 30 | # f.write(res.text) 31 | 32 | # f = open("E:\M_2019-05-22_U_3", "r") 33 | # for i in f.readlines(): 34 | # url_data.append(json.loads(i)) 35 | for i in url_data: 36 | print(i) 37 | format_data.append( 38 | { 39 | "platform": i["platform"], 40 | "duration": i["duration"], 41 | "favorite_count": i["favorite"], 42 | "fetch_time": int(i["crawledtime"])*1000, 43 | "play_count": i["playtimes"], 44 | "release_time": i["releasetime"], 45 | "releaser": i["releaser"], 46 | "title": i["title"], 47 | "url": i["url"], 48 | "comment_count": i["commentnum"], 49 | "dislike_count": 0, 50 | "isOriginal": False, 51 | "releaserUrl": i["releaserurl"], 52 | "repost_count": 0, 53 | "timestamp": int(datetime.datetime.timestamp(datetime.datetime.now()))*1000, 54 | "data_provider": "fhtech", 55 | "channel": i["channel"], 56 | "releaser_id_str":"miaopai_" + get_releaser_id(platform="miaopai",releaserUrl=i["releaserurl"]) 57 | } 58 | ) 59 | return format_data 60 | # target_date_list = target_type.split('-') 61 | # target_date_start = datetime.datetime(int(target_date_list[-3]), int(target_date_list[-2]), 1) 62 | # target_date_end = datetime.datetime(int(target_date_list[-3]), int(target_date_list[-2]) + 1, 1) 63 | # target_ts_start = int(target_date_start.timestamp()) * 1000 64 | # target_ts_end = int(target_date_end.timestamp()) * 1000 65 | # print(target_ts_start) 66 | # print(target_ts_end) 67 | 68 | 69 | def write_es(file): 70 | count = 0 71 | bulk_all_body = "" 72 | doc_id_type = "all-time-url" 73 | for i in file: 74 | #print(i) 75 | # format_i = {} 76 | # a = "format_i = %s" % i 77 | # exec(a,format_i) 78 | format_i = json.loads(i) 79 | # format_i = format_i["format_i"] 80 | # print(format_i) 81 | try: 82 | _id = cal_doc_id(platform=format_i["platform"], url=format_i["url"], doc_id_type=doc_id_type,data_dict=format_i) 83 | format_i["timestamp"] = int(datetime.datetime.now().timestamp()*1e3) 84 | if len(str(format_i["release_time"])) != 13: 85 | print(format_i["release_time"]) 86 | format_i["release_time"] = int(format_i["release_time"] / 1000) 87 | format_i["releaser_id_str"] = "miaopai_" + get_releaser_id(platform="miaopai",releaserUrl=format_i["releaserUrl"]) 88 | bulk_head = '{"index": {"_id":"%s"}}' % _id 89 | except Exception as e: 90 | print(e) 91 | continue 92 | # find_exist = { 93 | # "query": { 94 | # "bool": { 95 | # "filter": [ 96 | # {"term": {"_id":_id }} 97 | # ] 98 | # } 99 | # } 100 | # } 101 | # search_re = es.search(index=target_index, doc_type=target_type, 102 | # body=find_exist) 103 | # if search_re['hits']['total'] == 1: 104 | # if counti % 1000 == 0: 105 | # print("done ", counti,"\n") 106 | # return None 107 | # else: 108 | # pass 109 | 110 | data_str = json.dumps(format_i, ensure_ascii=False) 111 | bulk_one_body = bulk_head + '\n' + data_str + '\n' 112 | bulk_all_body += bulk_one_body 113 | count += 1 114 | print("find",count) 115 | if count % 1000 == 0: 116 | eror_dic = es.bulk(index=target_index, doc_type=target_type, 117 | body=bulk_all_body, request_timeout=200) 118 | bulk_all_body = '' 119 | if eror_dic['errors'] is True: 120 | print(eror_dic['items']) 121 | print(bulk_all_body) 122 | 
print(count) 123 | 124 | if bulk_all_body != '': 125 | eror_dic = es.bulk(body=bulk_all_body, 126 | index=target_index, 127 | doc_type=target_type, 128 | request_timeout=200) 129 | if eror_dic['errors'] is True: 130 | print(eror_dic) 131 | bulk_all_body = '' 132 | #print(platform, releaser, 'end_have:', len(wirte_set), 'add:', len(set_url)) 133 | 134 | 135 | if __name__ == '__main__': 136 | target_index = 'short-video-all-time-url' 137 | target_type = 'all-time-url' 138 | m3 = open(r"C:\Users\litao\Desktop\csv\202002\M_2020-02-04_U_3", "r", encoding="utf-8") 139 | # f = open("exists", "a+") 140 | write_es(m3) 141 | -------------------------------------------------------------------------------- /crawler_sys/tools/头条重复数据导出.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Feb 14 16:12:57 2019 4 | 5 | @author: zhouyujiang 6 | 7 | 查找切片中头条发布者+发布时间+duration相同的数据 8 | """ 9 | 10 | import pandas as pd 11 | import datetime 12 | import elasticsearch 13 | from elasticsearch.helpers import scan 14 | from crawler_url_video_info import get_target_video_info 15 | hosts='192.168.17.11' 16 | port=80 17 | user='zhouyujiang' 18 | passwd='8tM9JDN2LVxM' 19 | http_auth=(user, passwd) 20 | es=elasticsearch.Elasticsearch(hosts=hosts, port=port, http_auth=http_auth) 21 | 22 | 23 | 24 | zyj_set = set() 25 | zyj_dict = {} 26 | index = 'short-video-weekly' 27 | doc_type = 'daily-url-2019_w07_s1' 28 | re_s_t = 1549728000000 29 | re_e_t = 1550332800000 30 | count = 0 31 | sacn_body = { 32 | "query": { 33 | "bool": { 34 | "filter": [ 35 | {"term": {"platform.keyword": 'toutiao'}}, 36 | {"range": {"release_time": {"gte": re_s_t,"lt":re_e_t}}} 37 | ] 38 | } 39 | } 40 | } 41 | scan_re = scan(client=es, index=index, doc_type=doc_type, 42 | query=sacn_body, scroll='3m') 43 | for one in scan_re: 44 | count = count +1 45 | if count %1000 == 0: 46 | print(count) 47 | line = one['_source'] 48 | releaser = line['releaser'] 49 | release_time = line['release_time'] 50 | duration = line['duration'] 51 | zyj_id = releaser + str(release_time) + str(duration) 52 | if zyj_id not in zyj_dict: 53 | zyj_dict[zyj_id] = [] 54 | zyj_dict[zyj_id].append(line) 55 | else: 56 | zyj_set.add(zyj_id) 57 | zyj_dict[zyj_id].append(line) 58 | re_list = [] 59 | for one_key in zyj_set: 60 | for one_value in zyj_dict[one_key]: 61 | # url = one_value['url'] 62 | # new_playcount = get_target_video_info(url=url, platform='toutiao') 63 | # one_value['new_playcount'] = new_playcount 64 | re_list.append(one_value) 65 | 66 | 67 | 68 | 69 | 70 | data = pd.DataFrame(re_list) 71 | data.to_csv('头条7zhou重复数据重新抓取播放量.csv') 72 | 73 | -------------------------------------------------------------------------------- /crawler_sys/utils/__init__ .py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue June 6 18:02:43 2018 4 | 5 | @author: hanye 6 | """ 7 | 8 | -------------------------------------------------------------------------------- /crawler_sys/utils/connect_with_es.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Jun 6 18:18:09 2018 4 | 5 | @author: hanye 6 | """ 7 | #import redis 8 | #from crawler_sys.framework.platform_redis_register import get_redis_list_name 9 | from crawler.crawler_sys.framework.es_crawler import scan_crawler_url_register 10 | 11 | #rds = redis.StrictRedis(host='192.168.17.60', 
port=6379, db=0) 12 | 13 | def pull_url_from_es(platform, release_time_lower_bdr=None): 14 | """ 15 | Just pull urls from es index crawler-url-register. 16 | Url reforming things will be done in the method who 17 | is responsible for pushing urls into redis. 18 | Just return url and its platform 19 | """ 20 | if release_time_lower_bdr is None: 21 | release_time_lower_bdr = 0 22 | else: 23 | pass 24 | search_body = {"query": {"bool": {"filter": [{"range": {"release_time": 25 | {"gte": release_time_lower_bdr}}}, 26 | {"term": {"platform.keyword": platform}}]}}} 27 | total_hit, scan_resp = scan_crawler_url_register(search_body) 28 | batch_url_Lst = [] 29 | if total_hit > 0: 30 | line_counter = 0 31 | for line in scan_resp: 32 | line_counter += 1 33 | line_d = line['_source'] 34 | url = line_d['url'] 35 | batch_url_Lst.append(url) 36 | else: 37 | pass 38 | return batch_url_Lst 39 | 40 | 41 | #def url_reformer(platform, url): 42 | # """ 43 | # to reform url according to platform, in the future. 44 | # Say, a url of http://www.toutiao.com/group/1234567890123456789 45 | # as a string is different from http://www.365yg.com/u/1234567890123456789, 46 | # but they point to the same resource. They should be reformed 47 | # to one unique url before pushing into redis for futher crawling. 48 | # """ 49 | # reformed_url = url 50 | # return reformed_url 51 | # 52 | #def feed_url_into_redis(dict_Lst, platform, 53 | # release_time_lower_bdr=None, 54 | # batch_str=None): 55 | # """ 56 | # release_time_lower_bdr must be an int value represent 57 | # timestamp in milliseconds if given. 58 | # All url that is released before release_time_lower_bdr 59 | # will not be pushed into redis. If argument release_time_lower_bdr 60 | # is not given when call this function, all urls will be 61 | # pushed into redis. 62 | # """ 63 | # redis_list_name = get_redis_list_name(platform, batch_str) 64 | # if redis_list_name is None: 65 | # print('Failed to get correct redis list name ' 66 | # 'in platform_redis_register for platform: ' 67 | # % platform) 68 | # return (None, None) 69 | # else: 70 | # print('Feeding url into redis list %s ...' % redis_list_name) 71 | # url_counter = 0 72 | # for data_dict in dict_Lst: 73 | # try: 74 | # url = data_dict['url'] 75 | # url_reformed = url_reformer(platform, url) 76 | # if release_time_lower_bdr is None: 77 | # sadd_c = rds.sadd(redis_list_name, url_reformed) 78 | # url_counter += sadd_c 79 | # else: 80 | # url_release_time = data_dict['release_time'] 81 | # if url_release_time >= release_time_lower_bdr: 82 | # sadd_c = rds.sadd(redis_list_name, url_reformed) 83 | # url_counter += sadd_c 84 | # except: 85 | # print('Failed to push url into redis, ' 86 | # 'might because of lack of url field ' 87 | # 'or lack of release_time field, or ' 88 | # 'has wrong typed release_time value. 
' 89 | # 'The failed data dict is: \n %s' % data_dict) 90 | # print('Pushed %d urls into redis' % url_counter) 91 | # return (redis_list_name, url_counter) 92 | 93 | -------------------------------------------------------------------------------- /crawler_sys/utils/date_calculator.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Nov 21 09:08:29 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | import datetime 9 | 10 | def calculator(shifting_days=30, 11 | shifting_hours=0, 12 | shifting_minutes=0): 13 | now = datetime.datetime.now() 14 | if shifting_hours == 0 and shifting_minutes == 0 and shifting_days != 0: 15 | date_shift = now - datetime.timedelta(days=shifting_days) 16 | date_shift_str = str(date_shift)[:10] 17 | date_wanted = datetime.datetime.strptime(date_shift_str, 18 | "%Y-%m-%d").timestamp() * 1e3 19 | return int(date_wanted) -------------------------------------------------------------------------------- /crawler_sys/utils/delete_repeat_data.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Sep 30 09:36:55 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | 9 | import pymysql 10 | from crawler_sys.utils.write_into_database import write_lst_into_database 11 | 12 | connection = pymysql.connect(host='localhost', 13 | user='root', 14 | passwd='goalkeeper@1', 15 | db='proxy_pool', 16 | port=3306, 17 | cursorclass=pymysql.cursors.DictCursor) 18 | cursor = connection.cursor() 19 | 20 | search_sql = "select * from proxy_pool" 21 | cursor.execute(search_sql) 22 | 23 | test_lst = cursor.fetchall() 24 | 25 | new_lst = [] 26 | ip_lst = [] 27 | for line in test_lst: 28 | if line['ip_address'] not in ip_lst: 29 | new_lst.append(line) 30 | ip_lst.append(line['ip_address']) 31 | 32 | delect_sql = "delete from proxy_pool where id >= 1" 33 | cursor.execute(delect_sql) 34 | connection.commit() 35 | 36 | write_lst_into_database(data_lst=new_lst, 37 | table_name='proxy_pool', 38 | host='localhost', 39 | passwd='goalkeeper@1') -------------------------------------------------------------------------------- /crawler_sys/utils/extract_data_from_es.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Oct 8 17:14:16 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | import elasticsearch 9 | 10 | 11 | hosts = '192.168.17.11' 12 | port = 80 13 | user_id = 'fangyucheng' 14 | password = 'VK0FkWf1fV8f' 15 | http_auth = (user_id, password) 16 | lose_re_url = [] 17 | es = elasticsearch.Elasticsearch(hosts=hosts, port=port, http_auth=http_auth) 18 | 19 | search_body = {"query": { 20 | "bool": { 21 | "filter": [ 22 | {"term": {"platform.keyword": "new_tudou"}}, 23 | {"term": {"post_by.keyword": "zhangqiongzi"}} 24 | ] 25 | } 26 | } 27 | } 28 | 29 | 30 | get_tr = es.search(index='target_releasers', body=search_body, size=200) 31 | 32 | result_lst = [] 33 | 34 | for line in get_tr['hits']['hits']: 35 | result_lst.append(line['_source']) -------------------------------------------------------------------------------- /crawler_sys/utils/fakechrome.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Aug 16 17:12:37 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | import random 9 | 10 | first_num = random.randint(55, 62) 11 | third_num = random.randint(0, 3200) 12 | 
fourth_num = random.randint(0, 140) 13 | 14 | class Fakechrome: 15 | os_type = ['(Windows NT 6.1; WOW64)', '(Windows NT 10.0; WOW64)', '(X11; Linux x86_64)', 16 | '(Macintosh; Intel Mac OS X 10_12_6)'] 17 | 18 | chrome_version = 'Chrome/{}.0.{}.{}'.format(first_num, third_num, fourth_num) 19 | 20 | @classmethod 21 | def get_ua(cls): 22 | return ' '.join(['Mozilla/5.0', random.choice(cls.os_type), 'AppleWebKit/537.36', 23 | '(KHTML, like Gecko)', cls.chrome_version, 'Safari/537.36']) 24 | 25 | headers = {'User-Agent': Fakechrome.get_ua(), 26 | 'Accept-Encoding': 'gzip, deflate, sdch', 27 | 'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3', 28 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 29 | 'Connection': 'keep-alive'} 30 | -------------------------------------------------------------------------------- /crawler_sys/utils/fix_album_play_count_by_releaser_page.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Nov 23 14:44:07 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | 9 | from elasticsearch import Elasticsearch 10 | from crawler.crawler_sys.utils.trans_format import lst_to_csv 11 | 12 | 13 | hosts = '192.168.17.11' 14 | port = 80 15 | user_id = 'fangyucheng' 16 | password = 'VK0FkWf1fV8f' 17 | http_auth = (user_id, password) 18 | es_connection = Elasticsearch(hosts=hosts, port=port, http_auth=http_auth) 19 | 20 | unsolve2_lst = [] 21 | result_lst2 =[] 22 | 23 | for url in unsolve_lst: 24 | search_body = {"query": {"bool": {"filter": [{"term": {"url.keyword": url}}]}}} 25 | search = es_connection.search(index="test2", doc_type="fyc1123", body=search_body) 26 | if search["hits"]["total"] == 0: 27 | unsolve2_lst.append(url) 28 | print("can not get video data at %s" % url) 29 | else: 30 | video_data = search["hits"]["hits"][0]["_source"] 31 | result_lst2.append(video_data) 32 | print("get playcount at %s" % url) 33 | 34 | lst_to_csv(listname=result_lst2, 35 | csvname="F:/add_target_releaser/Nov/get_playcount_by_releaser2.csv") -------------------------------------------------------------------------------- /crawler_sys/utils/get_toutiao_as_cp_signature.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed May 23 12:37:48 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | import requests 9 | # import js2py 10 | import hashlib 11 | 12 | 13 | def as_cp(user_id, max_behot_time): 14 | as_cp = [] 15 | t = js2py.eval_js('var t = Math.floor((new Date).getTime() / 1e3)') 16 | i = js2py.eval_js('var t = Math.floor((new Date).getTime() / 1e3),i = t.toString(16).toUpperCase()') 17 | e = hashlib.md5(str(t).encode('utf-8')).hexdigest() 18 | if len(i) != 8: 19 | var_as = "479BB4B7254C150" 20 | cp = "7E0AC8874BB0985" 21 | else: 22 | e = e.upper() 23 | s = e[0:5] 24 | o = e[-5:] 25 | 26 | n = '' 27 | a = 0 28 | while a < 5: 29 | n = n + s[a] + i[a] 30 | a += 1 31 | 32 | l = '' 33 | r = 0 34 | while r < 5: 35 | l = l + i[r+3] + o[r] 36 | r = r + 1 37 | var_as = 'A1' + n + i[-3:] 38 | cp = i[0:3] + l + 'E1' 39 | 40 | as_cp.append(var_as) 41 | as_cp.append(cp) 42 | return as_cp 43 | 44 | 45 | def signature(user_id, max_behot_time): 46 | jsurl = 'https://s3.pstatp.com/toutiao/resource/ntoutiao_web/page/profile/index_f62209a.js' 47 | get_page = requests.get(jsurl) 48 | get_page.encoding = 'utf-8' 49 | page = get_page.text 50 | effect_js = page.split('Function') 51 | js_1 = ('var navigator = 
{"userAgent":"Mozilla/5.0 (Windows NT 6.1; WOW64) ' 52 | 'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36"};') 53 | js_2 = 'Function' + effect_js[3] 54 | js_3 = 'Function' + effect_js[4] 55 | js_4 = ';function result(){ return TAC.sign('+user_id+''+max_behot_time+');} result();' 56 | js_total = js_1+js_2+js_3+js_4 57 | signature = js2py.eval_js(js_total) 58 | return signature 59 | -------------------------------------------------------------------------------- /crawler_sys/utils/output_log.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Dec 19 12:54:20 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | import logging 9 | formatter = logging.Formatter('%(asctime)s %(name)s %(filename)s ' 10 | '%(funcName)s %(levelname)s %(message)s') 11 | 12 | def init_logger(name, log_file, level=logging.INFO): 13 | """initialize logger""" 14 | #output log to file 15 | handler = logging.FileHandler(log_file) 16 | handler.setFormatter(formatter) 17 | #output log to screen 18 | console = logging.StreamHandler() 19 | console.setFormatter(formatter) 20 | #initialize logger 21 | logger = logging.getLogger(name) 22 | logger.setLevel(level) 23 | #add handler and console to logger 24 | logger.addHandler(handler) 25 | logger.addHandler(console) 26 | return logger -------------------------------------------------------------------------------- /crawler_sys/utils/parse_bool_for_args.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Jun 29 12:46:19 2018 4 | 5 | @author: hanye 6 | """ 7 | 8 | def parse_bool_for_args(arg_str): 9 | if arg_str.lower() in ('true', 'yes', 'y', '1'): 10 | return True 11 | elif arg_str.lower() in ('false', 'no', 'n', '0'): 12 | return False 13 | else: 14 | print('Illegal input! Bool like string values are needed.') 15 | return None 16 | -------------------------------------------------------------------------------- /crawler_sys/utils/releaser_url_check.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Aug 6 09:42:41 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | 9 | from crawler_sys.framework import platform_crawler_register 10 | 11 | 12 | def test_releaserUrl(test_lst): 13 | for line in test_lst: 14 | try: 15 | platform = line['platform'] 16 | platform_crawler = platform_crawler_register.get_crawler(platform) 17 | releaserUrl = line['releaserUrl'] 18 | try: 19 | platform_crawler().releaser_page(releaserUrl=releaserUrl, 20 | releaser_page_num_max=1) 21 | line['True_or_False'] = 1 22 | line['add_mess'] = 'correct' 23 | print('get releaser page') 24 | print(line) 25 | yield line 26 | except: 27 | line['True_or_False'] = 0 28 | line['add_mess'] = 'wrong_url' 29 | print('%s can not get vaild info' % releaserUrl) 30 | except: 31 | pass 32 | -------------------------------------------------------------------------------- /crawler_sys/utils/remove_unrelated_parameter_tudou.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Nov 26 13:14:03 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | import time 9 | import urllib 10 | import requests 11 | 12 | domain = 'http://www.tudou.com/api/getfeeds?' 
13 | #domain_simple = 'http://r.inews.qq.com/searchMore' 14 | #headers = {"Host": "r.inews.qq.com", 15 | # "Accept-Encoding": "gzip,deflate", 16 | # "Referer": "http://inews.qq.com/inews/android/", 17 | # "User-Agent": "%E8%85%BE%E8%AE%AF%E6%96%B0%E9%97%BB5410(android)", 18 | # "Cookie": "lskey=;luin=;skey=;uin=; logintype=0; main_login=qq;", 19 | # "Connection": "Keep-Alive"} 20 | 21 | url_dic = {'apptype': '27', 22 | 'pg': '8', 23 | 'module': '10', 24 | 'feedtype': '2', 25 | 'guid': '15010560241384qU', 26 | 'uid': '-1', 27 | 'secCateId': '10016', 28 | 'max_offset': '-1', 29 | 'min_offset': '-1', 30 | 'utdid': 'T8v9EQPOimUCAXL/Az0YrDOB', 31 | 'page_size': '24'} 32 | 33 | #sig = 'c022edce8ae72f053304412f13a9bb88' 34 | 35 | url = 'http://www.tudou.com/api/getfeeds?%s' % urllib.parse.urlencode(url_dic) 36 | 37 | """ 38 | raw-url 39 | 'http://r.inews.qq.com/searchMore?isoem=0&mid=74b9305504a047ab0a1901e2dfbf71f87f799819&dpi=270&devid=008796749793280&is_chinamobile_oem=0&mac=mac%2520unknown&real_device_width=5.06&store=17&screen_height=1440&real_device_height=9.0&apptype=android&origin_imei=008796749793280&orig_store=17&hw=etease_MuMu&appver=23_android_5.4.10&uid=54767d8bf41ac9a4&screen_width=810&sceneid=&omgid=818b2ebf4abcec4bc1c8bf737a1c131dede60010213210&timeline=1540802766&query=espn&activefrom=icon&qqnetwork=wifi&rom_type=&secId=2&Cookie=lskey%3D%3Bluin%3D%3Bskey%3D%3Buin%3D%3B+logintype%3D0%3B+main_login%3Dqq%3B&network_type=wifi&id=20181029A18Y3H00&global_info=1%7C0%7C0%7C0%7C1%7C1%7C1%7C1%7C0%7C6%7C1%7C1%7C1%7C1%7C0%7CJ060P000000000%3AB054P000011803%7C1402%7C0%7C0%7C-1%7C-1%7C0%7C0%7C0%7C%7C-1%7C-1%7C0%7C0%7C1%7C1%7C0%7C0%7C-1%7C0%7C2%7C0%7C2%7C0%7C0%7C0%7C0%7C0%7C0%7C0%7C0%7C2%7C0%7C0%7C0%7C0&imsi_history=0%2C460013199570862&omgbizid=a520b26ce7880445ab488481e3dd4949c74f0050213210&qn-rid=40ff49fc-e6ee-4384-8a7c-9ee507d57e47&qn-sig=c022edce8ae72f053304412f13a9bb88&page=2&type=0&imsi=460013199570862' 40 | """ 41 | 42 | get_page = requests.get(url) 43 | page = get_page.text 44 | 45 | key_lst = [] 46 | para_lst = [] 47 | 48 | for key, value in url_dic.items(): 49 | key_lst.append(key) 50 | 51 | for key in key_lst: 52 | value = url_dic[key] 53 | url_dic.pop(key) 54 | url = 'http://www.tudou.com/api/getfeeds?%s' % urllib.parse.urlencode(url_dic) 55 | get_page = requests.get(url) 56 | page = get_page.text 57 | time.sleep(5) 58 | if len(page) > 15000: 59 | print("%s can be moved from url, length of page is %s" % (key, len(page))) 60 | continue 61 | else: 62 | url_dic[key] = value 63 | print("key %s, value %s can't be moved from url" % (key, value)) 64 | 65 | #this the result 66 | final_url_dic = {'devid': '008796749793280', 67 | 'appver': '23_android_5.4.10', 68 | 'query': 'espn', 69 | 'qn-rid': '40ff49fc-e6ee-4384-8a7c-9ee507d57e47', 70 | 'qn-sig': 'c022edce8ae72f053304412f13a9bb88'} 71 | final_url = 'http://r.inews.qq.com/searchMore?%s' % urllib.parse.urlencode(final_url_dic) 72 | 73 | """ 74 | final_url 75 | 'http://r.inews.qq.com/searchMore?devid=008796749793280&appver=23_android_5.4.10&query=espn&qn-rid=40ff49fc-e6ee-4384-8a7c-9ee507d57e47&qn-sig=c022edce8ae72f053304412f13a9bb88' 76 | """ 77 | 78 | -------------------------------------------------------------------------------- /crawler_sys/utils/trans_duration_str_to_second.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Sep 10 15:04:04 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | def trans_duration(duration_str): 9 | """suitable for 20:20, 
20:20:10""" 10 | if type(duration_str) == int: 11 | return duration_str 12 | duration_lst = duration_str.split(':') 13 | if len(duration_lst) == 3: 14 | duration = int(int(duration_lst[0]) * 3600 + int(duration_lst[1]) * 60 + int(duration_lst[2])) 15 | return duration 16 | elif len(duration_lst) == 2: 17 | duration = int(int(duration_lst[0]) * 60 + int(duration_lst[1])) 18 | return duration 19 | else: 20 | return duration_lst[0] 21 | -------------------------------------------------------------------------------- /crawler_sys/utils/trans_format.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon May 14 13:35:26 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | 9 | import json 10 | import pandas as pd 11 | 12 | 13 | def dic_file_to_lst(filename): 14 | openfile = open(filename) 15 | task = [] 16 | for line in openfile: 17 | line_dic = json.loads(line) 18 | task.append(line_dic) 19 | return task 20 | 21 | 22 | def str_file_to_lst(filename): 23 | openfile = open(filename, 'r', encoding='utf-8') 24 | task = [] 25 | for line in openfile: 26 | line_str = line.replace('\n','') 27 | task.append(line_str) 28 | return task 29 | 30 | 31 | def csv_to_lst(csvname): 32 | openfile = open(csvname, 'r') 33 | task = [] 34 | for line in openfile: 35 | task.append(line) 36 | task_lst = [] 37 | for line in task: 38 | line_lst = line.split(',') 39 | task_lst.append(line_lst) 40 | return task_lst 41 | 42 | 43 | def csv_to_lst_with_headline(csvname): 44 | result_lst = [] 45 | openfile = open(csvname, 'r', encoding='gb18030') 46 | head = openfile.readline() 47 | head = head.replace('\n', '') 48 | head_lst = head.strip().split(',') 49 | for line in openfile: 50 | line = line.replace('\n', '') 51 | line_lst = line.strip().split(',') 52 | test_dict = dict(zip(head_lst,line_lst)) 53 | result_lst.append(test_dict) 54 | return result_lst 55 | 56 | 57 | def dic_lst_to_file(listname, filename): 58 | file = open(filename, 'a') 59 | for line in listname: 60 | json_line = json.dumps(line) 61 | file.write(json_line) 62 | file.write('\n') 63 | file.flush() 64 | file.close() 65 | 66 | 67 | def str_lst_to_file(listname, filename): 68 | file = open(filename, 'a', encoding='gb18030') 69 | for line in listname: 70 | file.write(line) 71 | file.write('\n') 72 | file.flush() 73 | file.close() 74 | 75 | 76 | def lst_to_csv_practical(lst_name, csv_name): 77 | dataframe=pd.DataFrame(lst_name) 78 | if len(str(lst_name[0]['release_time'])) == 10: 79 | dataframe['midstep'] = dataframe['release_time']+8*3600 80 | dataframe['realtime'] = pd.to_datetime(dataframe['midstep'],unit='s') 81 | else: 82 | dataframe['midstep'] = dataframe['release_time']+8*3600*1e3 83 | dataframe['realtime'] = pd.to_datetime(dataframe['midstep'],unit='ms') 84 | del dataframe['midstep'] 85 | del dataframe['release_time'] 86 | del dataframe['fetch_time'] 87 | dataframe.to_csv(csv_name, encoding='gb18030', index=False) 88 | 89 | 90 | def lst_to_csv(listname, csvname): 91 | dataframe = pd.DataFrame(listname) 92 | dataframe.to_csv(csvname, encoding='gb18030', index=False) 93 | 94 | 95 | def csv_to_file(filename, csvname): 96 | file = open(filename) 97 | task = [] 98 | for line in task: 99 | json_line = json.dumps(line) 100 | file.write(json_line) 101 | file.write('\n') 102 | file.flush() 103 | dataframe = pd.DataFrame(task) 104 | dataframe.to_csv(csvname, encoding='gb18030', index=False) 105 | file.close() 106 | 
-------------------------------------------------------------------------------- /crawler_sys/utils/trans_str_play_count_to_int.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Sep 10 14:41:54 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | def trans_play_count(play_count_str): 9 | """suitable for the format 22万, 22万次播放, 22.2万, 2,222万, 2,222.2万, 2,222, 222""" 10 | if isinstance(play_count_str,int): 11 | return play_count_str 12 | 13 | play_count_str = play_count_str.replace('次播放', '') 14 | play_count_str = play_count_str.replace('播放', '') 15 | try: 16 | if '万' in play_count_str: 17 | play_count_str = play_count_str.split('万')[0] 18 | if ',' in play_count_str: 19 | play_count_str = play_count_str.replace(',', '') 20 | play_count = int(float(play_count_str) * 1e4) 21 | return play_count 22 | elif "w" in play_count_str: 23 | play_count_str = play_count_str.split('w')[0] 24 | if ',' in play_count_str: 25 | play_count_str = play_count_str.replace(',', '') 26 | play_count = int(float(play_count_str) * 1e4) 27 | return play_count 28 | else: 29 | try: 30 | play_count = int(play_count_str) 31 | except: 32 | play_count = int(play_count_str.replace(',', '')) 33 | return play_count 34 | except: 35 | return None -------------------------------------------------------------------------------- /crawler_sys/utils/util_logging.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed Mar 20 10:07:00 2019 5 | 6 | @author: hanye 7 | """ 8 | import datetime 9 | import logging 10 | import logging.handlers 11 | from functools import wraps 12 | 13 | 14 | def logged(func): 15 | """ 16 | Decorator to log crawler task. 
17 | """ 18 | @wraps(func) 19 | def with_logging(*args, **kwargs): 20 | today_str = datetime.datetime.now().isoformat()[:10] 21 | log_folder_name = 'crawler_log' 22 | crawler_pth = '/home/hanye/crawlersNew/crawler' 23 | LOG_FN = ('crawler_task_%s_%s_log' % (func.__name__, today_str)) 24 | log_fn_abs_path = '/'.join([crawler_pth, log_folder_name, LOG_FN]) 25 | FORMAT = '[%(asctime)s][runningFunction:%(name)s][logModule:%(module)s][pid:%(process)d] %(message)s' 26 | hy_logger = logging.getLogger(func.__name__) 27 | hy_logger.setLevel(logging.INFO) 28 | formatter = logging.Formatter(fmt=FORMAT) 29 | file_handler = logging.FileHandler(filename=log_fn_abs_path) 30 | file_handler.setFormatter(formatter) 31 | file_handler.setLevel(logging.INFO) 32 | hy_logger.addHandler(file_handler) 33 | 34 | hy_logger.info('task starts') 35 | argstr = '' 36 | if args: 37 | argstr += args.__str__() 38 | if kwargs: 39 | argstr += kwargs.__str__() 40 | if argstr: 41 | hy_logger.info('args:%s' % argstr) 42 | 43 | return func(*args, **kwargs) 44 | return with_logging 45 | 46 | 47 | -------------------------------------------------------------------------------- /crawler_sys/utils/write_into_database.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Sep 30 10:45:11 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | 9 | 10 | import pymysql 11 | from crawler_sys.proxy_pool import make_up_sql 12 | 13 | 14 | def write_dic_into_database(data_dic, 15 | host='192.168.18.11', 16 | user='root', 17 | passwd='csm@1234', 18 | database_name='proxy_pool', 19 | table_name='proxy_pool'): 20 | """ 21 | write dict into mysql database 22 | """ 23 | 24 | connection = pymysql.connect(host=host, user=user, passwd=passwd, 25 | db=database_name, port=3306, 26 | cursorclass=pymysql.cursors.DictCursor) 27 | cursor = connection.cursor() 28 | 29 | write_into_sql = make_up_sql.make_up_replace_sql(table_name=table_name, input_dic=data_dic) 30 | cursor.execute(write_into_sql) 31 | connection.commit() 32 | print('write %s into database' % data_dic['ip_address']) 33 | 34 | 35 | def write_lst_into_database(data_lst, 36 | host='192.168.18.11', 37 | user='root', 38 | passwd='csm@1234', 39 | database_name='proxy_pool', 40 | table_name='proxy_pool'): 41 | """ 42 | write list into mysql database 43 | """ 44 | connection = pymysql.connect(host=host, user=user, passwd=passwd, 45 | db=database_name, port=3306, 46 | cursorclass=pymysql.cursors.DictCursor) 47 | cursor = connection.cursor() 48 | length = len(data_lst) 49 | 50 | for line in data_lst: 51 | write_into_sql = make_up_sql.make_up_replace_sql(table_name=table_name, input_dic=line) 52 | cursor.execute(write_into_sql) 53 | 54 | connection.commit() 55 | print('write %s pieces of data into database' % length) -------------------------------------------------------------------------------- /crawler_sys/utils/write_into_file.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Oct 11 09:04:54 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | 9 | import datetime 10 | import json 11 | 12 | current_date = datetime.datetime.now().isoformat()[:10] 13 | 14 | def write_str_into_file(file_path, 15 | file_name, 16 | var): 17 | with open(file_path+file_name+current_date, 'a', encoding='utf-8') as file: 18 | file.write(var) 19 | file.write('\n') 20 | 21 | def write_dic_into_file(file_path, 22 | file_name, 23 | var): 24 | with 
open(file_path+file_name+current_date, 'a', encoding='utf-8') as file: 25 | var_json = json.dumps(var) 26 | file.write(var_json) 27 | file.write('\n') 28 | -------------------------------------------------------------------------------- /crawler_sys/utils/write_lsturl_info_conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Oct 10 17:06:49 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | import os 9 | import configparser 10 | 11 | conf_file_path = os.getcwd() 12 | tencent_dic = {"platform": "腾讯视频", 13 | "channel": {"音乐": "http://v.qq.com/x/list/music", 14 | "新闻": "http://v.qq.com/x/list/news", 15 | "军事": "http://v.qq.com/x/list/military", 16 | "娱乐": "http://v.qq.com/x/list/ent", 17 | "体育": "http://v.qq.com/x/list/sports", 18 | "游戏": "http://v.qq.com/x/list/games", 19 | "搞笑": "http://v.qq.com/x/list/fun", 20 | "时尚": "http://v.qq.com/x/list/fashion", 21 | "生活": "http://v.qq.com/x/list/life", 22 | "母婴": "http://v.qq.com/x/list/baby", 23 | "汽车": "http://v.qq.com/x/list/auto", 24 | "科技": "http://v.qq.com/x/list/tech", 25 | "教育": "http://v.qq.com/x/list/education", 26 | "财经": "http://v.qq.com/x/list/finance", 27 | "房产": "http://v.qq.com/x/list/house", 28 | "旅游": "http://v.qq.com/x/list/travel", 29 | "王者荣耀": "http://v.qq.com/x/list/kings"}} 30 | 31 | target_lst_page_lst = [tencent_dic,] 32 | 33 | #initialize conf file 34 | config = configparser.ConfigParser() 35 | for platform_dic in target_lst_page_lst: 36 | config[platform_dic['platform']] = platform_dic['channel'] 37 | with open(conf_file_path + '/lst_page_conf.ini', 38 | 'w', encoding='utf-8') as configfile: 39 | config.write(configfile) 40 | -------------------------------------------------------------------------------- /crawler_sys/utils/write_releasers_into_ini_from_csv.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Oct 23 10:03:53 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | import configparser 9 | from crawler.crawler_sys.utils.trans_format import csv_to_lst_with_headline 10 | 11 | task_list = csv_to_lst_with_headline('F:/add_target_releaser/last_month/zhangminghui2.csv') 12 | 13 | releaser_dic = {} 14 | for line in task_list: 15 | releaser_dic[line['releaser']] = line['releaserUrl'] 16 | 17 | 18 | config = configparser.ConfigParser() 19 | config['haokan'] = releaser_dic 20 | 21 | with open ('key_customer.ini', 'w', encoding='utf-8') as ini: 22 | config.write(ini) 23 | 24 | 25 | 26 | #special task 27 | #for line in source_lst: 28 | # detail_lst = line['detail'] 29 | # csm_mdu = detail_lst[0]['csm_mdu'] 30 | # for detail_dic in detail_lst: 31 | # detail_dic.pop('csm_mdu') 32 | # line['csm_mdu'] = csm_mdu -------------------------------------------------------------------------------- /crawler_sys/utils/write_search_keyword_info_conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Oct 10 17:06:49 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | import os 9 | import configparser 10 | 11 | conf_file_path = os.getcwd() 12 | #initialize conf file 13 | config = configparser.ConfigParser() 14 | config['腾讯新闻'] = {'keyword': '看看新闻,看看新闻Knews,Knews'} 15 | with open('D:/python_code/crawler/crawler_sys/framework/config/search_keywords.ini', 16 | 'w', encoding='utf-8') as configfile: 17 | config.write(configfile) 18 | print(os.getcwd()) 19 | 20 | config = 
configparser.ConfigParser() 21 | config.read('D:/python_code/crawler/crawler_sys/framework/config/search_keywords.ini') -------------------------------------------------------------------------------- /dev/CCTV_live/bilibili.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # @Time : 2020/5/28 16:18 3 | # @Author : litao 4 | # -*- coding:utf-8 -*- 5 | # @Time : 2020/5/28 15:19 6 | # @Author : litao 7 | from selenium import webdriver 8 | import redis, datetime,time,json 9 | from crawler.crawler_sys.utils.trans_str_play_count_to_int import trans_play_count 10 | rds = redis.StrictRedis(host='192.168.17.60', port=6379, db=15, decode_responses=True) 11 | 12 | class CrawlerMain(object): 13 | def __init__(self): 14 | self.chrome_options = webdriver.ChromeOptions() 15 | # self.chrome_options.add_argument('--disable-gpu') 16 | self.chrome_options.add_argument("--start-maximized") 17 | self.chrome_options.add_argument("--no-sandbox") 18 | self.chrome_options.add_argument('disable-infobars') 19 | # 请根据chrome版本 下载chrome driver http://npm.taobao.org/mirrors/chromedriver/ 20 | self.driver = webdriver.Chrome(options=self.chrome_options) 21 | 22 | def login(self): 23 | self.driver.get("https://live.bilibili.com/21686237?from=search&seid=1739181021049557638") 24 | while True: 25 | now = datetime.datetime.now() 26 | res = self.driver.find_elements_by_xpath("//span[@class='action-text v-middle live-skin-normal-text dp-i-block']") 27 | if res: 28 | play_count = trans_play_count(res[0].text) 29 | print(play_count) 30 | dic = { 31 | "menber":play_count, 32 | "fetch_time":int(now.timestamp()*1e3) 33 | } 34 | rds.rpush("bilibili", json.dumps(dic)) 35 | time.sleep(200) 36 | 37 | def __exit__(self): 38 | self.driver.close() 39 | 40 | if __name__ == "__main__": 41 | test = CrawlerMain() 42 | test.login() 43 | -------------------------------------------------------------------------------- /dev/CCTV_live/toutiao.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # @Time : 2020/5/28 15:19 3 | # @Author : litao 4 | from selenium import webdriver 5 | import redis, datetime,time,json 6 | from crawler.crawler_sys.utils.trans_str_play_count_to_int import trans_play_count 7 | rds = redis.StrictRedis(host='192.168.17.60', port=6379, db=15, decode_responses=True) 8 | 9 | class CrawlerMain(object): 10 | def __init__(self): 11 | self.chrome_options = webdriver.ChromeOptions() 12 | # self.chrome_options.add_argument('--disable-gpu') 13 | self.chrome_options.add_argument("--start-maximized") 14 | self.chrome_options.add_argument("--no-sandbox") 15 | self.chrome_options.add_argument('disable-infobars') 16 | # 请根据chrome版本 下载chrome driver http://npm.taobao.org/mirrors/chromedriver/ 17 | self.driver = webdriver.Chrome(options=self.chrome_options) 18 | 19 | def login(self): 20 | self.driver.get("https://live.ixigua.com/room/6831736034540456716/") 21 | while True: 22 | now = datetime.datetime.now() 23 | res = self.driver.find_elements_by_xpath("//span[@class='action-text v-middle live-skin-normal-text dp-i-block']") 24 | if res: 25 | play_count = trans_play_count(res[0].text) 26 | print(play_count) 27 | dic = { 28 | "menber":play_count, 29 | "fetch_time":int(now.timestamp()*1e3) 30 | } 31 | rds.rpush("toutiao", json.dumps(dic)) 32 | time.sleep(200) 33 | 34 | def __exit__(self): 35 | self.driver.close() 36 | 37 | if __name__ == "__main__": 38 | test = CrawlerMain() 39 | test.login() 40 | 
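The CCTV_live watcher scripts in this folder poll a live page with Selenium and push JSON snapshots of the form {"menber": <viewer count>, "fetch_time": <epoch ms>} onto Redis lists in db 15. Below is a minimal reader sketch, not part of the original repo: it assumes the same Redis host/db and the list keys these scripts use ("bilibili", "toutiao"), and it keeps the field name "menber" exactly as the producers spell it.

# -*- coding:utf-8 -*-
# Reader sketch for the CCTV_live snapshots (assumption: same Redis host/db
# and key names as the watcher scripts; "menber" is kept as the producers write it).
import datetime
import json

import redis

rds = redis.StrictRedis(host='192.168.17.60', port=6379, db=15, decode_responses=True)


def dump_snapshots(key):
    """Print every snapshot stored under one Redis list key."""
    for raw in rds.lrange(key, 0, -1):
        snap = json.loads(raw)
        fetch_time = datetime.datetime.fromtimestamp(snap["fetch_time"] / 1e3)
        print(key, fetch_time.isoformat(), snap["menber"])


if __name__ == "__main__":
    for key in ("bilibili", "toutiao"):
        dump_snapshots(key)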
-------------------------------------------------------------------------------- /dev/CCTV_live/yangshipin.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # @Time : 2020/5/28 16:21 3 | # @Author : litao 4 | # -*- coding:utf-8 -*- 5 | # @Time : 2020/5/28 15:19 6 | # @Author : litao 7 | from selenium import webdriver 8 | import redis, datetime,time,json 9 | from crawler.crawler_sys.utils.trans_str_play_count_to_int import trans_play_count 10 | rds = redis.StrictRedis(host='192.168.17.60', port=6379, db=15, decode_responses=True) 11 | 12 | class CrawlerMain(object): 13 | def __init__(self): 14 | self.chrome_options = webdriver.ChromeOptions() 15 | # self.chrome_options.add_argument('--disable-gpu') 16 | self.chrome_options.add_argument("--start-maximized") 17 | self.chrome_options.add_argument("--no-sandbox") 18 | self.chrome_options.add_argument('disable-infobars') 19 | # 请根据chrome版本 下载chrome driver http://npm.taobao.org/mirrors/chromedriver/ 20 | self.driver = webdriver.Chrome(options=self.chrome_options) 21 | 22 | def login(self): 23 | self.driver.get("https://m.yangshipin.cn/video?type=2&vid=2004011401&pid=600036243&ptag=4_1.4.2.20898_wxf") 24 | while True: 25 | now = datetime.datetime.now() 26 | res = self.driver.find_elements_by_xpath("//span[@class='p-video-intro-person']") 27 | if res: 28 | play_count = trans_play_count(res[0].text) 29 | print(play_count) 30 | dic = { 31 | "menber":play_count, 32 | "fetch_time":int(now.timestamp()*1e3) 33 | } 34 | rds.rpush("toutiao", json.dumps(dic)) 35 | time.sleep(200) 36 | 37 | def __exit__(self): 38 | self.driver.close() 39 | 40 | if __name__ == "__main__": 41 | test = CrawlerMain() 42 | test.login() 43 | -------------------------------------------------------------------------------- /dev/iqiyi_debug/test_aiohttp.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Dec 7 11:02:17 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | import aiohttp 9 | import asyncio 10 | 11 | task_list = ['http://list.iqiyi.com/www/10/1007-------------4-8-2--1-.html', 12 | 'http://list.iqiyi.com/www/24/-------------4-27-2-iqiyi--.html', 13 | 'http://list.iqiyi.com/www/28/-------------4-12-2-iqiyi-1-.html', 14 | 'http://list.iqiyi.com/www/17/-------------4-11-2-iqiyi--.html',] 15 | 16 | 17 | 18 | 19 | async def download_page(session, url): 20 | get_page = await session.get(url) 21 | page = await get_page.text("utf-8", errors="ignore") 22 | return page 23 | 24 | async def get_list_page(loop): 25 | async with aiohttp.ClientSession() as list_page_sess: 26 | task = [loop.create_task(download_page(list_page_sess, url)) for url in task_list] 27 | done, pending = await asyncio.wait(task) 28 | result_lst = [d.result() for d in done] 29 | print(result_lst) 30 | 31 | loop = asyncio.get_event_loop() 32 | loop.run_until_complete(get_list_page(loop)) -------------------------------------------------------------------------------- /dev/iqiyi_debug/test_iqiyi_list_url.py: -------------------------------------------------------------------------------- 1 | task_list = ['http://list.iqiyi.com/www/10/1007-------------4-8-2--1-.html', 2 | 'http://list.iqiyi.com/www/24/-------------4-27-2-iqiyi--.html', 3 | 'http://list.iqiyi.com/www/28/-------------4-12-2-iqiyi-1-.html', 4 | 'http://list.iqiyi.com/www/17/-------------4-11-2-iqiyi--.html', 5 | 'http://list.iqiyi.com/www/10/1007-------------4-1-2--1-.html', 6 | 
'http://list.iqiyi.com/www/28/-------------4-13-2-iqiyi-1-.html', 7 | 'http://list.iqiyi.com/www/24/-------------4-23-2-iqiyi--.html', 8 | 'http://list.iqiyi.com/www/10/1006-------------4-27-2--1-.html', 9 | 'http://list.iqiyi.com/www/10/1007-------------4-6-2--1-.html', 10 | 'http://list.iqiyi.com/www/10/1006-------------4-4-2--1-.html', 11 | 'http://list.iqiyi.com/www/28/-------------4-14-2-iqiyi-1-.html', 12 | 'http://list.iqiyi.com/www/24/-------------4-5-2-iqiyi--.html', 13 | 'http://list.iqiyi.com/www/8/-------------4-13-2-iqiyi--.html', 14 | 'http://list.iqiyi.com/www/10/1007-------------4-30-2--1-.html', 15 | 'http://list.iqiyi.com/www/8/-------------4-8-2-iqiyi--.html', 16 | 'http://list.iqiyi.com/www/8/-------------4-16-2-iqiyi--.html', 17 | 'http://list.iqiyi.com/www/28/-------------4-9-2-iqiyi-1-.html', 18 | 'http://list.iqiyi.com/www/7/-------------4-23-2-iqiyi-1-.html', 19 | 'http://list.iqiyi.com/www/5/-------------4-10-2-iqiyi--.html', 20 | 'http://list.iqiyi.com/www/17/-------------4-4-2-iqiyi--.html', 21 | 'http://list.iqiyi.com/www/8/-------------4-4-2-iqiyi--.html', 22 | 'http://list.iqiyi.com/www/25/-------------4-3-2-iqiyi-1-.html', 23 | 'http://list.iqiyi.com/www/5/-------------4-18-2-iqiyi--.html', 24 | 'http://list.iqiyi.com/www/3/-------------4-24-2-iqiyi--.html', 25 | 'http://list.iqiyi.com/www/3/-------------4-19-2-iqiyi--.html', 26 | 'http://list.iqiyi.com/www/8/-------------4-21-2-iqiyi--.html', 27 | 'http://list.iqiyi.com/www/5/-------------4-9-2-iqiyi--.html', 28 | 'http://list.iqiyi.com/www/10/1006-------------4-28-2--1-.html', 29 | 'http://list.iqiyi.com/www/5/-------------4-27-2-iqiyi--.html', 30 | 'http://list.iqiyi.com/www/7/-------------4-26-2-iqiyi-1-.html'] 31 | 32 | -------------------------------------------------------------------------------- /dev/post_dict.txt: -------------------------------------------------------------------------------- 1 | {"param":" 2 | vPCnptKLNQAoHKbY9AUySUt3US0zNbhxzqZyO4IlZhCUZu0jMdEoQOSfkaDq 3 | bHw6e3k2kHR1VZB/FL+mJqHkZKYM812xpBR5ZUErEl0if5UxXIj51Ena6OAC 4 | LYnl+BHzISJEi0Ne7+2xXr/lUHZM4qDuJz8fd3xYXN2Xtwf32yAprr8c24GO 5 | IrLeM4e3fi9phnrfke34CMw8uYUX9wbujIFFG0Ln3pG+V2iPG3iBf654QVeF 6 | 3obzc2sOSvD1uCuP6J7jVQbLDphXxeQaXSKHqL5kdScU3QYDCnbYTMiJXq+A 7 | EMJ8/V4s+eKwRiWHvmtqcHLiTi81gQyJTQWY4D7v9RMe/RcUJDgvoFMNLBHt 8 | 4GQ+ow8QY5t6JLiVH7LsXIjhjF/gYRwn0unLBz0g4J+mGdAdh4+Go8EaJBKV 9 | 7fkFn/QaIGsyhyqfNn/fJc/btvCqgvdDeLphjU2WeFaA74DgD4ElDfNflh0T 10 | 50y0/nF8Vc5PQB1zA6R6ipuM5XkEuPLLZ4362p6T+2pm1p8XUF3GTGWteW+/ 11 | gwLoPxCOUQ6rZ4m+WIF8MQXJWht3JJRY+J5sy15017kKC5HQDxSemBbPokBD 12 | vX6UGHfqvVgZSRdC83fVp/WtcDIC36y5YPQc01XZ5dfrytfaOkjd/PWCsUnX 13 | pRkytYh+1OwPnD0j3/JsBlomf3V3STlnjdXY4qC/6isIFXya3D1fn/vrtbVF 14 | kXga30Jw4nPEnHuQCdwtwxm09ZOxy0/8RNTwov2oErg19Jcxhn6OUs0Ccm5z 15 | DxVmZtTJpmHTIeLdaYL5r2ONoYciHikj3z6NQ2DrQ3s4AVQTMWayA8V0ENto 16 | aSI="} -------------------------------------------------------------------------------- /tasks/check_high_play_count_data_source_v_qq.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Sep 5 17:52:53 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | from crawler_sys.site_crawler.crawler_v_qq import Crawler_v_qq 9 | from crawler_sys.utils.output_results import output_result 10 | from crawler_sys.utils import Metaorphosis as meta 11 | from crawler_sys.utils.output_log import output_log 12 | 13 | logging = output_log(page_category='video_page', 14 | program_info='tencent') 15 | 16 | def 
tran_input_data_to_lst(file_name, file_category='csv'): 17 | if file_category == 'csv': 18 | video_info_lst = meta.csv_to_lst_whth_headline(file_name) 19 | url_lst = [] 20 | for line in video_info_lst: 21 | try: 22 | if line['data_provider'] == 'CCR': 23 | url_lst.append(line['url']) 24 | except: 25 | pass 26 | return url_lst 27 | elif file_category == 'file': 28 | url_lst = meta.str_file_to_lst(file_name) 29 | return url_lst 30 | 31 | url_lst = tran_input_data_to_lst(file_name='R:/CCR/数据需求/短期临时需求/TX', file_category='file') 32 | 33 | crawler = Crawler_v_qq() 34 | get_video_page = crawler.video_page 35 | 36 | def get_data_source(url_lst=url_lst, 37 | output_to_file=False, 38 | filepath=None, 39 | output_to_es_raw=False, 40 | output_to_es_register=False, 41 | push_to_redis=False, 42 | output_es_index=None, 43 | output_doc_type=None): 44 | result_lst = [] 45 | for url in url_lst: 46 | video_info = get_video_page(url=url) 47 | result_lst.append(video_info) 48 | logging.info('get_data at page %s' % url) 49 | if len(result_lst) >= 100: 50 | if output_es_index is not None and output_doc_type is not None: 51 | output_result(result_lst, 52 | platform='腾讯视频', 53 | output_to_file=output_to_file, 54 | output_to_es_raw=output_to_es_raw, 55 | output_to_es_register=output_to_es_register, 56 | push_to_redis=push_to_redis, 57 | es_index=output_es_index, 58 | doc_type=output_doc_type) 59 | result_lst.clear() 60 | else: 61 | output_result(result_lst, 62 | platform='腾讯视频', 63 | output_to_file=output_to_file, 64 | output_to_es_raw=output_to_es_raw, 65 | output_to_es_register=output_to_es_register, 66 | push_to_redis=push_to_redis) 67 | result_lst.clear() 68 | if len(result_lst) != []: 69 | if output_es_index is not None and output_doc_type is not None: 70 | output_result(result_lst, 71 | platform='腾讯视频', 72 | output_to_file=output_to_file, 73 | output_to_es_raw=output_to_es_raw, 74 | output_to_es_register=output_to_es_register, 75 | push_to_redis=push_to_redis, 76 | es_index=output_es_index, 77 | doc_type=output_doc_type) 78 | result_lst.clear() 79 | else: 80 | output_result(result_lst, 81 | platform='腾讯视频', 82 | output_to_file=output_to_file, 83 | output_to_es_raw=output_to_es_raw, 84 | output_to_es_register=output_to_es_register, 85 | push_to_redis=push_to_redis) 86 | result_lst.clear() 87 | 88 | if __name__ == '__main__': 89 | get_data_source(output_to_es_raw=True, 90 | output_es_index='test2', 91 | output_doc_type='fyc') -------------------------------------------------------------------------------- /tasks/crontab/192.168.18.11/etc/crontab: -------------------------------------------------------------------------------- 1 | SHELL=/bin/bash 2 | PATH=/sbin:/bin:/usr/sbin:/usr/bin 3 | MAILTO=hanye 4 | HOME=/ 5 | 6 | # For details see man 4 crontabs 7 | 8 | # Example of job definition: 9 | # .---------------- minute (0 - 59) 10 | # | .------------- hour (0 - 23) 11 | # | | .---------- day of month (1 - 31) 12 | # | | | .------- month (1 - 12) OR jan,feb,mar,apr ... 
13 | # | | | | .---- day of week (0 - 6) (Sunday=0 or 7) OR sun,mon,tue,wed,thu,fri,sat 14 | # | | | | | 15 | # * * * * * user-name command to be executed 16 | 17 | # 1 update video data in target release index daily 18 | 0 0 * * * hanye python3 /home/hanye/crawlers/crawler_sys/framework/update_data_in_target_releasers_multi_process.py -p toutiao -n 10 -s 10 19 | 0 0 * * * hanye python3 /home/hanye/crawlers/crawler_sys/framework/update_data_in_target_releasers_multi_process.py -p 腾讯视频 -n 30 -s 10 20 | 0 0 * * * hanye python3 /home/hanye/crawlers/crawler_sys/framework/update_data_in_target_releasers_multi_process.py -p iqiyi -n 30 -s 10 21 | 0 0 * * * hanye python3 /home/hanye/crawlers/crawler_sys/framework/update_data_in_target_releasers_multi_process.py -p new_tudou -n 10 -s 10 22 | 23 | # 2 write crawled data into short-video-prodtion index daily 24 | 0 05 * * * hanye python3 /home/hanye/crawlers/tasks/update_DU_ATU_from_crawler_raw.py 25 | 26 | # 3 create redis url batch 27 | 0 18 * * * hanye python3 /home/hanye/crawlers/crawler_sys/scheduler/generate_redis_url_batch.py -p iqiyi -p 腾讯视频 -b 02 -d 30 28 | 0 10 * * * hanye python3 /home/hanye/crawlers/crawler_sys/scheduler/generate_redis_url_batch.py -p iqiyi -p 腾讯视频 -b 02 -d 30 29 | 30 | # 4 scrap redis url list 31 | 0 03 * * * hanye python3 /home/hanye/crawlers/crawler_sys/framework/scrap_redis_urls.py -p 腾讯视频 -b 02 32 | 0 03 * * * hanye python3 /home/hanye/crawlers/crawler_sys/framework/scrap_redis_urls.py -p iqiyi -b 02 33 | 0 03 * * * hanye python3 /home/hanye/crawlers/crawler_sys/framework/scrap_list_pages.py -p youku -n 30 34 | 35 | # 5 scrap list pages 36 | 0 10 * * * hanye python3 /home/hanye/crawlers/crawler_sys/framework/scrap_list_pages_multi_process.py -p iqiyi -n 30 37 | 0 10 * * * hanye python3 /home/hanye/crawlers/crawler_sys/framework/scrap_list_pages_multi_process.py -p youku -n 30 38 | 0 10 * * * hanye python3 /home/hanye/crawlers/crawler_sys/framework/scrap_list_pages_multi_process.py -p 腾讯视频 -n 30 39 | 0 7,10,15 * * * hanye python3 /home/hanye/crawlers/crawler_sys/framework/scrap_list_pages_multi_process.py -p toutiao -n 160 -s 10 40 | 41 | # update haokan target releaser 42 | 0 0,8,12,18 * * * hanye python3 /home/hanye/crawlers/crawler_sys/framework/update_data_in_target_releasers_single_thread.py -p haokan -n 20 43 | 44 | # get tencent news search page 45 | 0 0,4,8,10,14,16,20 * * * hanye python3 /home/hanye/crawlersNew/crawler/crawler_sys/framework/search_page_single_process.py -p 腾讯新闻 46 | 47 | #high frequency releasers ,crawler executes every hour, only for haokan currently 48 | 0 0,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22 * * * hanye python3 /home/hanye/crawlersNew/crawler/crawler_sys/framework/high_fre_releasers.py -p haokan 49 | 50 | #high frequency releasers. 
execute on 1 pm and 6pm 51 | 0 0,13,18 * * * hanye python3 /home/hanye/crawlersNew/crawler/crawler_sys/framework/update_data_in_target_releasers_multi_process.py -p toutiao -fre 3 -n 20 -s 15 52 | 0 0,13,18 * * * hanye python3 /home/hanye/crawlersNew/crawler/crawler_sys/framework/update_data_in_target_releasers_multi_process.py -p 腾讯视频 -fre 3 -n 20 -s 15 53 | 0 0,13,18 * * * hanye python3 /home/hanye/crawlersNew/crawler/crawler_sys/framework/update_data_in_target_releasers_multi_process.py -p new_tudou -fre 3 -n 20 -s 15 54 | -------------------------------------------------------------------------------- /tasks/download_video_page.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Dec 14 15:03:05 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | import time 9 | from multiprocessing import Process 10 | from crawler.crawler_sys.utils import connect_with_redis 11 | from crawler.crawler_sys.framework.platform_crawler_register import get_crawler 12 | 13 | 14 | """ 15 | only for platform v_qq, iqiyi, and new_tudou 16 | """ 17 | 18 | step = 'download_video_page' 19 | data_cate = 'video_url' 20 | 21 | def download_video_page(platform): 22 | crawler_initialization = get_crawler(platform) 23 | crawler = crawler_initialization() 24 | if platform == '腾讯视频': 25 | key = 'v_qq_url_dict' 26 | else: 27 | key= "%s_%s" % (platform, data_cate) 28 | while True: 29 | if connect_with_redis.length_of_set(key) > 0: 30 | # pid_num = connect_with_redis.length_of_set(key=key_set) 31 | # if pid_num < 20: 32 | # process_num = int(20-pid_num) 33 | crawler.download_video_page_async_multi_process() 34 | # else: 35 | # print("%s processes is working on %s" % (pid_num, platform)) 36 | # time.sleep(20) 37 | else: 38 | print("no %s url [dict] in redis" % platform) 39 | time.sleep(300) 40 | 41 | v_qq = Process(target=download_video_page, args=('腾讯视频',)) 42 | iqiyi = Process(target=download_video_page, args=('iqiyi',)) 43 | new_tudou = Process(target=download_video_page, args=('new_tudou',)) 44 | 45 | v_qq.start() 46 | iqiyi.start() 47 | new_tudou.start() -------------------------------------------------------------------------------- /tasks/fix_album_play_count_by_getting_releaser_page.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Nov 23 13:48:33 2018 4 | 5 | input video url, output video play_count gotten from releaser page 6 | 7 | @author: fangyucheng 8 | """ 9 | 10 | from elasticsearch import Elasticsearch 11 | from crawler.crawler_sys.utils.trans_format import lst_to_csv 12 | from crawler.crawler_sys.utils import trans_format 13 | from crawler.crawler_sys.site_crawler import crawler_v_qq 14 | 15 | absolute_file_path = r"C:\Users\zhouyujiang\安徽第一周数据情况.csv" 16 | task_list = trans_format.str_file_to_lst(absolute_file_path) 17 | 18 | result_lst = [] 19 | crawler = crawler_v_qq.Crawler_v_qq() 20 | for url in task_list: 21 | get_data = crawler.video_page(url) 22 | result_lst.append(get_data) 23 | print("get data at %s" % url) 24 | 25 | bug_releaser_list = [] 26 | releaserUrl_lst = [] 27 | revised_lst = [] 28 | for line in result_lst: 29 | try: 30 | if line['releaserUrl'] is not None: 31 | releaserUrl = line['releaserUrl'] 32 | if releaserUrl not in releaserUrl_lst: 33 | releaserUrl_lst.append(releaserUrl) 34 | try: 35 | crawler.releaser_page(releaserUrl, output_to_es_raw=True, 36 | es_index='test2', doc_type='12zjbfl', 37 | 
releaser_page_num_max=1000) 38 | print ("get releaser data at %s" % releaserUrl) 39 | except: 40 | bug_releaser_list.append(releaserUrl) 41 | else: 42 | pass 43 | else: 44 | print("this video %s can't find releaser" % line['url']) 45 | except: 46 | print("can't get releaser at %s" % url) 47 | 48 | hosts = '192.168.17.11' 49 | port = 80 50 | user_id = 'fangyucheng' 51 | password = 'VK0FkWf1fV8f' 52 | http_auth = (user_id, password) 53 | es_connection = Elasticsearch(hosts=hosts, port=port, http_auth=http_auth) 54 | 55 | unsolve_lst = [] 56 | result_lst2 =[] 57 | 58 | for line in task_list: 59 | try: 60 | if type(line) == dict: 61 | url = line['url'] 62 | elif type(line) == str: 63 | url = line 64 | search_body = {"query": {"bool": {"filter": [{"term": {"url.keyword": url}}]}}} 65 | search = es_connection.search(index="test2", doc_type="12zjbfl", body=search_body) 66 | if search["hits"]["total"] == 0: 67 | unsolve_lst.append(url) 68 | print("can not get video data at %s" % url) 69 | else: 70 | video_data = search["hits"]["hits"][0]["_source"] 71 | result_lst2.append(video_data) 72 | print("get playcount at %s" % url) 73 | except: 74 | pass 75 | 76 | lst_to_csv(listname=result_lst2, 77 | csvname=r"C:\Users\zhouyujiang\12121212121.csv") -------------------------------------------------------------------------------- /tasks/follower_num.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/tasks/follower_num.csv -------------------------------------------------------------------------------- /tasks/from_es_find_crawler_task.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # @Time : 2019/6/21 14:33 3 | # @Author : litao 4 | -------------------------------------------------------------------------------- /tasks/parse_list_page.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Dec 14 10:32:28 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | import time 9 | from multiprocessing import Process 10 | from crawler.crawler_sys.utils import connect_with_redis 11 | from crawler.crawler_sys.framework.platform_crawler_register import get_crawler 12 | 13 | 14 | """ 15 | only for platform v_qq, iqiyi and youku 16 | """ 17 | 18 | step = 'parse_list_page' 19 | data_cate = 'list_page_html' 20 | 21 | def parse_list_page(platform): 22 | crawler_initialization = get_crawler(platform) 23 | crawler = crawler_initialization() 24 | if platform == '腾讯视频': 25 | platform = 'v_qq' 26 | key_lst = "%s_%s" % (platform, data_cate) 27 | # key_set = "%s_%s" % (platform, step) 28 | while True: 29 | if connect_with_redis.length_of_lst(key=key_lst) > 0: 30 | crawler.parse_list_page_multi_process() 31 | else: 32 | print("no %s list page html in redis" % platform) 33 | time.sleep(300) 34 | 35 | v_qq = Process(target=parse_list_page, args=('腾讯视频',)) 36 | iqiyi = Process(target=parse_list_page, args=('iqiyi',)) 37 | youku = Process(target=parse_list_page, args=('youku',)) 38 | 39 | v_qq.start() 40 | iqiyi.start() 41 | youku.start() 42 | -------------------------------------------------------------------------------- /tasks/parse_video_page.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Dec 14 15:23:29 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | import time 9 | 
from multiprocessing import Process 10 | from crawler.crawler_sys.utils import connect_with_redis 11 | from crawler.crawler_sys.framework.platform_crawler_register import get_crawler 12 | 13 | """ 14 | for platform v_qq, iqiyi, and new_tudou 15 | """ 16 | 17 | kwarg_dict = {'output_to_es_raw': True, 18 | 'es_index': 'crawler-data-raw', 19 | 'doc_type': 'doc', 20 | 'output_to_es_register': True} 21 | 22 | step = 'parse_video_page' 23 | data_cate = 'video_page_html' 24 | 25 | def parse_video_page(platform, para_dic): 26 | crawler_initialization = get_crawler(platform) 27 | crawler = crawler_initialization() 28 | if platform == '腾讯视频': 29 | platform = 'v_qq' 30 | key= "%s_%s" % (platform, data_cate) 31 | while True: 32 | if connect_with_redis.length_of_lst(key) > 0: 33 | crawler.parse_video_page_multi_process(para_dic) 34 | else: 35 | print("no %s video page html in redis" % platform) 36 | time.sleep(300) 37 | 38 | v_qq = Process(target=parse_video_page, args=('腾讯视频', kwarg_dict)) 39 | iqiyi = Process(target=parse_video_page, args=('iqiyi', kwarg_dict)) 40 | new_tudou = Process(target=parse_video_page, args=('new_tudou', kwarg_dict)) 41 | 42 | v_qq.start() 43 | iqiyi.start() 44 | new_tudou.start() -------------------------------------------------------------------------------- /tasks/produce_releaser_report_from_es.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # @Time : 2019/4/24 17:51 3 | # @Author : litao 4 | # 提供账号和平台,生成数据报告 5 | # 数据维度:发布量、播放量、粉丝量、评论量、点赞量,视频url、时长、发布时间、发布账号 6 | 7 | # import time 8 | import json 9 | # import argparse 10 | import datetime 11 | from elasticsearch import Elasticsearch 12 | import pandas as pd 13 | from elasticsearch.helpers import scan 14 | from func_find_week_num import find_week_belongs_to 15 | from crawler.crawler_sys.framework.platform_crawler_register import get_crawler 16 | from crawler.crawler_sys.utils import trans_format 17 | from func_cal_doc_id import cal_doc_id 18 | 19 | hosts = '192.168.17.11' 20 | port = 80 21 | user = 'zhouyujiang' 22 | passwd = '8tM9JDN2LVxM' 23 | http_auth = (user, passwd) 24 | 25 | es = Elasticsearch(hosts=hosts, port=port, http_auth=http_auth) 26 | 27 | 28 | def func_search_reUrl_from_target_index(platform, releaser): 29 | search_body = { 30 | "query": { 31 | "bool": { 32 | "filter": [ 33 | {"term": {"platform.keyword": platform}}, 34 | {"term": {"releaser.keyword": releaser}} 35 | ] 36 | } 37 | } 38 | } 39 | search_re = es.search(index='target_releasers', doc_type='doc', body=search_body) 40 | if search_re['hits']['total'] > 0: 41 | return search_re['hits']['hits'][0]['_source']['releaserUrl'] 42 | else: 43 | print('Can not found:', platform, releaser) 44 | return None 45 | 46 | data_dic = {} 47 | miaopai_list = [] 48 | file = r'D:\work_file\无锡台内容数据需求.csv' 49 | with open(file, 'r')as f: 50 | header_Lst = f.readline().strip().split(',') 51 | for line in f: 52 | line_Lst = line.strip().split(',') 53 | line_dict = dict(zip(header_Lst, line_Lst)) 54 | releaser = line_dict['releaser'] 55 | platform = line_dict['platform'] 56 | # releaserUrl = func_search_reUrl_from_target_index(platform, releaser) 57 | releaserUrl = 1 58 | if releaserUrl != None: 59 | re_list = [] 60 | search_body = { 61 | "query": { 62 | "bool": { 63 | "filter": [ 64 | {"term": {"platform.keyword": platform}}, {"term": {"releaser.keyword": releaser}}, 65 | {"range": {"release_time": {"gte": 1546272000000, "lt": 1554048000000}}}, 66 | {"range": {"fetch_time": {"gte": 1556150400000}}} 67 | ] 68 
| } 69 | } 70 | } 71 | 72 | scan_re = scan(client=es, index='crawler-data-raw', doc_type='doc', 73 | query=search_body, scroll='3m') 74 | for one_scan in scan_re: 75 | "发布者,平台,标题,url,播放量,点赞量,评论量,时长,发布时间" 76 | data_dic[cal_doc_id(platform, url=one_scan["_source"]["url"], doc_id_type='all-time-url')]=[one_scan["_source"]["releaser"],one_scan["_source"]["platform"],one_scan["_source"]["title"],one_scan["_source"]["url"],one_scan["_source"]["play_count"],one_scan["_source"]["favorite_count"],one_scan["_source"]["comment_count"],one_scan["_source"]["duration"],datetime.datetime.fromtimestamp(one_scan["_source"]["release_time"]/1000).strftime('%Y-%m-%d %H:%M:%S')] 77 | data_lis = [] 78 | print(len(data_dic)) 79 | for d in data_dic: 80 | data_lis.append(data_dic[d]) 81 | 82 | data = pd.DataFrame(data_lis) 83 | data.to_csv('./%s.csv' % "无锡台内容数据需求2", encoding="ansi") 84 | 85 | 86 | -------------------------------------------------------------------------------- /tasks/sync_mayun_code_to_100.97.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # @Time : 2019/9/29 11:53 3 | # @Author : litao 4 | import subprocess,time 5 | 6 | f = open("/home/hanye/crawlersNew/crawler/tasks/log", "w", encoding="utf-8") 7 | while True: 8 | try: 9 | cmd0 = r'git pull origin master' 10 | cmd1 = r'git push gitLab master' 11 | cmdpath = r'/home/hanye/crawlersNew/v-scope2' 12 | process = subprocess.Popen(cmd0, shell=True, cwd=cmdpath) 13 | process.wait() 14 | result = process.returncode 15 | print(result,file=f) 16 | process = subprocess.Popen(cmd1, shell=True, cwd=cmdpath) 17 | process.wait() 18 | result = process.returncode 19 | print(result,file=f) 20 | #time.sleep(43200) 21 | time.sleep(70) 22 | except Exception as e: 23 | f.write("eroor",e) 24 | f.flush() 25 | 26 | f.close() 27 | 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /tasks/the_key_of_pid_in_redis.txt: -------------------------------------------------------------------------------- 1 | [parse_list_page] 2 | v_qq = v_qq_parse_list_page 3 | youku = youku_parse_list_page -------------------------------------------------------------------------------- /tasks/yangjingshu.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/tasks/yangjingshu.csv -------------------------------------------------------------------------------- /tasks/zhangminghui_dec.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/tasks/zhangminghui_dec.csv -------------------------------------------------------------------------------- /tasks/zhangminghui_dec2.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litaolemo/crawler/fbfe3c4feca8be61186aec986b600b36f513f7f4/tasks/zhangminghui_dec2.csv -------------------------------------------------------------------------------- /test/qq_news/remove_unrelated_parameter.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Nov 26 13:14:03 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | import time 9 | import urllib 10 | import requests 11 | 12 | domain = 'http://r.inews.qq.com/searchMore?' 
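# What follows strips one query parameter at a time and re-requests the search
# endpoint: a parameter whose removal still returns a full result page
# (len(page) > 5000) is treated as unrelated and stays removed, otherwise it is
# put back. final_url_dic at the bottom of this file is the minimal parameter
# set that survived.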
13 | #domain_simple = 'http://r.inews.qq.com/searchMore' 14 | headers = {"Host": "r.inews.qq.com", 15 | "Accept-Encoding": "gzip,deflate", 16 | "Referer": "http://inews.qq.com/inews/android/", 17 | "User-Agent": "%E8%85%BE%E8%AE%AF%E6%96%B0%E9%97%BB5410(android)", 18 | "Cookie": "lskey=;luin=;skey=;uin=; logintype=0; main_login=qq;", 19 | "Connection": "Keep-Alive"} 20 | 21 | url_dic = {'isoem':'0', 22 | 'mid': '74b9305504a047ab0a1901e2dfbf71f87f799819', 23 | 'dpi': '270', 24 | 'devid': '008796749793280', 25 | 'is_chinamobile_oem': '0', 26 | 'mac': 'mac%20unknown', 27 | 'real_device_width':'5.06', 28 | 'store': '17', 29 | 'screen_height': '1440', 30 | 'real_device_height': '9.0', 31 | 'apptype': 'android', 32 | 'origin_imei': '008796749793280', 33 | 'orig_store': '17', 34 | 'hw': 'etease_MuMu', 35 | 'appver': '23_android_5.4.10', 36 | 'uid': '54767d8bf41ac9a4', 37 | 'screen_width':'810', 38 | 'sceneid':'', 39 | 'omgid': '818b2ebf4abcec4bc1c8bf737a1c131dede60010213210', 40 | 'timeline':'1540802766', 41 | 'query':'espn', 42 | 'activefrom': 'icon', 43 | 'qqnetwork': 'wifi', 44 | 'rom_type': '', 45 | 'secId': '2', 46 | 'Cookie':'lskey=;luin=;skey=;uin=; logintype=0; main_login=qq;', 47 | 'network_type': 'wifi', 48 | 'id': '20181029A18Y3H00', 49 | 'global_info': '1|0|0|0|1|1|1|1|0|6|1|1|1|1|0|J060P000000000:B054P000011803|1402|0|0|-1|-1|0|0|0||-1|-1|0|0|1|1|0|0|-1|0|2|0|2|0|0|0|0|0|0|0|0|2|0|0|0|0', 50 | 'imsi_history':'0,460013199570862', 51 | 'omgbizid': 'a520b26ce7880445ab488481e3dd4949c74f0050213210', 52 | 'qn-rid': '40ff49fc-e6ee-4384-8a7c-9ee507d57e47', 53 | 'qn-sig': 'c022edce8ae72f053304412f13a9bb88', 54 | 'page': "2", 55 | 'type': "0", 56 | 'imsi': '460013199570862'} 57 | 58 | #sig = 'c022edce8ae72f053304412f13a9bb88' 59 | 60 | url = 'http://r.inews.qq.com/searchMore?%s' % urllib.parse.urlencode(url_dic) 61 | 62 | """ 63 | raw-url 64 | 'http://r.inews.qq.com/searchMore?isoem=0&mid=74b9305504a047ab0a1901e2dfbf71f87f799819&dpi=270&devid=008796749793280&is_chinamobile_oem=0&mac=mac%2520unknown&real_device_width=5.06&store=17&screen_height=1440&real_device_height=9.0&apptype=android&origin_imei=008796749793280&orig_store=17&hw=etease_MuMu&appver=23_android_5.4.10&uid=54767d8bf41ac9a4&screen_width=810&sceneid=&omgid=818b2ebf4abcec4bc1c8bf737a1c131dede60010213210&timeline=1540802766&query=espn&activefrom=icon&qqnetwork=wifi&rom_type=&secId=2&Cookie=lskey%3D%3Bluin%3D%3Bskey%3D%3Buin%3D%3B+logintype%3D0%3B+main_login%3Dqq%3B&network_type=wifi&id=20181029A18Y3H00&global_info=1%7C0%7C0%7C0%7C1%7C1%7C1%7C1%7C0%7C6%7C1%7C1%7C1%7C1%7C0%7CJ060P000000000%3AB054P000011803%7C1402%7C0%7C0%7C-1%7C-1%7C0%7C0%7C0%7C%7C-1%7C-1%7C0%7C0%7C1%7C1%7C0%7C0%7C-1%7C0%7C2%7C0%7C2%7C0%7C0%7C0%7C0%7C0%7C0%7C0%7C0%7C2%7C0%7C0%7C0%7C0&imsi_history=0%2C460013199570862&omgbizid=a520b26ce7880445ab488481e3dd4949c74f0050213210&qn-rid=40ff49fc-e6ee-4384-8a7c-9ee507d57e47&qn-sig=c022edce8ae72f053304412f13a9bb88&page=2&type=0&imsi=460013199570862' 65 | """ 66 | 67 | get_page = requests.get(url, headers=headers) 68 | page = get_page.text 69 | 70 | key_lst = [] 71 | para_lst = [] 72 | 73 | for key, value in url_dic.items(): 74 | key_lst.append(key) 75 | 76 | for key in key_lst: 77 | value = url_dic[key] 78 | url_dic.pop(key) 79 | url = 'http://r.inews.qq.com/searchMore?%s' % urllib.parse.urlencode(url_dic) 80 | get_page = requests.get(url, headers=headers) 81 | page = get_page.text 82 | time.sleep(10) 83 | if len(page) > 5000: 84 | print("%s can be moved from url, length of page is %s" % (key, len(page))) 85 | continue 86 | 
else: 87 | url_dic[key] = value 88 | print("key %s, value %s can't be moved from url" % (key, value)) 89 | 90 | #this is the result 91 | final_url_dic = {'devid': '008796749793280', 92 | 'appver': '23_android_5.4.10', 93 | 'query': 'espn', 94 | 'qn-rid': '3e3cb605-3a00-412d-8b2e-f81b32f5064c', 95 | 'qn-sig': '589a406e354aa3bfb6ddeaa778278ef1'} 96 | final_url = 'http://r.inews.qq.com/searchMore?%s' % urllib.parse.urlencode(final_url_dic) 97 | 98 | """ 99 | final_url 100 | 'http://r.inews.qq.com/searchMore?devid=008796749793280&appver=23_android_5.4.10&query=espn&qn-rid=40ff49fc-e6ee-4384-8a7c-9ee507d57e47&qn-sig=c022edce8ae72f053304412f13a9bb88' 101 | """ 102 | 103 | qn_sig = "01552dc75351b12d01b5310441a562e2" 104 | qn_rid = "8f14a809-db96-4762-905a-7c063dc84ac7" 105 | -------------------------------------------------------------------------------- /test/test_read_config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Dec 4 16:23:48 2018 4 | 5 | @author: fangyucheng 6 | """ 7 | 8 | import configparser 9 | 10 | config = configparser.ConfigParser() 11 | config.read('/home/hanye/crawlerNEW/crawler/crawler_sys/framework/config/search_keywords.ini') 12 | --------------------------------------------------------------------------------
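test_read_config.py above stops right after config.read(). A short sketch, not part of the repo, of how the loaded keyword config could be walked, assuming the layout written by write_search_keyword_info_conf.py (one section per platform with a comma-separated 'keyword' entry) and the same hard-coded path the test uses:

# -*- coding: utf-8 -*-
# Sketch only: iterates the sections of search_keywords.ini. Path and layout
# are assumptions taken from the scripts in this repo, not verified here.
import configparser

config = configparser.ConfigParser()
config.read('/home/hanye/crawlerNEW/crawler/crawler_sys/framework/config/search_keywords.ini',
            encoding='utf-8')

for platform in config.sections():
    # e.g. platform '腾讯新闻' -> keywords ['看看新闻', '看看新闻Knews', 'Knews']
    keyword_str = config[platform].get('keyword', '')
    keywords = [kw for kw in keyword_str.split(',') if kw]
    print(platform, keywords)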