├── codes ├── apks │ ├── __init__.py │ ├── pipelines │ │ ├── __init__.py │ │ ├── folder_path.py │ │ ├── page_content_pipeline.py │ │ ├── apk_download_pipeline.py │ │ └── image_download_pipeline.py │ ├── images │ │ ├── android.png │ │ ├── folder.png │ │ ├── search.png │ │ ├── version.png │ │ └── folder_import.png │ ├── utils.py │ ├── middlewares.py │ ├── spiders │ │ ├── __init__.py │ │ ├── github_spider.py │ │ ├── fossdroid_spider.py │ │ ├── opensource_spider.py │ │ ├── xiaomi_spider.py │ │ └── apkpure_spider.py │ ├── items.py │ ├── statistic.py │ ├── update_hash.py │ ├── main.py │ ├── apk_filter.py │ ├── get_apk_info.py │ ├── settings.py │ ├── copy_data_incrementally.py │ ├── crawler_gui.py │ ├── ui_thread.py │ ├── database.py │ ├── main_gui.py │ └── data_gui.py ├── tools │ ├── __init__.py │ ├── add_customer_app.py │ ├── update_information.py │ ├── timing.sh │ └── log_analysis.py ├── requirement.txt └── scrapy.cfg ├── log └── README.md ├── data └── README.md ├── pictures ├── gui.png ├── apkpure.png ├── github.png ├── xiaomi.png └── fossdroid.png ├── documents ├── 使用手册.pdf ├── patch.2020-08-20.sql ├── pictures │ ├── image-20210322180446897.png │ ├── image-20210617174018715.png │ ├── image-20210617174458840.png │ ├── image-20210617212251905.png │ └── image-20210617212601123.png ├── patch.2020-11-26.sql ├── patch.2020-08-20-2.sql ├── README.md ├── my.cnf ├── patch.2020-08-09.sql ├── patch.2020-11-02.sql ├── patch.2020-10-30.sql ├── patch.2020-12-15.sql ├── 使用手册.md └── apk_merge.sql ├── LICENSE ├── README.md └── .gitignore /codes/apks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /codes/tools/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | -------------------------------------------------------------------------------- /log/README.md: 
-------------------------------------------------------------------------------- 1 | # `Log` Folder 2 | 存储爬虫运行过程中存储的日志 -------------------------------------------------------------------------------- /codes/tools/add_customer_app.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | # `Data` Folder 2 | 存储下载后的apk文件及其相关的描述和图片 -------------------------------------------------------------------------------- /pictures/gui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RiskySignal/APKCrawler/HEAD/pictures/gui.png -------------------------------------------------------------------------------- /codes/apks/pipelines/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from .folder_path import get_file_size 3 | -------------------------------------------------------------------------------- /codes/requirement.txt: -------------------------------------------------------------------------------- 1 | pymysql 2 | scrapy 3 | click 4 | image 5 | python-crontab 6 | pyqt5 7 | sip -------------------------------------------------------------------------------- /codes/tools/update_information.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | def update_information(): 4 | pass -------------------------------------------------------------------------------- /documents/使用手册.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RiskySignal/APKCrawler/HEAD/documents/使用手册.pdf -------------------------------------------------------------------------------- /pictures/apkpure.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/RiskySignal/APKCrawler/HEAD/pictures/apkpure.png -------------------------------------------------------------------------------- /pictures/github.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RiskySignal/APKCrawler/HEAD/pictures/github.png -------------------------------------------------------------------------------- /pictures/xiaomi.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RiskySignal/APKCrawler/HEAD/pictures/xiaomi.png -------------------------------------------------------------------------------- /pictures/fossdroid.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RiskySignal/APKCrawler/HEAD/pictures/fossdroid.png -------------------------------------------------------------------------------- /codes/apks/images/android.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RiskySignal/APKCrawler/HEAD/codes/apks/images/android.png -------------------------------------------------------------------------------- /codes/apks/images/folder.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RiskySignal/APKCrawler/HEAD/codes/apks/images/folder.png -------------------------------------------------------------------------------- /codes/apks/images/search.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RiskySignal/APKCrawler/HEAD/codes/apks/images/search.png -------------------------------------------------------------------------------- /codes/apks/images/version.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/RiskySignal/APKCrawler/HEAD/codes/apks/images/version.png -------------------------------------------------------------------------------- /codes/apks/images/folder_import.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RiskySignal/APKCrawler/HEAD/codes/apks/images/folder_import.png -------------------------------------------------------------------------------- /documents/patch.2020-08-20.sql: -------------------------------------------------------------------------------- 1 | ALTER TABLE `apk_merge`.`update` 2 | ADD COLUMN `apk_hash` binary(32) NULL COMMENT 'apk sha256值' AFTER `is_download`; -------------------------------------------------------------------------------- /documents/pictures/image-20210322180446897.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RiskySignal/APKCrawler/HEAD/documents/pictures/image-20210322180446897.png -------------------------------------------------------------------------------- /documents/pictures/image-20210617174018715.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RiskySignal/APKCrawler/HEAD/documents/pictures/image-20210617174018715.png -------------------------------------------------------------------------------- /documents/pictures/image-20210617174458840.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RiskySignal/APKCrawler/HEAD/documents/pictures/image-20210617174458840.png -------------------------------------------------------------------------------- /documents/pictures/image-20210617212251905.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/RiskySignal/APKCrawler/HEAD/documents/pictures/image-20210617212251905.png -------------------------------------------------------------------------------- /documents/pictures/image-20210617212601123.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RiskySignal/APKCrawler/HEAD/documents/pictures/image-20210617212601123.png -------------------------------------------------------------------------------- /codes/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = apks.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = apks 12 | -------------------------------------------------------------------------------- /documents/patch.2020-11-26.sql: -------------------------------------------------------------------------------- 1 | 2 | ALTER TABLE `apk_merge`.`app_type` COMMENT = '应用类型表'; 3 | 4 | ALTER TABLE `apk_merge`.`developer` COMMENT = '开发者表'; 5 | 6 | ALTER TABLE `apk_merge`.`update` COMMENT = '应用版本更新表'; 7 | 8 | ALTER TABLE `apk_merge`.`update` 9 | ADD COLUMN `is_delete` bit(1) NOT NULL DEFAULT 0 COMMENT '删除标记' AFTER `update_date`; -------------------------------------------------------------------------------- /codes/tools/timing.sh: -------------------------------------------------------------------------------- 1 | # 每月3号中午12.30启动xiaomi爬虫 2 | 30 12 3 * * python3 ~/Desktop/auto_apk_merge/crawler/codes/apks/main.py --market_name xiaomi 3 | 4 | 5 | # 每月13号中午12.30启动fossdroid爬虫 6 | 30 12 13 * * python3 ~/Desktop/auto_apk_merge/crawler/codes/apks/main.py --market_name fossdroid 7 | 8 | 9 | # 每月23号中午12.30启动apkpure爬虫 10 | 30 12 23 * * python3 ~/Desktop/auto_apk_merge/crawler/codes/apks/main.py 
--market_name apkpure -------------------------------------------------------------------------------- /codes/apks/utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import hashlib 3 | 4 | 5 | def cal_file_hash(file_path: str): 6 | sha256_value = hashlib.sha256() 7 | with open(file_path, 'rb') as _apk_file_: 8 | while True: 9 | data_flow = _apk_file_.read(8096) 10 | if not data_flow: 11 | break 12 | sha256_value.update(data_flow) 13 | sha256_value = sha256_value.hexdigest() 14 | return sha256_value 15 | -------------------------------------------------------------------------------- /documents/patch.2020-08-20-2.sql: -------------------------------------------------------------------------------- 1 | DROP PROCEDURE IF EXISTS `apk_merge`.`set_update_available`; 2 | 3 | delimiter ;; 4 | CREATE 5 | DEFINER = `root`@`localhost` PROCEDURE `set_update_available`(IN `update_id_in` int unsigned, IN `size_in` char(20), IN `hash_in` char(64)) 6 | BEGIN 7 | UPDATE `update` 8 | SET is_download= TRUE, 9 | size=size_in, 10 | apk_hash=UNHEX(hash_in) 11 | WHERE update_id = update_id_in; 12 | END;; 13 | delimiter ; -------------------------------------------------------------------------------- /codes/apks/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | import random 4 | 5 | import settings 6 | 7 | 8 | class UserAgentMiddleware: 9 | logger = logging.getLogger("User Agent Middleware") 10 | 11 | def process_request(self, request, spider): 12 | random_ua = random.choice(settings.USER_AGENT_LIST) 13 | request.headers['User-Agent'] = random_ua 14 | if spider.settings["USING_PROXY"]: 15 | request.meta['proxy'] = settings.PROXY_PATH 16 | -------------------------------------------------------------------------------- /codes/apks/spiders/__init__.py: -------------------------------------------------------------------------------- 
1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | from .apkpure_spider import ApkPureSpider 6 | from .fossdroid_spider import FossDroidSpider 7 | from .xiaomi_spider import XiaomiSpider 8 | from .github_spider import GithubSpider 9 | from .opensource_spider import OpenSourceSpider 10 | 11 | __all__ = ['ApkPureSpider', 'FossDroidSpider', 'XiaomiSpider', 'GithubSpider', 'OpenSourceSpider'] 12 | -------------------------------------------------------------------------------- /documents/README.md: -------------------------------------------------------------------------------- 1 | # `Document` Folder 2 | 3 | 存储数据库配置、数据库sql脚本及其他文档文件. 4 | 5 | - `apk_merge.sql` 数据库sql脚本. 6 | - `patch.2020-08-09.sql` 数据库sql脚本2020-08-09日补丁, 已在`apk_merge.sql`中修复该漏洞. 7 | - `patch.2020-08-20.sql` 数据库sql脚本2020-08-20日补丁, 已在`apk_merge.sql`中修复该漏洞. 8 | - `patch.2020-08-20-2.sql` 数据库sql脚本2020-08-20日补丁2,解决前一个补丁新增apk_hash字段导致的新增数据报错问题, 已在`apk_merge.sql`中修复该漏洞. 
9 | - `patch.2020-10-30.sql` 数据库sql脚本2020-10-30补丁,添加新的字段,已在`apk_merge.sql`中修复该漏洞 10 | - `patch.2020-11-02.sql` 数据库sql脚本2020-11-02补丁,添加并更新存储过程,已在`apk_merge.sql`中修复该漏洞 11 | - `patch.2020-12-15.sql` 数据库sql脚本2020-12-15补丁, 添加了从文件导入apk的存储过程, 并设置链接字段为 "可为空"的, 已在 `apk_merge.sql` 中修复该漏洞 -------------------------------------------------------------------------------- /codes/apks/items.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://docs.scrapy.org/en/latest/topics/items.html 7 | import scrapy 8 | 9 | 10 | class AppDetail(scrapy.Item): 11 | app_title = scrapy.Field() # app 名字 12 | apk_name = scrapy.Field() # apk 包名 13 | description = scrapy.Field() # 介绍 14 | developer = scrapy.Field() # 开发者 15 | app_link = scrapy.Field() # 网页链接 16 | category = scrapy.Field() # 类别 17 | market = scrapy.Field() # 应用市场 18 | version = scrapy.Field() # 版本号 19 | picture_links = scrapy.Field() # 截图链接 20 | size = scrapy.Field() # apk大小 21 | download_link = scrapy.Field() # 下载地址 22 | update_id = scrapy.Field() # update id 23 | picture_link_ids = scrapy.Field() # 截图id 24 | update_date = scrapy.Field() # update date 25 | -------------------------------------------------------------------------------- /codes/apks/pipelines/folder_path.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import os 3 | 4 | __current_folder__ = os.path.dirname(__file__) 5 | 6 | 7 | def get_app_folder(item): 8 | market = item['market'] 9 | apk_name = item['apk_name'] 10 | version = item['version'] 11 | return os.path.join(os.path.dirname(__file__), "../../../data", market, apk_name, version) 12 | 13 | 14 | def get_file_size(file_path: str): 15 | size = os.path.getsize(file_path) 16 | 17 | def strofsize(integer, remainder, level): 18 | if integer >= 1024: 19 | remainder = integer % 1024 20 | integer //= 1024 21 | 
level += 1 22 | return strofsize(integer, remainder, level) 23 | else: 24 | return integer, remainder, level 25 | 26 | units = ['B', 'KB', 'MB', 'GB', 'TB', 'PB'] 27 | integer, remainder, level = strofsize(size, 0, 0) 28 | if level + 1 > len(units): 29 | level = -1 30 | return ('{}.{:>03d} {}'.format(integer, remainder, units[level])) 31 | -------------------------------------------------------------------------------- /codes/tools/log_analysis.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | def read_log_file(log_file): 4 | crawled_url = [] 5 | with open(log_file, 'r') as _file_: 6 | for line in _file_.readlines(): 7 | if "scrapy.core.engine" in line: 8 | infos = line.split() 9 | for info in infos: 10 | if 'http' in info: 11 | info = info[:-1] 12 | crawled_url.append(info) 13 | 14 | return crawled_url 15 | 16 | 17 | def log_analysis(log_file_a, log_file_b): 18 | """ find url in log file b but not in log file a.""" 19 | crawled_url_a = read_log_file(log_file_a) 20 | crawled_url_b = read_log_file(log_file_b) # type: list 21 | 22 | for url in crawled_url_b: 23 | if url not in crawled_url_a: 24 | print(url) 25 | 26 | 27 | if __name__ == '__main__': 28 | file_a = "../../log/" + "1580663423.txt" 29 | file_b = "../../log/" + "1580710260.txt" 30 | 31 | log_analysis(file_a, file_b) 32 | -------------------------------------------------------------------------------- /documents/my.cnf: -------------------------------------------------------------------------------- 1 | # 2 | # The MySQL database server configuration file. 3 | # 4 | # You can copy this to one of: 5 | # - "/etc/mysql/my.cnf" to set global options, 6 | # - "~/.my.cnf" to set user-specific options. 7 | # 8 | # One can use all long options that the program supports. 9 | # Run program with --help to get a list of available options and with 10 | # --print-defaults to see which it would actually understand and use. 
11 | # 12 | # For explanations see 13 | # http://dev.mysql.com/doc/mysql/en/server-system-variables.html 14 | 15 | # 16 | # * IMPORTANT: Additional settings that can override those from this file! 17 | # The files must end with '.cnf', otherwise they'll be ignored. 18 | # 19 | 20 | !includedir /etc/mysql/conf.d/ 21 | !includedir /etc/mysql/mysql.conf.d/ 22 | 23 | [client] 24 | default-character-set = utf8 25 | 26 | [mysqld] 27 | character-set-server = utf8 28 | server_id = 1 29 | log-bin=mysql-bin.log 30 | innodb_autoinc_lock_mode = 0 31 | #init_connect='SET NAMES utf8' 32 | 33 | [mysql] 34 | default-character-set = utf8 35 | 36 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Ceres 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /codes/apks/pipelines/page_content_pipeline.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | import os 9 | import time 10 | 11 | from items import AppDetail 12 | from database import Database 13 | from pipelines.folder_path import get_app_folder 14 | from settings import DEFAULT_CATEGORY, DEFAULT_SIZE, DEFAULT_VERSION 15 | 16 | 17 | class ContentPipeline(object): 18 | 19 | def __init__(self): 20 | """ 21 | init func 22 | """ 23 | # init database source 24 | self.db_handler = Database() 25 | 26 | def process_item(self, item: AppDetail, spider): 27 | item.setdefault("description", None) 28 | item.setdefault("category", DEFAULT_CATEGORY) 29 | item.setdefault("size", DEFAULT_SIZE) 30 | item.setdefault("version", DEFAULT_VERSION) 31 | item.setdefault("picture_links", []) 32 | item.setdefault("picture_link_ids", []) 33 | item.setdefault('update_id', None) 34 | if item['update_date'] is None: 35 | item['update_date'] = time.strftime("%Y-%m-%d", time.localtime()) 36 | 37 | app_folder = get_app_folder(item) 38 | if not os.path.exists(app_folder): 39 | os.makedirs(app_folder) 40 | 41 | # 保存app介绍 42 | if item['description']: 43 | des_file = os.path.join(app_folder, "description.txt") 44 | with open(des_file, 'w') as _file_: 45 | _file_.write(item["description"]) 46 | 47 | # 保存app和update信息 48 | self.db_handler.insert_app(item) 49 | 50 | return item 51 | -------------------------------------------------------------------------------- /codes/apks/statistic.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import time 3 | 4 | from database import Database 5 | 6 | 7 | def statistic(): 8 | 
print("统计时间:" + time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))) 9 | db_handler = Database() 10 | 11 | market_names = ['xiaomi', 'fossdroid', 'apkpure'] 12 | 13 | # app number 14 | app_numbers = [] 15 | for market_name in market_names: 16 | app_number = db_handler.get_app_number(market_name) 17 | app_numbers.append(app_number) 18 | 19 | # update number 20 | update_numbers = [] 21 | for market_name in market_names: 22 | update_number = db_handler.get_update_number(market_name) 23 | update_numbers.append(update_number) 24 | 25 | # available update number 26 | available_update_numbers = [] 27 | for market_name in market_names: 28 | available_update_number = db_handler.get_available_update_number(market_name) 29 | available_update_numbers.append(available_update_number) 30 | 31 | # diff type update number 32 | diff_type_names, diff_type_update_numbers = db_handler.get_diff_type_update_number() 33 | 34 | # print 35 | print_string = "" 36 | print_string += "已扫描的APP数量:\n\t" 37 | for _i in range(len(market_names)): 38 | print_string += "{}-{}\t\t".format(market_names[_i], app_numbers[_i]) 39 | print_string += "\n" 40 | 41 | print_string += "已扫描的UPDATE数量:\n\t" 42 | for _i in range(len(market_names)): 43 | print_string += "{}-{}\t\t".format(market_names[_i], update_numbers[_i]) 44 | print_string += "\n" 45 | 46 | print_string += "已下载的UPDATE数量:\n\t" 47 | for _i in range(len(market_names)): 48 | print_string += "{}-{}\t\t".format(market_names[_i], available_update_numbers[_i]) 49 | print_string += "\n" 50 | 51 | print_string += "各分类的UPDATE数量:\n\t" 52 | for _i in range(len(diff_type_names)): 53 | print_string += "'{}'-{} ".format(diff_type_names[_i], diff_type_update_numbers[_i]) 54 | print_string += "\n" 55 | 56 | print(print_string) 57 | 58 | 59 | if __name__ == '__main__': 60 | statistic() 61 | -------------------------------------------------------------------------------- /codes/apks/update_hash.py: 
# coding=utf-8
import logging
import os

import pymysql

from database import Database
from pipelines import get_file_size
from settings import DEFAULT_SIZE, FILES_STORE
from utils import cal_file_hash


def _apk_file_name(market_name: str, apk_name: str) -> str:
    """Return the on-disk file name used for a downloaded package.

    Mirrors the naming rule in ApkDownloadPipeline.get_media_requests:
    'github_opensource' downloads are stored as '<apk_name>.zip'; every
    other market stores '<apk_name>' with a '.apk' suffix appended only
    when it is not already present.
    """
    if market_name == "github_opensource":
        return apk_name + ".zip"
    if not apk_name.endswith(".apk"):
        return apk_name + ".apk"
    return apk_name


def update_hash():
    """Backfill missing sha256 hashes (and placeholder sizes) for downloaded apks.

    Selects every `update` row flagged is_download=TRUE whose apk_hash is
    still NULL, recomputes the sha256 of the file on disk and writes it
    back.  When the stored size equals the DEFAULT_SIZE placeholder, the
    real file size is written as well.  All writes happen in one
    transaction: any pymysql error rolls the whole batch back and is
    re-raised; the connection is always closed.
    """
    db_handler = Database()
    cursor = db_handler.get_cursor()

    select_sql = (
        "select update_id, market_name, apk_name, version, size from `update` "
        "inner join app a on `update`.app_id = a.app_id "
        "inner join market b on a.market_id = b.market_id "
        "where is_download=TRUE and apk_hash is NULL;"
    )
    update_hash_sql = "update `update` set apk_hash=unhex(%s) " \
                      "where update_id=%s;"
    update_size_sql = "update `update` set size=%s " \
                      "where update_id=%s;"

    try:
        # Fetch every downloaded update that still lacks a hash.
        cursor.execute(select_sql)
        for row in cursor.fetchall():
            # Columns come back as bytes here (see the .decode calls in the
            # original code) — decode each field exactly once.
            market_name = row['market_name'].decode('utf-8')
            apk_name = row['apk_name'].decode('utf-8')
            version = row['version'].decode('utf-8')

            # Bug fix: use the same file-name rule as ApkDownloadPipeline so
            # packages whose apk_name lacks the '.apk' suffix (or zip archives
            # from github_opensource) are found instead of being warned about.
            apk_path = os.path.join(
                FILES_STORE, market_name, apk_name, version,
                _apk_file_name(market_name, apk_name)
            )
            if not os.path.exists(apk_path):
                logging.warning("Apk {} not found, but it had been downloaded.".format(apk_path))
                continue

            # Store the sha256 (DB column is binary(32), hence unhex()).
            cursor.execute(
                update_hash_sql,
                (cal_file_hash(apk_path), row['update_id'])
            )

            # Replace the placeholder size with the real on-disk size.
            if row['size'].decode('utf-8') == DEFAULT_SIZE:
                cursor.execute(
                    update_size_sql,
                    (get_file_size(apk_path), row['update_id'])
                )
    except pymysql.Error as _err:
        db_handler.db.rollback()
        raise _err
    else:
        db_handler.db.commit()
        print("Done!")
    finally:
        db_handler.close()


if __name__ == '__main__':
    update_hash()
-------------------------------------------------------------------------------- /codes/apks/main.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import click 3 | import time 4 | import os 5 | 6 | from scrapy.crawler import CrawlerProcess 7 | from scrapy.utils.project import get_project_settings 8 | from spiders import * 9 | 10 | 11 | @click.command() 12 | @click.option("--market_name", "-m", default="fossdroid", type=click.Choice(["fossdroid", "xiaomi", "apkpure", "github", "github_opensource"], case_sensitive=False), help="Market Name in ['fossdroid', 'xiaomi', 'apkpure', 'github', 'github_opensource']. Default is fossdroid.") 13 | @click.option("--using_log_file", "-l", expose_value=True, is_flag=True, help="Whether use log file to save the log information, default is False.") 14 | @click.option("--log_level", "-v", default="INFO", type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR']), help="Log level in ['DEBUG', 'INFO', 'WARNING', 'ERROR']. 
Default is INFO") 15 | @click.option("--using_proxy", "-u", default=False, type=bool, is_flag=True, help="Whether use proxy server on 127.0.0.1:10809.") 16 | def main(market_name: str = "fossdroid", using_log_file: bool = False, log_level: str = "INFO", using_proxy: bool = False): 17 | if market_name == "xiaomi": 18 | spider = XiaomiSpider 19 | elif market_name == "fossdroid": 20 | spider = FossDroidSpider 21 | elif market_name == "apkpure": 22 | spider = ApkPureSpider 23 | elif market_name == "github": 24 | spider = GithubSpider 25 | elif market_name == "github_opensource": 26 | spider = OpenSourceSpider 27 | else: 28 | raise ValueError("Market Name Error.") 29 | 30 | start_time = str(int(time.time())) 31 | cur_folder = os.path.dirname(__file__) 32 | log_folder = os.path.join(cur_folder, "../../log/") 33 | if not os.path.exists(log_folder): 34 | os.makedirs(log_folder, exist_ok=True) 35 | 36 | process = CrawlerProcess(get_project_settings()) 37 | if using_log_file: 38 | log_file = os.path.join(log_folder, "{}.{}.txt".format(start_time, market_name)) 39 | process.settings.set('LOG_FILE', log_file) # for developer environment 40 | if using_proxy: 41 | process.settings.set("USING_PROXY", True) 42 | process.settings.set('LOG_LEVEL', log_level) 43 | process.crawl(spider) 44 | process.start() 45 | 46 | 47 | if __name__ == '__main__': 48 | main() 49 | -------------------------------------------------------------------------------- /documents/patch.2020-08-09.sql: -------------------------------------------------------------------------------- 1 | DROP PROCEDURE IF EXISTS `apk_merge`.`insert_app_update`; 2 | 3 | delimiter ;; 4 | CREATE 5 | DEFINER = `root`@`localhost` PROCEDURE `insert_app_update`(IN `title_in` char(255), IN `name_in` char(255), IN `app_link_in` varchar(1023), 6 | IN `developer_in` VARCHAR(255), IN `type_in` varchar(255), IN `market_in` varchar(255), 7 | IN `version_in` char(255), IN `size_in` char(20), IN `download_link_in` varchar(1023)) 8 | BEGIN 9 | # 
declare local variables 10 | DECLARE local_app_link_id INT UNSIGNED; 11 | DECLARE local_download_link_id INT UNSIGNED; 12 | DECLARE local_market_id TINYINT UNSIGNED; 13 | DECLARE local_type_id SMALLINT UNSIGNED; 14 | DECLARE local_developer_id MEDIUMINT UNSIGNED; 15 | DECLARE local_app_id INT UNSIGNED; 16 | 17 | # save the link 18 | INSERT IGNORE INTO link(href) VALUES (app_link_in), (download_link_in); 19 | SELECT link_id INTO local_app_link_id FROM link WHERE href = app_link_in; 20 | SELECT link_id INTO local_download_link_id FROM link WHERE href = download_link_in; 21 | 22 | # save the marget 23 | INSERT IGNORE INTO market(market_name) VALUES (market_in); 24 | SELECT market_id INTO local_market_id FROM market WHERE market_name = market_in; 25 | 26 | # save the type 27 | INSERT IGNORE INTO app_type(type_name) VALUES (type_in); 28 | SELECT type_id INTO local_type_id FROM app_type WHERE type_name = type_in; 29 | 30 | # save the developer 31 | INSERT IGNORE INTO developer(developer_name) VALUES (developer_in); 32 | SELECT developer_id INTO local_developer_id FROM developer WHERE developer_name = developer_in; 33 | 34 | # save the app 35 | INSERT IGNORE INTO app(app_title, apk_name, app_link_id, developer_id, type_id, market_id) 36 | VALUES (title_in, name_in, local_app_link_id, local_developer_id, local_type_id, local_market_id) 37 | ON DUPLICATE KEY 38 | UPDATE app_title=title_in; 39 | SELECT app_id INTO local_app_id FROM app WHERE apk_name = name_in AND market_id = local_market_id; 40 | 41 | # save the update 42 | INSERT IGNORE INTO `update`(app_id, version, size, download_link_id) VALUES (local_app_id, version_in, size_in, local_download_link_id); 43 | SELECT update_id FROM `update` WHERE app_id = local_app_id AND version = version_in; 44 | END 45 | ;; 46 | delimiter ; -------------------------------------------------------------------------------- /codes/apks/pipelines/apk_download_pipeline.py: 
-------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import logging 3 | import os 4 | 5 | import scrapy 6 | from scrapy.exceptions import DropItem 7 | from scrapy.pipelines.files import FilesPipeline 8 | from .folder_path import get_file_size 9 | import settings as project_settings 10 | from items import AppDetail 11 | from utils import cal_file_hash 12 | from database import Database 13 | from pipelines.folder_path import get_app_folder 14 | 15 | 16 | class ApkDownloadPipeline(FilesPipeline): 17 | logger = logging.getLogger("ApkDownloadPipeline") 18 | 19 | def __init__(self, store_uri, download_func=None, settings=None): 20 | super(ApkDownloadPipeline, self).__init__(store_uri, download_func, settings) 21 | self.db_handler = Database() 22 | 23 | def get_media_requests(self, item: AppDetail, info): 24 | app_folder = get_app_folder(item) 25 | download_link = item['download_link'] 26 | apk_name = item['apk_name'] 27 | 28 | file_path = os.path.join(app_folder, apk_name) 29 | if item['market'] == "github_opensource": 30 | file_path += ".zip" 31 | elif not file_path.endswith('.apk'): 32 | file_path += '.apk' 33 | file_path = os.path.relpath(file_path, project_settings.FILES_STORE) 34 | if not self.db_handler.get_update_status(item['update_id']): 35 | yield scrapy.Request(download_link, meta={'file_path': file_path}) 36 | else: 37 | raise DropItem("Apk File {} exists.".format(download_link)) 38 | 39 | def file_path(self, request, response=None, info=None, *, item=None): 40 | return request.meta['file_path'] 41 | 42 | def item_completed(self, results, item: AppDetail, info): 43 | if results[0][0]: 44 | # download successfully 45 | self.logger.info("Download app '{}' version '{}' from market '{}' successfully.".format(item['app_title'], item['version'], item['market'])) 46 | apk_path = results[0][1]['path'] 47 | apk_path = os.path.join(project_settings.FILES_STORE, apk_path) 48 | apk_size = get_file_size(apk_path) 49 | 
apk_hash = cal_file_hash(apk_path) 50 | self.db_handler.set_update_available(item['update_id'], apk_size, apk_hash) 51 | return item 52 | else: 53 | # download fail 54 | self.logger.error("Fail to Download app '{}' version '{}' from market '{}'.".format(item['app_title'], item['version'], item['market'])) 55 | return item 56 | -------------------------------------------------------------------------------- /codes/apks/pipelines/image_download_pipeline.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import logging 3 | from scrapy.pipelines.images import ImagesPipeline 4 | from items import AppDetail 5 | from scrapy.utils.python import to_bytes 6 | import hashlib 7 | import scrapy 8 | import os 9 | from pipelines.folder_path import get_app_folder 10 | import settings as project_settings 11 | from database import Database 12 | 13 | 14 | class ImageDownloadPipeline(ImagesPipeline): 15 | logger = logging.getLogger("ImageDownloadPipeline") 16 | 17 | def __init__(self, store_uri, download_func=None, settings=None): 18 | super().__init__(store_uri, download_func, settings) 19 | self.db = Database() 20 | 21 | def get_media_requests(self, item: AppDetail, info): 22 | app_folder = get_app_folder(item) 23 | file_path = os.path.relpath(app_folder, project_settings.FILES_STORE) 24 | 25 | image_length = len(item['picture_links']) 26 | pruned_picture_links = [] 27 | pruned_picture_link_ids = [] 28 | for _image_index_ in range(image_length): 29 | picture_link = item['picture_links'][_image_index_] 30 | picture_link_id = item['picture_link_ids'][_image_index_] 31 | if not self.db.get_image_status(picture_link_id): 32 | pruned_picture_links.append(picture_link) 33 | pruned_picture_link_ids.append(picture_link_id) 34 | else: 35 | logging.info("Image file {} exists.".format(picture_link)) 36 | item['picture_links'] = pruned_picture_links 37 | item['picture_link_ids'] = pruned_picture_link_ids 38 | 39 | for picture_link in 
item['picture_links']: 40 | yield scrapy.Request(picture_link, meta={'file_path': file_path}) 41 | 42 | def file_path(self, request, response=None, info=None, *, item=None): 43 | image_guid = hashlib.sha1(to_bytes(request.url)).hexdigest() 44 | return os.path.join(request.meta['file_path'], "%s.jpg" % image_guid) 45 | 46 | def item_completed(self, results, item: AppDetail, info): 47 | for result_index in range(len(results)): 48 | result = results[result_index] 49 | if result[0]: 50 | self.logger.info("Download image '{}' successfully.".format(item['picture_links'][result_index])) 51 | self.db.set_image_available(item['picture_link_ids'][result_index]) 52 | else: 53 | self.logger.error("Fail to download image '{}'.".format(item['picture_links'][result_index])) 54 | 55 | return item 56 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Crawler for APP Platform 2 | 3 | 项目中抽离出来的 apk 爬虫模块,支持 [ApkPure](https://apkpure.com/) , [Github](https://github.com/search?q=apk&type=commits) , [Xiaomi](https://app.mi.com/) , [FossDroid](https://fossdroid.com/) . 4 | 5 | 整个爬虫采用 Scrapy + Mysql 对目标平台上的apk增量进行爬取,大家对这些平台按需爬取,不要影响这些平台的正常运作 :blush: . 6 | 7 | 8 | ## 目录结构 9 | 10 | - `codes` 目录下为主要的代码 11 | - `documents` 目录下存储必要的sql文件和其他必要的说明文件 12 | - `data` 目录下存储爬取后的apk文件 13 | - `log` 目录下存储爬取过程中的log日志 14 | - `pictures` 目录下存储图片 15 | 16 | ## 拷贝项目 17 | 18 | 1. 拷贝项目: 19 | 20 | $ cd workspace/ 21 | $ git clone https://github.com/RiskySignal/APKCrawler 22 | $ cd crawl_for_apk_merge/ 23 | 24 | ## 数据库Mysql配置 25 | 26 | 1. 安装Mysql: 详细可参照 [这里](https://wangxin1248.github.io/linux/2018/07/ubuntu18.04-install-mysqlserver.html), 仅需完成 **安装 Mysql** 和 **配置 Mysql** . 27 | 2. 确认在 **当前** 用户下可以通过命令 `mysql -u root -p` 登录mysql, 程序中默认的密码为 `123456` . 28 | 3. 初始化数据库: 29 | 30 | 其中复制 my.cnf 这一步不是必须的,你可以自己配置自己的mysql中的字符集为 `utf-8`. 
31 |  32 | $ cd crawl_for_apk_merge/ 33 | $ sudo cp ./documents/my.cnf /etc/mysql/my.cnf 34 | $ sudo service mysql restart 35 | $ mysql -u root -p 36 |  37 | mysql> create database apk_merge; 38 | mysql> source ~/workspace/crawl_for_apk_merge/documents/apk_merge.sql  # 替换项目的路径 39 |  40 | ## 爬虫搭建 41 |  42 | 1. 需要的 python 版本为 python3. 43 | 2. 进入代码文件夹 `cd ./crawl_for_apk_merge/codes`. 44 | 3. 安装依赖包 `pip install -r requirement.txt`. 45 | 4. 爬取 apk `python3 main.py --help` 可以查看具体的用法,爬取的过程主要与服务器的下载速度和 Market 的 Apk 数量相关. 46 | 5. 直接使用 gui 界面, `python3 main_gui.py` , 在 windows 上不能使用定时器. 47 |    > 使用 GUI 前, 请配置 settings.py 最后两项. 48 |  49 | ## 下载统计及其他脚本 50 |  51 | - 下载统计: `python3 statistic.py` 可以查看当前扫描的app数量、apk数量、各类型的数量. 52 | - 筛选apk: `python3 apk_filter.py --help` 可以根据平台和apk大小筛选包. 53 | - 根据apk获取相应信息: `python3 get_apk_info.py --help` 给定apk安装包位置,获取其相应信息. 54 | - 定时任务 timing.sh 设置定时任务 55 | - ~~修复数据库中apk安装包hash值缺失问题: `python3 update_hash.py` 重新计算已有apk的hash并存入数据库 (该hash已经全部入库,正常配置数据库的情况下不再需要该脚本)~~. 56 | - ~~修复数据库自增id空洞问题: `python3 optimize_database.py` 自动修复空洞问题 (该bug已经通过上述的mysql配置解决,正常配置数据库的情况下不再需要该脚本)~~. 57 |  58 | ## 运行截图 59 |  60 | 1. github 61 | ![github](./pictures/github.png) 62 |  63 | 2. xiaomi 64 | ![xiaomi](./pictures/xiaomi.png) 65 |  66 | 3. Fossdroid 67 | ![fossdroid](./pictures/fossdroid.png) 68 |  69 | 4. ApkPure 70 | ![apkpure](./pictures/apkpure.png) 71 |  72 | 5. 
GUI界面 73 | ![image-20201215194718576](pictures/gui.png) 74 | 75 | ## Todo 76 | - ~~写一个简单的QT GUI界面~~ -> 实现了集成 增删查, 定时器, 爬虫在内的界面; 77 | - 扩展爬虫平台 78 | - 将代理的设置加入 GUI 界面 79 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # idea 132 | .idea/ 133 | 134 | # data 135 | data/* 136 | !data/README.md 137 | 138 | # log 139 | log/* 140 | !log/README.md 141 | 142 | # documents 143 | documents/*.docx 144 | documents/*.doc 145 | documents/*.xlsx 146 | !documents/*.sql 147 | 148 | # words 149 | ~* 150 | output/ 151 | codes/apks/.DS_Store 152 | *.DS_Store 153 | -------------------------------------------------------------------------------- /documents/patch.2020-11-02.sql: -------------------------------------------------------------------------------- 1 | 2 | ALTER TABLE `apk_merge`.`update` 3 | ADD COLUMN `update_date` datetime NULL COMMENT '更新日期' AFTER `sdk_level`; 4 | 5 | ALTER TABLE `apk_merge`.`authority` 6 | MODIFY COLUMN `authority_name` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT '权限名称' AFTER `authority_id`; 7 | 8 | ALTER TABLE `apk_merge`.`link` 9 | MODIFY COLUMN `href` varchar(1023) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT '网址' AFTER `link_id`; 10 | 11 | DROP PROCEDURE IF EXISTS `insert_app_update`; 12 | delimiter ;; 13 | CREATE PROCEDURE `insert_app_update`(IN 
`title_in` char(255),IN `name_in` char(255),IN `app_link_in` varchar(1023),IN `developer_in` VARCHAR(255),IN `type_in` varchar(255),IN `market_in` varchar(255),IN `version_in` char(255),IN `size_in` char(20),IN `download_link_in` varchar(1023),IN `update_date_in` DATETIME) 14 | BEGIN 15 | 	# declare local variables 16 | 	DECLARE local_app_link_id INT UNSIGNED; 17 | 	DECLARE local_download_link_id INT UNSIGNED; 18 | 	DECLARE local_market_id TINYINT UNSIGNED; 19 | 	DECLARE local_type_id SMALLINT UNSIGNED; 20 | 	DECLARE local_developer_id MEDIUMINT UNSIGNED; 21 | 	DECLARE local_app_id INT UNSIGNED; 22 |  23 | 	# save the link 24 | 	INSERT IGNORE INTO link(href) VALUES(app_link_in), (download_link_in); 25 | 	SELECT link_id INTO local_app_link_id FROM link WHERE href=app_link_in; 26 | 	SELECT link_id INTO local_download_link_id FROM link WHERE href=download_link_in; 27 |  28 | 	# save the market 29 | 	INSERT IGNORE INTO market(market_name) VALUES(market_in); 30 | 	SELECT market_id INTO local_market_id FROM market WHERE market_name=market_in; 31 |  32 | 	# save the type 33 | 	INSERT IGNORE INTO app_type(type_name) VALUES(type_in); 34 | 	SELECT type_id INTO local_type_id FROM app_type WHERE type_name=type_in; 35 |  36 | 	# save the developer 37 | 	INSERT IGNORE INTO developer(developer_name) VALUES(developer_in); 38 | 	SELECT developer_id INTO local_developer_id FROM developer WHERE developer_name=developer_in; 39 |  40 | 	# save the app (upsert: refresh title/link/developer/type on duplicate)
 41 | 	INSERT IGNORE INTO app(app_title, apk_name, app_link_id, developer_id, type_id, market_id) VALUES(title_in, name_in, local_app_link_id, local_developer_id, local_type_id, local_market_id) 42 | 	ON DUPLICATE KEY 43 | 	UPDATE app_title=title_in, app_link_id=local_app_link_id, developer_id=local_developer_id, type_id=local_type_id; 44 | 	SELECT app_id INTO local_app_id FROM app WHERE apk_name=name_in AND market_id=local_market_id; 45 |  46 | 	# save the update
 47 | 	INSERT IGNORE INTO `update`(app_id, version, size, download_link_id, update_date) VALUES(local_app_id, 
version_in, size_in, local_download_link_id, update_date_in) 48 | ON DUPLICATE KEY 49 | UPDATE size=size_in, download_link_id=local_download_link_id, update_date=update_date_in; 50 | SELECT update_id FROM `update` WHERE app_id=local_app_id AND version=version_in; 51 | END 52 | ;; 53 | delimiter ; -------------------------------------------------------------------------------- /documents/patch.2020-10-30.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE `apk_merge`.`authority` 2 | ( 3 | `authority_id` smallint UNSIGNED NOT NULL AUTO_INCREMENT, 4 | `authority_name` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL, 5 | PRIMARY KEY (`authority_id`), 6 | UNIQUE INDEX `authority_name` (`authority_name`) USING BTREE COMMENT '权限名称唯一' 7 | ) COMMENT = '权限名称表'; 8 | 9 | CREATE TABLE `authority_relation` 10 | ( 11 | `authority_relation_id` int UNSIGNED NOT NULL AUTO_INCREMENT COMMENT '权限关系id', 12 | `update_id` int UNSIGNED NOT NULL COMMENT 'update id', 13 | `authority_id` smallint UNSIGNED NOT NULL COMMENT 'authority id', 14 | PRIMARY KEY (`authority_relation_id`) USING BTREE, 15 | INDEX `authority_relation_update_id` (`update_id`) USING BTREE, 16 | INDEX `authority_relation_authority_id` (`authority_id`) USING BTREE, 17 | CONSTRAINT `authority_relation_authority_id` FOREIGN KEY (`authority_id`) REFERENCES `authority` (`authority_id`) ON DELETE RESTRICT ON UPDATE RESTRICT, 18 | CONSTRAINT `authority_relation_update_id` FOREIGN KEY (`update_id`) REFERENCES `update` (`update_id`) ON DELETE RESTRICT ON UPDATE RESTRICT 19 | ) ENGINE = InnoDB 20 | CHARACTER SET = utf8 21 | COLLATE = utf8_general_ci COMMENT = 'update 对应 authority的关系' 22 | ROW_FORMAT = Dynamic; 23 | 24 | ALTER TABLE `apk_merge`.`update` 25 | ADD COLUMN `malware` bit(1) NULL DEFAULT 0 COMMENT '应用是否为恶意软件' AFTER `apk_hash`, 26 | ADD COLUMN `obfuscation` bit(1) NULL DEFAULT 0 COMMENT '应用是否为加固混淆应用' AFTER `malware`, 27 | ADD COLUMN `sdk_level` char(8) 
CHARACTER SET ascii COLLATE ascii_general_ci NULL COMMENT 'sdk level' AFTER `obfuscation`; 28 | 29 | DROP PROCEDURE IF EXISTS `insert_authority_relation`; 30 | delimiter ;; 31 | CREATE PROCEDURE `insert_authority_relation`(IN `hash_in` CHAR(64),IN `authority_name_in` varchar(255)) 32 | BEGIN 33 | # declare local variables 34 | DECLARE local_update_id INT UNSIGNED; 35 | DECLARE local_authority_id SMALLINT UNSIGNED; 36 | DECLARE done INT DEFAULT 0; 37 | DECLARE report CURSOR FOR SELECT update_id FROM `update` WHERE apk_hash=UNHEX(hash_in); 38 | DECLARE CONTINUE HANDLER FOR NOT FOUND SET done=1; 39 | 40 | # save the authority 41 | INSERT IGNORE INTO authority(authority_name) VALUES(authority_name_in); 42 | SELECT authority_id INTO local_authority_id FROM authority WHERE authority_name=authority_name_in; 43 | 44 | # get the update_id 45 | OPEN report; # open the cursor 46 | FETCH report INTO local_update_id; 47 | WHILE done<>1 DO 48 | INSERT IGNORE INTO authority_relation(update_id, authority_id) VALUES(local_update_id, local_authority_id); 49 | FETCH report INTO local_update_id; 50 | END WHILE; 51 | CLOSE report; # close the cursor 52 | 53 | END 54 | ;; 55 | delimiter ; -------------------------------------------------------------------------------- /codes/apks/spiders/github_spider.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import os 3 | 4 | import scrapy 5 | import logging 6 | import items 7 | 8 | 9 | class GithubSpider(scrapy.Spider): 10 | name = "github" 11 | logger = logging.getLogger("GithubSpider") 12 | 13 | def start_requests(self): 14 | start_url = "https://github.com/search?q=APK&type=Commits" 15 | yield scrapy.Request(start_url, callback=self.parse_list) 16 | 17 | def parse_list(self, response): 18 | commit_urls = response.css('div#commit_search_results div.commit a.sha.btn::attr("href")').getall() 19 | for commit_url in commit_urls: 20 | yield scrapy.Request( 21 | 
response.urljoin(commit_url), callback=self.parse_commit 22 | ) 23 | 24 | # parse next list 25 | next_list_url = response.css('a.next_page::attr("href")').get() 26 | if next_list_url: 27 | next_list_url = response.urljoin(next_list_url) 28 | yield scrapy.Request( 29 | response.urljoin(next_list_url), callback=self.parse_list 30 | ) 31 | 32 | def parse_commit(self, response): 33 | for commit_file_dom in response.css("div#files div.file"): # type: scrapy.Selector 34 | file_name = commit_file_dom.css("a.link-gray-dark::attr('title')").get() # type: str 35 | file_url = commit_file_dom.css('details-menu.dropdown-menu a.btn-link::attr("href")').get() # type: str 36 | if file_name.endswith(".apk"): # likely to be a apk file 37 | yield scrapy.Request( 38 | response.urljoin(file_url), callback=self.parse_file 39 | ) 40 | 41 | def parse_file(self, response): 42 | # download url 43 | download_url = response.url.replace("/blob/", '/raw/') 44 | 45 | # author name 46 | author = response.css("div.application-main span.author a::text").get() 47 | if not author: 48 | raise ValueError("Developer Name error.") 49 | app_link = response.url 50 | market = "github" 51 | 52 | # project name 53 | project_name = response.css('main div.flex-auto strong.flex-self-stretch a::text').get() 54 | app_title = project_name 55 | apk_name = app_title 56 | 57 | # version 58 | file_name = os.path.splitext(os.path.basename(response.url))[0] 59 | path_list = response.url.split('/') # type: list 60 | hash_index = path_list.index("blob") + 1 61 | sub_version_str = "" 62 | commit_hash = path_list[hash_index] 63 | for path in path_list[hash_index + 1:-1]: 64 | sub_version_str += "." + path[:2] 65 | version = file_name + "." 
# coding=utf-8
"""Filter downloaded apks into a target folder by market, size and per-type quota."""
import click
import os
import shutil
import random
import glob
from collections import defaultdict

import database


@click.command()
@click.argument("target_folder")
@click.option("--market", type=click.Choice(['all', 'xiaomi', 'fossdroid', 'apkpure'], case_sensitive=False), default="all", help="Which market of apps to be filtered. Default is all.")
@click.option("--apk_size", type=int, default=100, help="The max threshold of apk size in MB. Default is 100MB, if it's 0 then not limit the apk size.")
@click.option("--app_num_per_type", type=int, default=100, help="The max number of apps in filtered folder for each app type. Default is 100, if it's 0 then not limit the number.")
def apk_filt(target_folder: str, market: str = "all", apk_size: int = 100, app_num_per_type: int = 100):
    """Copy apks that satisfy the size limit into `target_folder`, grouped by
    app type, randomly sampling at most `app_num_per_type` apks per type."""
    limit_apk_size = apk_size > 0
    limit_type_num = app_num_per_type > 0

    if market == "all":
        market_folders = ["../../data/xiaomi", "../../data/fossdroid", "../../data/apkpure"]
    else:
        market_folders = ["../../data/{}".format(market)]
    # Resolve the data folders relative to this file, not the CWD.
    __current_folder__ = os.path.dirname(__file__)
    for _index_ in range(len(market_folders)):
        market_folders[_index_] = os.path.join(__current_folder__, market_folders[_index_])

    type_app_dict = defaultdict(list)
    db_handler = database.Database()
    _mb_ = float(1024 * 1024)
    for market_folder in market_folders:
        market_name = os.path.basename(market_folder)

        for app_folder in glob.glob(os.path.join(market_folder, "*")):
            apk_name = os.path.basename(app_folder)

            for update_folder in glob.glob(os.path.join(app_folder, "*")):
                version_name = os.path.basename(update_folder)

                apk_path = os.path.join(update_folder, apk_name)
                if not os.path.exists(apk_path):
                    continue
                # BUG FIX: the original only collected candidates inside an
                # `if limit_apk_size:` branch, so passing --apk_size 0
                # (documented as "no limit") collected nothing at all.
                # Now the size threshold is applied only when enabled.
                if limit_apk_size and os.path.getsize(apk_path) / _mb_ > apk_size:
                    continue
                type_name = db_handler.get_app_type(market_name, apk_name)
                output_type_folder = os.path.join(target_folder, type_name)
                if not os.path.exists(output_type_folder):
                    os.makedirs(output_type_folder)
                output_apk_path = os.path.join(output_type_folder, "{}-{}".format(version_name, apk_name))
                type_app_dict[type_name].append((output_apk_path, apk_path))

    filter_number_dict = defaultdict(lambda: 0)
    for type_key in type_app_dict.keys():
        type_app_list = type_app_dict[type_key]

        # Randomly sample down to the per-type quota when the limit is enabled.
        if limit_type_num and len(type_app_list) > app_num_per_type:
            random_index = random.sample(range(len(type_app_list)), app_num_per_type)
            type_app_list = [type_app_list[_index_] for _index_ in random_index]

        for output_apk_path, apk_path in type_app_list:
            filter_number_dict[type_key] += 1
            shutil.copy(apk_path, output_apk_path)

    print("Filter number for different app type: " + str(dict(filter_number_dict)))
    print("Done!")


if __name__ == '__main__':
    apk_filt()
INSERT IGNORE INTO market(market_name) VALUES(market_in); 26 | SELECT market_id INTO local_market_id FROM market WHERE market_name=market_in; 27 | 28 | # save the type 29 | INSERT IGNORE INTO app_type(type_name) VALUES(type_in); 30 | SELECT type_id INTO local_type_id FROM app_type WHERE type_name=type_in; 31 | 32 | # save the developer 33 | INSERT IGNORE INTO developer(developer_name) VALUES(developer_in); 34 | SELECT developer_id INTO local_developer_id FROM developer WHERE developer_name=developer_in; 35 | 36 | # save the app 37 | INSERT IGNORE INTO app(app_title, apk_name, app_link_id, developer_id, type_id, market_id) VALUES(title_in, name_in, local_app_link_id, local_developer_id, local_type_id, local_market_id) 38 | ON DUPLICATE KEY 39 | UPDATE app_title=title_in, app_link_id=local_app_link_id, developer_id=local_developer_id, type_id=local_type_id; 40 | SELECT app_id INTO local_app_id FROM app WHERE apk_name=name_in AND market_id=local_market_id; 41 | 42 | # save the update 43 | INSERT IGNORE INTO `update`(app_id, version, size, download_link_id, update_date) VALUES(local_app_id, version_in, size_in, local_download_link_id, update_date_in) 44 | ON DUPLICATE KEY 45 | UPDATE size=size_in, download_link_id=local_download_link_id, update_date=update_date_in; 46 | SELECT update_id FROM `update` WHERE app_id=local_app_id AND version=version_in; 47 | END 48 | ;; 49 | delimiter ; 50 | 51 | DROP PROCEDURE IF EXISTS `apk_merge`.`insert_app_from_file`; 52 | delimiter ;; 53 | CREATE DEFINER=`root`@`localhost` PROCEDURE `insert_app_from_file`(IN `title_in` char(255),IN `name_in` char(255),IN `developer_in` VARCHAR(255),IN `type_in` varchar(255),IN `market_in` varchar(255),IN `version_in` char(255),IN `size_in` char(20),IN `update_date_in` DATETIME,IN `apk_hash_in` CHAR(64)) 54 | BEGIN 55 | # declare local variables 56 | DECLARE local_market_id TINYINT UNSIGNED; 57 | DECLARE local_type_id SMALLINT UNSIGNED; 58 | DECLARE local_developer_id MEDIUMINT UNSIGNED; 59 | DECLARE 
local_app_id INT UNSIGNED; 60 | 61 | # save the market 62 | INSERT IGNORE INTO market(market_name) VALUES(market_in); 63 | SELECT market_id INTO local_market_id FROM market WHERE market_name=market_in; 64 | 65 | # save the type 66 | INSERT IGNORE INTO app_type(type_name) VALUES(type_in); 67 | SELECT type_id INTO local_type_id FROM app_type WHERE type_name=type_in; 68 | 69 | # save the developer 70 | INSERT IGNORE INTO developer(developer_name) VALUES(developer_in); 71 | SELECT developer_id INTO local_developer_id FROM developer WHERE developer_name=developer_in; 72 | 73 | # save the app 74 | INSERT IGNORE INTO app(app_title, apk_name, developer_id, type_id, market_id) VALUES(title_in, name_in, local_developer_id, local_type_id, local_market_id); 75 | SELECT app_id INTO local_app_id FROM app WHERE apk_name=name_in AND market_id=local_market_id; 76 | 77 | # save the update 78 | INSERT IGNORE INTO `update`(app_id, version, size, is_download, apk_hash, update_date) VALUES(local_app_id, version_in, size_in, TRUE, UNHEX(apk_hash_in), update_date_in); 79 | END 80 | ;; 81 | delimiter ; -------------------------------------------------------------------------------- /codes/apks/spiders/fossdroid_spider.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import os 3 | 4 | import scrapy 5 | 6 | import items 7 | import settings 8 | 9 | 10 | class FossDroidSpider(scrapy.Spider): 11 | name = "FossDroid Spider" 12 | 13 | def start_requests(self): 14 | start_url = "https://fossdroid.com/" 15 | yield scrapy.Request(start_url, callback=self.parse_homepage) 16 | 17 | def parse_homepage(self, response): 18 | """ 19 | parse homepage 20 | """ 21 | # 不同类别 22 | type_urls = response.css("nav.mdl-navigation.mdl-color--white a::attr('href')").getall() 23 | 24 | diff_sort_names = ["whats_new.html", "trending.html", "most_popular.html"] 25 | for type_url in type_urls: 26 | for diff_sort_name in diff_sort_names: 27 | new_url = 
response.urljoin(type_url) + diff_sort_name 28 | yield scrapy.Request(new_url, callback=self.parse_list) 29 | 30 | def parse_list(self, response): 31 | """ 32 | parse app list 33 | """ 34 | # 不同app 35 | app_urls = response.css("main.mdl-layout__content div.fd-list_applications div.fd-application div.mdl-card__actions a::attr('href')").getall() 36 | 37 | for app_url in app_urls: 38 | new_url = response.urljoin(app_url) 39 | yield scrapy.Request(new_url, callback=self.parse) 40 | 41 | def parse(self, response, **kwargs): 42 | """ 43 | parse app 44 | """ 45 | # app title 46 | app_title = response.css("main.mdl-layout__content div#fd-section_container section.fd-section div.mdl-card h1::text").get() 47 | if not app_title: 48 | raise ValueError('App Title Error.') 49 | 50 | # description 51 | description = "".join([line.strip() for line in response.css("main.mdl-layout__content div#fd-section_container section.fd-section div.mdl-card div.mdl-card__supporting-text::text").getall()]) 52 | 53 | # version 54 | try: 55 | app_version = response.css("main.mdl-layout__content div#fd-section_container section.fd-section div.mdl-card div.mdl-card__supporting-text div.fd-application_info::text").getall()[1].split(':')[1].strip() 56 | if not app_version: 57 | raise ValueError('App Version Error.') 58 | except IndexError: 59 | raise ValueError('App Version Error.') 60 | 61 | # update date 62 | try: 63 | update_date = response.css("main.mdl-layout__content div#fd-section_container section.fd-section div.mdl-card div.mdl-card__supporting-text div.fd-application_info::text").getall()[5].split(':')[1].strip() 64 | if not update_date: 65 | raise ValueError("App Update Date Error.") 66 | except IndexError: 67 | raise ValueError("App Update Date Error.") 68 | else: 69 | day, month, year = update_date.split('-') 70 | update_date = year + "-" + month + "-" + day 71 | 72 | # pictures 73 | picture_links = [response.urljoin(picture_link) for picture_link in 
# coding=utf-8
"""Look up crawler metadata for apk files and dump their SDK/permission info via aapt."""
import argparse
import os
import time
import subprocess
from database import Database
from utils import cal_file_hash


def get_apk_info(apk_path, print_info=True):
    """Query the database for crawler information about one apk.

    The apk is matched against the `update` table by its SHA-256 hash.

    :param apk_path: path of the apk file to look up.
    :param print_info: when True, also print a human readable report.
    :return: [{"app_title":.., "type_name":.., "market_name":.., "href":.., "language":..},...]
    :raises FileNotFoundError: if `apk_path` does not exist or is not a file.
    """
    if not os.path.exists(apk_path) or not os.path.isfile(apk_path):
        raise FileNotFoundError("The apk {} is not existing.".format(apk_path))

    sha256_value = cal_file_hash(apk_path)
    sql_str = "select app_title, type_name, market_name, href from app " \
              "inner join `update` u on app.app_id = u.app_id " \
              "inner join market m on m.market_id=app.market_id " \
              "inner join app_type a on app.type_id = a.type_id " \
              "inner join link l on app.app_link_id = l.link_id " \
              "where apk_hash=unhex(%s);"
    db_handler = Database()
    cursor = db_handler.get_cursor()
    # Pass parameters as a sequence -- the standard DB-API form.
    cursor.execute(sql_str, (sha256_value,))
    results = cursor.fetchall()

    for result in results:
        # Columns come back as bytes; decode them for display/use.
        result['app_title'] = result['app_title'].decode('utf-8')
        result['type_name'] = result['type_name'].decode('utf-8')
        result['market_name'] = result['market_name'].decode('utf-8')
        result['href'] = result['href'].decode('utf-8')
        result['language'] = "中文" if result['market_name'] == "xiaomi" else "English"

    if print_info:
        print("APK file: {}\nSHA256 Value: {}".format(apk_path, sha256_value))
        if results:
            print("Find {} information about it.".format(len(results)))

            for result in results:
                print("-" * 64)
                print("App Title: {app_title}\nType Name: {type_name}\nMarket Name: {market_name}\nHref: {href}\nLanguage: {language}".format(**result))
            print("-" * 64)
        else:
            print("Not found any crawler information about it.")

    return results


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Get apks info.")
    parser.add_argument("apk_path", type=str, help="The apks folder path.")
    arg_infos = parser.parse_args()

    apks_path = arg_infos.apk_path
    apks = []
    for root, dirs, files in os.walk(apks_path):
        for f in files:
            if f.endswith('.apk'):
                apks.append(os.path.join(root, f))

    timenow = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime())
    log = []
    for apk in apks:
        ###### Get Crawler info ######
        # NOTE(review): get_apk_info() already prints the report itself, and
        # the block below prints the same information a second time; kept to
        # preserve the original console output.
        results = get_apk_info(apk)
        if results:
            print("Find {} information about it.".format(len(results)))
            for result in results:
                print("-" * 64)
                print("App Title: {app_title}\nType Name: {type_name}\nMarket Name: {market_name}\nHref: {href}\nLanguage: {language}".format(**result))
                log.append("-" * 64 + '\n')
                log.append("App Title: {app_title}\nType Name: {type_name}\nMarket Name: {market_name}\nHref: {href}\nLanguage: {language}\n".format(**result))
        else:
            log.append("No Crawler info of APK " + apk + '\n')
            print("Not found any crawler information about it.")

        ###### Get SDK and permission info ######
        # Run aapt with an argument list instead of `shell=True` plus string
        # concatenation: apk paths containing spaces or shell metacharacters
        # previously broke (or could inject into) the shell command line.
        try:
            with open('apkversion.txt', 'w') as out_fp:
                ret = subprocess.call(['aapt', 'list', '-a', apk], stdout=out_fp)
        except FileNotFoundError:
            ret = 127  # aapt not installed; mirrors the shell's "command not found"
        if ret == 0:
            # find targetSdkVersion in apkversion.txt
            # example : android:targetSdkVersion(0x0101020c)=(type 0x10)0xf - 0xf - 15
            sdk_info = 'android:targetSdkVersion'
            permission_info = 'android.permission'
            with open('apkversion.txt', 'r') as fp:
                lines = fp.readlines()
            print('=== Processing APK ' + apk + ' ===')
            log.append('=== SDK and Permission info of APK ' + apk + ' ===\n')
            for line in lines:
                if sdk_info in line:
                    sdkversion = line.rsplit(')')[-1].strip()
                    print('targetSdkVersion:' + sdkversion)
                    log.append('targetSdkVersion:' + sdkversion + '\n')
                if permission_info in line:
                    permission = line.rsplit('"')[-2]
                    print('uses-permission:' + permission)
                    log.append('uses-permission:' + permission + '\n')
        else:
            print('[AAPT PROCESS WARNING]' + apk)
        log.append("-" * 64 + '\n')

    # Write the whole report once and close the handle (the original opened
    # the log file up-front and never closed it).
    with open('../../log/apk_info_%s.log' % (timenow,), 'a+') as logfile:
        logfile.writelines(log)

    if os.path.exists('apkversion.txt'):
        os.remove('apkversion.txt')
date_time[0] 52 | for _index_ in range(1, len(date_time)): 53 | update_date = date_time[_index_] if date_time[_index_] > update_date else update_date 54 | try: 55 | update_date = update_date.replace('T', ' ').replace('Z', ' ') 56 | except Exception: 57 | logging.warning("Load Update Info Error for {}".format(response.url)) 58 | yield scrapy.Request(response.url, callback=self.parse_repository) 59 | else: 60 | # description 61 | description = response.css("div.repository-content div.BorderGrid div.BorderGrid-cell p::text").get() 62 | if not description: 63 | description = "No Description." 64 | else: 65 | description = description.strip() 66 | 67 | # download url 68 | zip_url = response.css("div.repository-content details.details-overlay ul.list-style-none li.Box-row a::attr('href')").getall()[1] 69 | zip_url = response.urljoin(zip_url) 70 | 71 | info = { 72 | "author": author, 73 | "app_link": app_link, 74 | "project_name": project_name, 75 | "update_date": update_date, 76 | "description": description, 77 | "download_url": zip_url 78 | } 79 | 80 | yield scrapy.Request( 81 | response.url, callback=self.parse_folder_check, meta=info 82 | ) 83 | 84 | def parse_folder_check(self, response): 85 | # check whether a android project 86 | svg_labels = response.css("div.repository-content div.Details div.Box-row svg::attr('aria-label')").getall() 87 | files_and_folders = response.css("div.repository-content div.Details div.Box-row div[role='rowheader'] a::text").getall() 88 | assert len(svg_labels) == len(files_and_folders) 89 | is_file = [svg_label == "File" for svg_label in svg_labels] 90 | is_android_project = False 91 | for _index_ in range(len(files_and_folders)): 92 | if is_file[_index_] and 'build.gradle' == files_and_folders[_index_]: 93 | is_android_project = True 94 | 95 | if is_android_project: 96 | app_link = response.meta['app_link'] 97 | project_name = response.meta['project_name'] 98 | app_title = apk_name = project_name 99 | 100 | yield 
items.AppDetail(app_title=app_title, apk_name=apk_name, description=response.meta['description'], developer=response.meta['author'], app_link=app_link, market="github_opensource", version=response.meta['update_date'].split()[0], download_link=response.meta['download_url'], update_date=response.meta['update_date']) 101 | else: 102 | files_and_folders_url = response.css("div.repository-content div.Details div.Box-row div[role='rowheader'] a::attr('href')").getall() 103 | assert len(files_and_folders_url) == len(svg_labels) 104 | is_folder = [svg_label == "Directory" for svg_label in svg_labels] 105 | for _index_ in range(len(files_and_folders_url)): 106 | if is_folder[_index_]: 107 | yield scrapy.Request( 108 | response.urljoin(files_and_folders_url[_index_]), callback=self.parse_folder_check 109 | ) 110 | -------------------------------------------------------------------------------- /codes/apks/spiders/xiaomi_spider.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import json 3 | import os 4 | import scrapy 5 | from urllib import parse 6 | 7 | import items 8 | 9 | max_callback_time = 10 10 | 11 | 12 | class XiaomiSpider(scrapy.Spider): 13 | name = "Xiaomi Spider" 14 | 15 | def start_requests(self): 16 | start_url = "http://app.mi.com/" 17 | yield scrapy.Request(start_url, callback=self.parse_diff_category) 18 | 19 | def parse_diff_category(self, response: scrapy.http.Response): 20 | """ 21 | parse different categories. 22 | """ 23 | category_urls = response.css(".sidebar ul.category-list li a::attr(href)").getall() 24 | 25 | for category_url in category_urls: 26 | category_id = os.path.basename(category_url) 27 | json_url = "http://app.mi.com/categotyAllListApi?page=0&categoryId={}".format(category_id) 28 | yield scrapy.Request(json_url, callback=self.parse_category_list) 29 | 30 | def parse_category_list(self, response: scrapy.http.Response): 31 | """ 32 | parse category list. 
33 | """ 34 | try: 35 | json_data = json.loads(response.text)['data'] 36 | except json.decoder.JSONDecodeError as _err: 37 | callback_time = response.meta.get('callback_time') or 0 38 | if callback_time < 3: 39 | self.logger.warning("Spider crawl {} failed.".format(response.url)) 40 | yield scrapy.Request(response.url, callback=self.parse_category_list, dont_filter=True, meta={"callback_time": callback_time + 1}) 41 | else: 42 | raise _err 43 | return 44 | 45 | if len(json_data) == 0: 46 | return 47 | 48 | # parse app 49 | for app_data in json_data: 50 | package_name = app_data['packageName'] 51 | app_url = "http://app.mi.com/details?id={}".format(package_name) 52 | yield scrapy.Request(app_url, callback=self.parse) 53 | 54 | # parse next page 55 | request_url = response.url 56 | url_data = parse.parse_qs(parse.urlparse(request_url).query) 57 | page_id = int(url_data['page'][0]) + 1 58 | category_id = url_data['categoryId'][0] 59 | new_url = "http://app.mi.com/categotyAllListApi?page={}&categoryId={}".format(page_id, category_id) 60 | yield scrapy.Request(new_url, callback=self.parse_category_list) 61 | 62 | def parse(self, response: scrapy.http.Response, **kwargs): 63 | """ 64 | parse app detail. 
65 | """ 66 | # app title 67 | app_title = response.css("div.container div.app-intro div.app-info div.intro-titles h3::text").get() 68 | if not app_title: 69 | callback_time = response.meta.get('callback_time') or 0 70 | if callback_time < 3: 71 | self.logger.warning("Spider crawl {} failed.".format(response.url)) 72 | yield scrapy.Request(response.url, callback=self.parse, dont_filter=True, meta={"callback_time": callback_time + 1}) 73 | return 74 | else: 75 | raise ValueError("App Title Error!") 76 | app_title = app_title.strip() 77 | 78 | # get the information 79 | left_information = response.css("div.container div.float-left div:nth-child(2)::text").getall() 80 | right_information = response.css("div.container div.float-right div:nth-child(2)::text").getall() 81 | if left_information is None or len(left_information) != 4: 82 | raise ValueError("Get Left Information Error!") 83 | if right_information is None or len(right_information) != 4: 84 | raise ValueError("Get Right Information Error!") 85 | 86 | # apk name 87 | apk_name = left_information[3].strip() 88 | 89 | # update date 90 | update_date = left_information[2].strip() 91 | 92 | # introduction 93 | try: 94 | introduction = "\n".join(response.css("div.app-text p.pslide::text").getall()) 95 | except IndexError: 96 | raise ValueError("App Introduction Error!") 97 | 98 | # link 99 | app_link = response.url 100 | 101 | # category 102 | category = response.css("div.container div.app-intro div.app-info div.intro-titles p.special-font::text").get() 103 | if not category: 104 | raise ValueError("Category Error!") 105 | category = category.strip() 106 | 107 | # developer 108 | developer = right_information[1].strip() 109 | 110 | # market 111 | market = "xiaomi" 112 | 113 | # version 114 | version = left_information[1].strip() 115 | 116 | # size 117 | size = left_information[0].strip() 118 | 119 | # pictures 120 | picture_urls = response.css("div.bigimg-scroll div.img-list img::attr(src)").getall() 121 | 122 | # 
download link 123 | download_link = response.css("div.app-info-down a::attr(href)").get() 124 | if not download_link: 125 | raise ValueError("Download Link Error!") 126 | download_link = response.urljoin(download_link) 127 | 128 | # yield app detail 129 | app_detail = items.AppDetail( 130 | app_title=app_title, apk_name=apk_name, description=introduction, app_link=app_link, category=category, market=market, version=version, picture_links=picture_urls, size=size, download_link=download_link, developer=developer, update_date=update_date 131 | ) 132 | yield app_detail 133 | 134 | # parse the related app 135 | related_urls = response.css("div.second-imgbox li>a::attr(href)").getall() 136 | for related_url in related_urls: 137 | new_url = response.urljoin(related_url) 138 | yield scrapy.Request(new_url, callback=self.parse) 139 | -------------------------------------------------------------------------------- /codes/apks/settings.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | # Scrapy settings for apks project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # https://docs.scrapy.org/en/latest/topics/settings.html 8 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 9 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 10 | import os 11 | 12 | BOT_NAME = 'apks' 13 | 14 | SPIDER_MODULES = ['apks.spiders'] 15 | NEWSPIDER_MODULE = 'apks.spiders' 16 | 17 | # Obey robots.txt rules 18 | ROBOTSTXT_OBEY = False 19 | 20 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 21 | CONCURRENT_REQUESTS = 16 22 | 23 | # Configure a delay for requests for the same website (default: 0) 24 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay 25 | # See also autothrottle settings and docs 26 | DOWNLOAD_DELAY = 3 27 | DOWNLOAD_MAXSIZE = 0 # disable it 28 | DOWNLOAD_WARNSIZE = 1073741824 # 1024 MB 29 | # The download delay setting will honor only one of: 30 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16 31 | # CONCURRENT_REQUESTS_PER_IP = 16 32 | DOWNLOAD_TIMEOUT = 360 33 | 34 | # Disable cookies (enabled by default) 35 | COOKIES_ENABLED = True 36 | 37 | # Disable Telnet Console (enabled by default) 38 | # TELNETCONSOLE_ENABLED = False 39 | 40 | # Override the default request headers: 41 | DEFAULT_REQUEST_HEADERS = { 42 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 43 | 'Accept-Language': 'zh-CN,zh;q=0.9', 44 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36" 45 | } 46 | 47 | # Enable or disable spider middlewares 48 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html 49 | # SPIDER_MIDDLEWARES = { 50 | # 'apks.middlewares.ApksSpiderMiddleware': 543, 51 | # } 52 | REFERRER_POLICY = "scrapy-default" 53 | 54 | # Enable or disable downloader middlewares 55 | # See 
https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 56 | DOWNLOADER_MIDDLEWARES = { 57 | 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None, 58 | 'apks.middlewares.UserAgentMiddleware': 100 59 | } 60 | USER_AGENT_LIST = [ 61 | 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36' 62 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", 63 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", 64 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)", 65 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 66 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 67 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3", 68 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24", 69 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24", 70 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36" 71 | ] 72 | 73 | # Enable or disable extensions 74 | # See https://docs.scrapy.org/en/latest/topics/extensions.html 75 | EXTENSIONS = { 76 | 'scrapy.extensions.telnet.TelnetConsole': None, 77 | } 78 | 79 | # Configure item pipelines 80 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html 81 | ITEM_PIPELINES = { 82 | 'apks.pipelines.page_content_pipeline.ContentPipeline': 100, 83 | 'apks.pipelines.apk_download_pipeline.ApkDownloadPipeline': 300, 84 | 'apks.pipelines.image_download_pipeline.ImageDownloadPipeline': 500 85 | } 86 | 87 | FILES_STORE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../../data") 88 | IMAGES_STORE = 
os.path.join(os.path.dirname(os.path.abspath(__file__)), "../../data") 89 | 90 | # Enable and configure the AutoThrottle extension (disabled by default) 91 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html 92 | # AUTOTHROTTLE_ENABLED = True 93 | # The initial download delay 94 | # AUTOTHROTTLE_START_DELAY = 5 95 | # The maximum download delay to be set in case of high latencies 96 | # AUTOTHROTTLE_MAX_DELAY = 60 97 | # The average number of requests Scrapy should be sending in parallel to 98 | # each remote server 99 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 100 | # Enable showing throttling stats for every response received: 101 | # AUTOTHROTTLE_DEBUG = False 102 | 103 | # Enable and configure HTTP caching (disabled by default) 104 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 105 | # HTTPCACHE_ENABLED = True 106 | # HTTPCACHE_EXPIRATION_SECS = 0 107 | # HTTPCACHE_DIR = 'httpcache' 108 | # HTTPCACHE_IGNORE_HTTP_CODES = [] 109 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 110 | 111 | RETRY_ENABLED = False 112 | FEED_EXPORT_ENCODING = 'utf-8' 113 | 114 | # Log setting 115 | LOG_ENABLED = True 116 | LOG_LEVEL = 'DEBUG' 117 | LOG_STDOUT = True 118 | LOG_ENCODING = 'utf-8' 119 | 120 | # File downloader setting 121 | MEDIA_ALLOW_REDIRECTS = True 122 | 123 | # whether to download the previous failed apk 124 | DOWNLOAD_PRE_FAIL = True 125 | 126 | # Database setting 127 | DB_TYPE = "mysql" 128 | DB_HOST = "localhost" 129 | DB_DATABASE = "apk_merge" 130 | DB_USER = "root" 131 | DB_PASSWORD = "123456" 132 | DB_CHARSET = "utf8" 133 | DB_PROT = 3306 134 | 135 | # Default Values 136 | DEFAULT_SIZE = "UNKNOWN_SIZE" 137 | DEFAULT_DEVELOPER = "UNKNOWN_DEVELOPER" 138 | DEFAULT_ARCHITECTURE = "UNKNOWN_ARCHITECTURE" 139 | DEFAULT_CATEGORY = "UNKNOWN_CATEGORY" 140 | DEFAULT_VERSION = "UNKNOWN_VERSION" 141 | DEFAULT_MARKET = "custom" 142 | 143 | # set for running environment 144 
| USING_PROXY = False 145 | PROXY_PATH = "http://127.0.0.1:10809" 146 | 147 | # gui 148 | python_interface = "python" 149 | 150 | # opensource github crawler 151 | crawler_key_words = ["apk", 'android'] 152 | 153 | ProgressBarStyleSheet = ''' 154 | /*设置红色进度条*/ 155 | #RedProgressBar { 156 | text-align: center; /*进度值居中*/ 157 | } 158 | #RedProgressBar::chunk { 159 | background-color: #F44336; 160 | } 161 | 162 | #BlueProgressBar { 163 | text-align: center; /*进度值居中*/ 164 | } 165 | #BlueProgressBar::chunk { 166 | background-color: #2196F3; 167 | } 168 | ''' -------------------------------------------------------------------------------- /documents/使用手册.md: -------------------------------------------------------------------------------- 1 | # 使用手册 2 | 3 | [TOC] 4 | 5 | ## 爬虫安装 6 | 7 | ### `python `安装 8 | 9 | - 注意 :warning::如果 `apt` 下载包比较慢的话,可以考虑修改相应的镜像源,`ubuntu 20.04` 参考博客 https://blog.csdn.net/yscoder/article/details/110681828 进行修改; 10 | 11 | - 注意 :warning::新版本的 `Ubuntu` 中已经自带有 `python3`; 12 | 13 | - 安装 `python3` :具体 `python3` 的版本是 `3.6 ~ 3.8` 均可; 14 | 15 | 具体的安装过程不再赘述,详情可以参考博客 https://my.oschina.net/randomobject/blog/4300469 ,**优先**采用方式一 ,不行再采用方式二; 16 | 17 | 安装完成后在 `shell` 中测试是否安装成功:(以 `python3.8` 为例) 18 | 19 | ```shell 20 | python -V 21 | python3.8 -V #or 22 | python3 -V #or 23 | ``` 24 | 25 | - 常见问题 :warning::​切记,切记,切记,不要删除 `ubuntu` 自带的 `python`,如果不小心删除,不要重启,立即上网搜索解决办法; 26 | 27 | - 修改系统模型 `python3` 指令: 28 | 29 | 如何将系统默认的 `python3` 指向 `python3.8`?参考链接 https://blog.csdn.net/maizousidemao/article/details/102810681 ,**优先**采用方法一; 30 | 31 | - 常见问题 :warning: :下面介绍如何使用是直接默认使用 `python` ; 32 | 33 | 34 | 35 | ### 数据库安装 36 | 37 | - 注意 :warning::**这里先说一下后面配置的要点** 38 | 39 | - 数据库 `root` 密码:务必记住这个密码,记不住就用 `123456`,这样后面程序里也不需要修改配置; 40 | 41 | - 安装 `mysql`: 42 | 43 | 具体的安装过程不再赘述,具体参考博客 https://blog.csdn.net/liang19890820/article/details/105071479 ,一直配置到 `配置远程访问` 之前; 44 | 45 | 安装完成后在 `shell` 中测试安装是否成功:(**不要**在 `root` 权限下运行下面的指令,以普通用户执行,如果你平常执行代码时就是 `root` 用户,那也可以不用在意) 46 | 47 | 
   ```shell
   $ mysql -u root -p
   Enter password: 123456
   ```

   如果无法登录,那么很可能是你前面配置的时候出了问题;

- 初始化数据库:

  ```shell
  $ mysql -u root -p
  Enter password: 123456
  mysql> show databases; # 查看数据库
  mysql> create database apk_merge; # 创建apk_merge数据库
  mysql> show databases; # 查看数据库
  # 自行替换路径
  mysql> source /home///documents/apk_merge.sql # 创建数据库表和存储过程,后面的路径修改成你本地的apk_merge.sql文件
  ```



### 依赖包安装

- 进入目录:(手册中的路径名为测试时的路径名称,实际配置过程中请与实际的爬虫路径相匹配)

  ```shell
  cd ~/workspace/APKCrawler/codes
  ```

- 注意:warning: :为了不影响 `python` 的正常工作,你可以创建一个 `python` 的虚拟环境,具体的操作流程参考博客 https://zhuanlan.zhihu.com/p/60647332 ,两种方法都非常实用,看个人喜好;

- 注意:warning: :`pip` 默认用的是国外的镜像源,国内访问可能网速很慢,可以设置一个国内的镜像源,具体的操作流程参考博客 https://zhuanlan.zhihu.com/p/109939711 ;

- 安装 `python` 依赖包:

  ```shell
  pip install -r requirement.txt
  ```

- 安装 `crontab` 依赖包:

  具体操作参考博客 https://blog.csdn.net/longgeaisisi/article/details/90477975 .

- `pyqt5` 依赖包安装:

  执行下面的脚本时,可能会出现如下错误:

  ```shell
  qt.qpa.plugin: Could not load the Qt platform plugin "xcb" in "" even though it was found.
  This application failed to start because no Qt platform plugin could be initialized. Reinstalling the application may fix this problem.

  Available platform plugins are: eglfs, linuxfb, minimal, minimalegl, offscreen, vnc, wayland-egl, wayland, wayland-xcomposite-egl, wayland-xcomposite-glx, webgl, xcb.
99 | 100 | Aborted (core dumped) 101 | ``` 102 | 103 | 为了解决这个问题,执行指令: 104 | 105 | ```shell 106 | sudo apt-get install libxcb-xinerama0 107 | ``` 108 | 109 | 如果安装了该辅助包后,`pyqt5` 仍无法正常运行,则参考博客 https://blog.csdn.net/zhanghm1995/article/details/106474505 来具体查看缺少的依赖包; 110 | 111 | 112 | 113 | ## 爬虫使用 114 | 115 | ### 远程桌面 116 | 117 | 为了能够在局域网(服务器不联网的情况下),远程桌面连接到服务器,针对 `ubuntu 20.04` 可以参考博客 https://zhuanlan.zhihu.com/p/345738274 对服务器进行配置,配置完成后,可以通过 `windows` 自带的远程桌面连接到服务器; 118 | 119 | 120 | 121 | ### 更改文件路径 122 | 123 | #### 方法一 124 | 125 | 由于多数情况下爬虫需要和 **安装包分析脚本** 搭配使用,所以需要保证两个脚本之间的文件保存路径一致,故这里给出一种更加直接的修改方式:`文件软链接` 126 | 127 | #### 方法二 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | ### 参数配置 136 | 137 | > 使用爬虫前,还有一些不得不做的配置 138 | 139 | - 爬虫并发数:默认为 `16` ,配置位置 `settings.py` 文件 `CONCURRENT_REQUESTS` 参数; 140 | - 下载延迟时间:默认为 `3` 秒,配置位置 `settings.py` 文件 `DOWNLOAD_DELAY` 参数; 141 | - 下载保存位置:默认为 `data` 文件夹(没有特殊情况,无需修改),配置位置 `settings.py` 文件 `FILES_STORE` 和 `IMAGES_STORE` 参数; 142 | - 爬虫日志:默认开启 `DEBUG` 级别的日志,同时在命令行中输出,配置位置 `settings.py` 文件 `LOG_ENABLED` 、 `LOG_LEVEL` 和 `LOG_STDOUT` 参数; 143 | - 数据库设置:默认为 `root@localhost` 的 `apk_merge` 库,端口为 `3306`,用户名为 `root` ,密码为 `123456` ,配置位置 `settings.py` 文件 `DB_*` 参数; 144 | - 代理配置:默认**不**启用 `http://127.0.0.1:10809` 的代理(:warning: 是否启动代理还可以在启动爬虫的时候进行配置,这个代理需要你额外已经配置了第三方的代理服务),配置位置 `settings.py` 文件 `USING_PROXY` 和 `PROXY_PATH` 参数; 145 | - `Python` 接口:默认为 `python` (:warning: 这里需要保证和你命令行里使用的 `python` 指令相同,如果你用了虚拟环境,这个就不用修改;如果你用的是 `跑python3` ,那么这里修改为 `python3`,以此类推),配置位置 `settings.py` 文件 `python_interface` 参数; 146 | - 开源代码爬虫关键字:默认为 `["apk", 'android']`(爬虫 `github_opensource` 以这些关键字在网页上检索),配置位置 `settings.py` 文件 `crawler_key_words` 参数; 147 | 148 | 149 | 150 | ### 爬虫 Crawler GUI 151 | 152 | #### 命令行启动 153 | 154 | 服务器中,我们通常会在桌面创建一个 `桌面快捷启动程序`,如果你要在命令行启动相应的 `GUI` 界面,你可以在**工程目录下**运行如下指令: 155 | 156 | ```shell 157 | python crawler_gui.py 158 | ``` 159 | 160 | 启动后的界面如下所示:这里将根据标号的顺序进行说明(文档版本可能会落后于实际的爬虫脚本版本,但整体上的功能都是相似的) 161 | 162 | image-20210617174018715 163 | 164 
| #### 爬虫 165 | 166 | - 启动:在 ① 中,选择目标爬虫,点击按钮 ② `Start` 即可启动爬虫,且按钮 ② 将转换成 `Stop` 按钮,下图展示启动后的界面截图; 167 | 168 | image-20210617174458840 169 | 170 | - 停止:若要停止,点击按钮 ② `Stop` 即可关闭爬虫; 171 | - 日志:默认情况下,爬虫的日志文件均在 `./log` 文件夹下,并且在运行时会展示在日志框 ③ 中; 172 | - 注意 :warning::如果你执行完 **‘启动’** 步骤后,日志框 ③ 中没有任何打印信息,说明爬虫没有被正常启动,最大的可能是 “参数配置”——“`Python` 接口” 没有正确配置; 173 | 174 | #### 定时任务 175 | 176 | > 由于定时任务需要依赖于 `Linux` 平台的 Cron 包,所以在 `Windows` 上该功能不能使用,同时,该文档希望你在设置定时人五千,对 `crontab` 有所了解; 177 | 178 | - 添加定时任务:点击按钮 ④ `Add Timer` ,输入相应的参数后,即可添加一个定时任务; 179 | - 删除定时任务:选中相应的定时任务后, 点击按钮 ⑤ `Delete Timer` 即可删除相应的定时任务; 180 | - 显示定时任务:已经设置的定时任务会在表格框 ⑥ 中显示; 181 | - 额外操作:可以通过**命令行** `crontab -e` 来编辑定时任务; 182 | 183 | #### 桌面启动 184 | 185 | 桌面启动时非常简单,直接双击桌面图标即可,至于如何制作一个有效的桌面快捷启动方式,参考博客 https://www.jianshu.com/p/c3486d0a91e4; 186 | 187 | 188 | 189 | ### 数据 Data GUI 190 | 191 | #### 命令行启动 192 | 193 | 在工程文件夹下执行如下命令: 194 | 195 | ```shell 196 | python data_gui.py 197 | ``` 198 | 199 | 启动后的界面如下所示: 200 | 201 | ![image-20210617212440603](pictures/image-20210617212251905.png) 202 | 203 | #### 搜索 204 | 205 | - 在 ① 中选择筛选的条件,点击 `Search` ② 按钮后,即可查询相应的 APK 文件,显示在 ④ 中; 206 | - 在 ⑥ 中你还可以将一个 APK 文件直接拖到 **信息栏**(最右侧矩形框)中查看相应的 APK 详情; 207 | - 下面展示一组检索后的结果: 208 | 209 | ![image-20210617212601123](pictures/image-20210617212601123.png) 210 | 211 | #### 导入本地的 APK 212 | 213 | - 点击 ③ 中的按钮,选择相应的文件夹,即可导入本地的 APK; 214 | 215 | 216 | #### 删除 APK 217 | 218 | - 单个删除:点击 ⑤ 中的 `Delete APK` 即可删除 **信息栏**(最右侧矩形框) 中的 APK; 219 | - 批量删除:点击 ⑤ 中的 `Delete APKs From Folder` 即可将整个文件夹中的 APK 从数据库中删除; 220 | 221 | 222 | 223 | ### 增量导出导入 224 | 225 | > 首先需要说明的是,这里实现增量导出导入的前提是,数据库 A,B,C 他们的基础 APK 信息都是相同的,即 `app` 等表中的信息是相同的。因此,为保证这个前提,**必须保证**只有一个数据库参与到了爬虫的运行,其他两个库只用于安装包的分析等工作,否则会发生数据库信息错乱; 226 | 227 | #### 重置批量导出状态 228 | 229 | 一般情况下不需要执行该指令,只有当拷贝出现问题的时候,可以将批量导出的状态恢复为前一次的状态; 230 | 231 | ```shell 232 | python copy_data_incrementally.py ../../data/ # 将拷贝状态重置为目标文件的状态 233 | ``` 234 | 235 | #### 增量导出 236 | 237 | 执行脚本如下: 238 | 239 | ```shell 240 | python 
copy_data_incrementally.py # 将文件夹和数据库数据拷贝到目标文件夹 241 | ``` 242 | 243 | #### 增量导入 244 | 245 | ```shell 246 | python copy_data_incrementally.py # 将文件夹和数据库数据从目标文件夹拷贝到data文件夹中 247 | ``` 248 | 249 | -------------------------------------------------------------------------------- /codes/apks/copy_data_incrementally.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import argparse 3 | import glob 4 | import json 5 | import logging 6 | import os 7 | import shutil 8 | import time 9 | from subprocess import Popen 10 | 11 | from database import Database 12 | from settings import DB_DATABASE 13 | 14 | cur_folder = os.path.dirname(os.path.abspath(__file__)) 15 | data_folder = os.path.join(cur_folder, "../../data") 16 | copy_status_file = os.path.join(cur_folder, "../../data/copy_status.json") 17 | 18 | 19 | def restore_copy_status(pre_copy_status_file: str = ""): 20 | if not pre_copy_status_file: # reset the copy status 21 | with open(copy_status_file, 'w') as _file_: 22 | json.dump({}, _file_) 23 | else: 24 | if not os.path.exists(pre_copy_status_file): 25 | raise FileNotFoundError("File {} not found.".format(pre_copy_status_file)) 26 | try: 27 | with open(copy_status_file, "r") as _file_: 28 | json.load(_file_) 29 | except Exception: 30 | raise ValueError("File is not a valid file.".format(pre_copy_status_file)) 31 | else: 32 | shutil.copyfile(pre_copy_status_file, copy_status_file) 33 | 34 | logging.info("Restoring the copy status Success.") 35 | 36 | 37 | def export_data(target_root_folder: str): 38 | assert target_root_folder is not None 39 | os.makedirs(target_root_folder, exist_ok=True) 40 | 41 | if os.path.exists(copy_status_file): 42 | with open(copy_status_file, 'r') as _file_: 43 | copy_status = json.load(_file_) 44 | else: 45 | copy_status = {} 46 | 47 | version_copy_folder_num = 0 48 | for platform_folder in glob.glob(os.path.join(data_folder, "*")): # platform 49 | if not os.path.isdir(platform_folder): 50 | 
continue 51 | platform_name = os.path.basename(platform_folder) 52 | platform_copy_status = copy_status.get(platform_name) 53 | if platform_copy_status is None: 54 | platform_copy_status = {} 55 | 56 | for apk_folder in glob.glob(os.path.join(platform_folder, "*")): # apk 57 | if not os.path.isdir(apk_folder): 58 | continue 59 | apk_name = os.path.basename(apk_folder) 60 | apk_copy_status = platform_copy_status.get(apk_name) 61 | if apk_copy_status is None: 62 | apk_copy_status = {} 63 | 64 | for version_folder in glob.glob(os.path.join(apk_folder, "*")): # version 65 | if not os.path.isdir(version_folder): 66 | continue 67 | version_name = os.path.basename(version_folder) 68 | version_copy_status = platform_copy_status.get(version_name) 69 | if version_copy_status is None: 70 | version_copy_status = {} 71 | 72 | last_modify_time = os.path.getmtime(version_folder) 73 | for t_file in glob.glob(os.path.join(version_folder, "*")): # file in version folder 74 | last_modify_time = max(last_modify_time, os.path.getmtime(t_file)) 75 | 76 | p_last_modify_time = version_copy_status.get("last_modify_time") 77 | if p_last_modify_time is None: 78 | p_last_modify_time = -1 79 | 80 | if last_modify_time > p_last_modify_time: # need to copy 81 | last_copy_time = int(time.time()) 82 | target_version_folder = os.path.join(target_root_folder, platform_name, apk_name, version_name) 83 | shutil.copytree(version_folder, target_version_folder) 84 | version_copy_folder_num += 1 85 | 86 | # update status 87 | version_copy_status.update({ 88 | "last_copy_time": last_copy_time, 89 | "last_modify_time": last_modify_time 90 | }) 91 | apk_copy_status.update({ 92 | version_name: version_copy_status 93 | }) 94 | platform_copy_status.update({ 95 | apk_name: apk_copy_status 96 | }) 97 | copy_status.update({ 98 | platform_name: platform_copy_status 99 | }) 100 | 101 | # write the copy status 102 | with open(copy_status_file, 'w') as _file_: 103 | json.dump(copy_status, _file_) 104 | 105 | 
logging.info("Copy the apk data to folder {} Success. Total copy {} folders.".format(target_root_folder, version_copy_folder_num)) 106 | 107 | 108 | def export_database(target_root_folder: str): 109 | assert target_root_folder is not None 110 | os.makedirs(target_root_folder, exist_ok=True) 111 | 112 | # 直接导出数据库中的全部信息 113 | target_sql_file = os.path.join( 114 | os.path.abspath(target_root_folder), "apk_merge.sql" 115 | ) 116 | with open(target_sql_file, 'w') as _file_: 117 | Popen(["mysqldump", "-uroot", "-p123456", DB_DATABASE], stdout=_file_) 118 | 119 | 120 | def import_data(target_root_folder: str): 121 | for market_folder in glob.glob(os.path.join(target_root_folder, "*")): 122 | if os.path.isdir(market_folder): 123 | market_name = os.path.basename(market_folder) 124 | dst_folder = os.path.join(data_folder, market_name) 125 | shutil.copytree(market_folder, dst_folder) 126 | logging.info("Import data from Folder {} Success.".format(target_root_folder)) 127 | 128 | 129 | def import_database(target_root_folder: str): 130 | if not os.path.exists(os.path.join(target_root_folder, "apk_merge.sql")): 131 | raise FileNotFoundError("File apk_merge.sql is not exist in Folder {}.".format(target_root_folder)) 132 | """ 133 | 首先把本地数据库中的额外数据导出 134 | """ 135 | # 权限表, 权限关系表 136 | target_sql_file = os.path.join( 137 | os.path.abspath(target_root_folder), "local.sql" 138 | ) 139 | with open(target_sql_file, 'w') as _file_: 140 | Popen(["mysqldump", "-uroot", "-p123456", DB_DATABASE, "authority", "authority_relation"], stdout=_file_) 141 | 142 | # update表 143 | db = Database() 144 | update_info_json = db.get_local_update_info() 145 | target_json_file = os.path.join( 146 | os.path.abspath(target_root_folder), "local.json" 147 | ) 148 | with open(update_info_json, 'w') as _file_: 149 | json.dump(target_json_file, _file_) 150 | logging.info("Export update info to File {} Success.".format(target_json_file)) 151 | 152 | """ 153 | 然后将外部的sql脚本运行,覆盖本地数据 154 | """ 155 | apk_merge_sql_file 
= os.path.join( 156 | os.path.abspath(target_root_folder), "apk_merge.sql" 157 | ) 158 | with open(apk_merge_sql_file, 'r') as _file_: 159 | line = _file_.readline() 160 | while line: 161 | db.execute(line) 162 | line = _file_.readline() 163 | 164 | """ 165 | 将本地语句库中的额外数据再导入 166 | """ 167 | # 权限表,权限关系表 168 | with open(target_sql_file, 'r') as _file_: 169 | line = _file_.readline() 170 | while line: 171 | db.execute(line) 172 | line = _file_.readline() 173 | 174 | # update表 175 | with open(target_json_file, 'r') as _file_: 176 | update_info = json.load(_file_) 177 | db.insert_local_update_info(update_info) 178 | 179 | logging.info("Import Done!") 180 | 181 | 182 | if __name__ == '__main__': 183 | parser = argparse.ArgumentParser("Script for copying data incrementally.") 184 | parser.add_argument("target_path", type=str, help="Export data to which folder or Import data from which folder or restore copy status from which file.") 185 | parser.add_argument("--import_mode", default=False, type=bool, help="Whether on import mode. 
Default is False.") 186 | parser.add_argument("--restore_copy_mode", default=False, type=bool, help="If True, Script will restore the copy status from target_path(a pre-status json file) instead of import or export the data.") 187 | args = parser.parse_args() 188 | 189 | if args.restore_copy_mode: 190 | restore_copy_status(args.target_path) 191 | exit(0) 192 | 193 | if args.import_mode: 194 | import_data(args.target_path) 195 | import_database(args.target_path) 196 | else: 197 | export_data(args.target_path) 198 | export_database(args.target_path) 199 | -------------------------------------------------------------------------------- /codes/apks/crawler_gui.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import platform 3 | import sys 4 | import datetime 5 | from PyQt5.QtWidgets import * 6 | from PyQt5.QtGui import * 7 | from crontab import CronTab, CronItem 8 | from PyQt5 import QtGui 9 | from settings import ProgressBarStyleSheet 10 | from ui_thread import * 11 | from custom_ui import TimerGUI 12 | 13 | BUTTON_HEIGHT = 25 14 | 15 | __current_folder_path__ = os.path.dirname(os.path.abspath(__file__)) 16 | 17 | 18 | class CrawlerGUI(QWidget): 19 | # Main GUI for crawler module. 
20 | 21 | def __init__(self): 22 | super(CrawlerGUI, self).__init__() 23 | self.layout_init() 24 | 25 | def layout_init(self): 26 | # 整体大小 27 | height = 800 28 | width = 700 29 | self.setFixedSize(width, height) 30 | self.setWindowTitle("Crawler GUI") 31 | 32 | # 设置字体 33 | self.setFont(QFont("Microsoft YaHei", 8.5)) 34 | 35 | # set layout 36 | root_layout = QVBoxLayout() 37 | crawler_widget = QGroupBox(title="APK Crawler") 38 | util_widget = QGroupBox(title="Crawler Util") 39 | root_layout.addWidget(crawler_widget) 40 | root_layout.addWidget(util_widget) 41 | root_layout.setStretch(0, 3) 42 | root_layout.setStretch(1, 2) 43 | 44 | # save the widget value 45 | self.crawler_widget = crawler_widget 46 | self.util_widget = util_widget 47 | 48 | # init the sub layout 49 | self.util_widget_init() 50 | self.crawler_widget_init() 51 | 52 | # show 53 | self.setLayout(root_layout) 54 | 55 | def crawler_widget_init(self): 56 | # set layout 57 | crawler_layout = QVBoxLayout() 58 | crawler_top_layout = QHBoxLayout() 59 | crawler_layout.addLayout(crawler_top_layout) 60 | crawler_log_text = QTextBrowser() 61 | crawler_layout.addWidget(crawler_log_text) 62 | self.crawler_widget.setLayout(crawler_layout) 63 | 64 | # save the value 65 | self.crawler_layout = crawler_layout 66 | self.crawler_top_layout = crawler_top_layout 67 | self.crawler_log_text = crawler_log_text 68 | 69 | # init the sub layout 70 | self.crawler_top_layout_init() 71 | 72 | def crawler_top_layout_init(self): 73 | # set layout 74 | crawler_combobox = QComboBox() 75 | crawler_combobox.setFixedWidth(150) 76 | crawler_combobox.setFixedHeight(BUTTON_HEIGHT) 77 | start_crawl_button = QPushButton("Start") 78 | start_crawl_button.setFixedHeight(BUTTON_HEIGHT) 79 | stop_crawl_button = QPushButton("Stop") 80 | stop_crawl_button.setFixedHeight(BUTTON_HEIGHT) 81 | stop_crawl_button.setVisible(False) 82 | self.crawler_top_layout.addWidget(crawler_combobox) 83 | self.crawler_top_layout.addStretch() 84 | 
self.crawler_top_layout.addWidget(start_crawl_button) 85 | self.crawler_top_layout.addWidget(stop_crawl_button) 86 | 87 | # save the value 88 | self.crawler_combobox = crawler_combobox 89 | self.start_crawl_button = start_crawl_button 90 | self.stop_crawl_button = stop_crawl_button 91 | 92 | def util_widget_init(self): 93 | # set layout 94 | util_layout = QVBoxLayout() 95 | util_button_layout = QHBoxLayout() 96 | timer_table_widget = QTableWidget() 97 | util_layout.addLayout(util_button_layout) 98 | util_layout.addWidget(timer_table_widget) 99 | util_layout.addStretch() 100 | self.util_widget.setLayout(util_layout) 101 | 102 | # save the value 103 | self.util_layout = util_layout 104 | self.util_button_layout = util_button_layout 105 | self.timer_table_widget = timer_table_widget 106 | 107 | # init the sub layout 108 | self.util_button_layout_init() 109 | self.timer_table_widget_init() 110 | 111 | def util_button_layout_init(self): 112 | # set layout 113 | add_timer_button = QPushButton("Add Timer") 114 | add_timer_button.setFixedHeight(BUTTON_HEIGHT) 115 | delete_timer_button = QPushButton("Delete Timer") 116 | delete_timer_button.setFixedHeight(BUTTON_HEIGHT) 117 | self.util_button_layout.addStretch() 118 | self.util_button_layout.addWidget(add_timer_button) 119 | self.util_button_layout.addWidget(delete_timer_button) 120 | 121 | # save the value 122 | self.add_timer_button = add_timer_button 123 | self.delete_timer_button = delete_timer_button 124 | 125 | def timer_table_widget_init(self): 126 | # set layout 127 | self.timer_table_widget.setColumnCount(5) 128 | self.timer_table_widget.setHorizontalHeaderLabels(['Month', "Day", "Hour", "Minute", "Week Day", "Crawler"]) 129 | self.timer_table_widget.setSelectionBehavior(QAbstractItemView.SelectRows) 130 | self.timer_table_widget.setEditTriggers(QAbstractItemView.NoEditTriggers) 131 | self.timer_table_widget.horizontalHeader().setSectionResizeMode(QHeaderView.Stretch) 132 | 
self.timer_table_widget.horizontalHeader().setSectionsClickable(False) 133 | 134 | def add_apk_layout_init(self): 135 | # set layout 136 | add_apk_button = QPushButton(QIcon(os.path.join(__current_folder_path__, "./images/folder_import.png")), "Import APK From Folder") 137 | add_apk_button.setFixedHeight(BUTTON_HEIGHT) 138 | add_apk_progress_bar = QProgressBar() 139 | add_apk_progress_bar.setObjectName("BlueProgressBar") 140 | add_apk_progress_bar.setStyleSheet(ProgressBarStyleSheet) 141 | add_apk_progress_bar.setVisible(False) 142 | self.add_apk_layout.addWidget(add_apk_button) 143 | self.add_apk_layout.addWidget(add_apk_progress_bar) 144 | 145 | # save the value 146 | self.add_apk_button = add_apk_button 147 | self.add_apk_progress_bar = add_apk_progress_bar 148 | 149 | 150 | class CrawlerProcess(CrawlerGUI): 151 | scrapy_log_signal = pyqtSignal(str) 152 | scrapy_start_signal = pyqtSignal() 153 | scrapy_finish_signal = pyqtSignal() 154 | error_signal = pyqtSignal(str) 155 | crawler_list = ["fossdroid", "xiaomi", "apkpure", "github", "github_opensource"] 156 | timer_list = [] 157 | timer_model = None 158 | user_crontab = None 159 | 160 | def __init__(self): 161 | super(CrawlerProcess, self).__init__() 162 | self.thread_pool = QThreadPool() 163 | self.thread_pool.globalInstance() 164 | self.thread_pool.setMaxThreadCount(8) 165 | 166 | self.bind_error() 167 | self.load_data() 168 | self.bind_scrapy() 169 | self.bind_timer() 170 | self.check_value() 171 | 172 | """ 173 | 加载ComboBox数据 174 | """ 175 | 176 | def load_data(self): 177 | # crawler combobox 178 | self.crawler_combobox.addItems(self.crawler_list) 179 | 180 | """ 181 | 设置scrapy 182 | """ 183 | 184 | def bind_scrapy(self): 185 | self.scrapy_worker = ScrapyWorker(self) 186 | self.scrapy_start_signal.connect(self.crawler_log_text.clear) 187 | self.scrapy_finish_signal.connect(self.scrapy_finish) 188 | self.scrapy_log_signal.connect(self.parse_log) 189 | self.start_crawl_button.clicked.connect(self.start_scrapy) 
190 | self.stop_crawl_button.clicked.connect(self.stop_scrapy) 191 | 192 | def parse_log(self, text): 193 | pre_cursor = self.crawler_log_text.textCursor() 194 | self.crawler_log_text.moveCursor(QtGui.QTextCursor.End) 195 | self.crawler_log_text.insertPlainText(text) 196 | if self._keep_log_end_: 197 | pre_cursor.movePosition(QtGui.QTextCursor.End) 198 | self.crawler_log_text.setTextCursor(pre_cursor) 199 | 200 | def start_scrapy(self): 201 | if not self.check_value(): 202 | return 203 | 204 | platform = self.crawler_combobox.currentText() 205 | self.start_crawl_button.setVisible(False) 206 | self.stop_crawl_button.setVisible(True) 207 | self.scrapy_worker.run(platform) 208 | self._keep_log_end_ = True 209 | 210 | def stop_scrapy(self): 211 | self.scrapy_worker.stop() 212 | self.stop_crawl_button.setVisible(False) 213 | self.start_crawl_button.setVisible(True) 214 | 215 | def scrapy_finish(self): 216 | self.parse_log("\n\n\n Scrapy Worker Done!") 217 | self.start_crawl_button.setVisible(True) 218 | self.stop_crawl_button.setVisible(False) 219 | 220 | """ 221 | 定时器 222 | """ 223 | 224 | def bind_timer(self): 225 | self.timer_window = TimerGUI() 226 | self.timer_window.load_crawler(self.crawler_list) 227 | self.timer_window.setWindowModality(QtCore.Qt.ApplicationModal) 228 | self.timer_window.timer_signal.connect(self.add_new_timer) 229 | self.add_timer_button.clicked.connect(self.add_timer_button_click) 230 | self.delete_timer_button.clicked.connect(self.delete_timer_button_click) 231 | 232 | self.update_timer() 233 | 234 | def add_timer_button_click(self): 235 | if not self.check_value(): 236 | return 237 | 238 | self.timer_window.reset_edit() 239 | self.timer_window.show() 240 | self.timer_window.exec_() 241 | 242 | def add_new_timer(self, month, day, hour, minute, crawler_name): 243 | if month == -1 and day == -1 and hour == -1 and minute == -1: 244 | return 245 | 246 | if month == -1: 247 | month = "*" 248 | if day == -1: 249 | day = "*" 250 | if hour == -1: 251 
| hour = "*" 252 | if minute == -1: 253 | minute = "*" 254 | 255 | if platform.system() == "Windows": 256 | print("Not imply in windows.") 257 | else: 258 | crawler_script_path = os.path.join(__current_folder_path__, "main.py") 259 | crontab_command = "{} {} --market_name {}".format(python_interface, crawler_script_path, crawler_name) 260 | crontab_time = "{} {} {} {} *".format(minute, hour, day, month) 261 | comment = "apk crawler job" 262 | user_crontab = CronTab(user=True) 263 | job = user_crontab.new(command=crontab_command, comment=comment) 264 | job.setall(crontab_time) 265 | job.enable() 266 | user_crontab.write() 267 | 268 | self.update_timer() 269 | 270 | def update_timer(self): 271 | if platform.system() == "Windows": 272 | print("Not imply in windows.") 273 | else: 274 | user_crontab = CronTab(user=True) # todo: 检查一下 275 | job_iter = user_crontab.find_comment("apk crawler job") 276 | self.user_crontab = user_crontab 277 | self.timer_list = list(job_iter) 278 | timer_data_list = [] 279 | for job in self.timer_list: # type: CronItem 280 | month, day, hour, minute = job.month, job.dom, job.hour, job.minute 281 | crawler = job.command.split('--market_name')[1].strip() 282 | timer_data_list.append([month, day, hour, minute, crawler]) 283 | 284 | self.timer_table_widget.clear() 285 | self.timer_table_widget.setRowCount(len(timer_data_list)) 286 | _row_ = 0 287 | for timer_data in timer_data_list: 288 | for _column_ in range(self.timer_table_widget.columnCount()): 289 | q_table_widget_item = QTableWidgetItem(str(timer_data[_column_])) 290 | q_table_widget_item.setTextAlignment(QtCore.Qt.AlignHCenter | QtCore.Qt.AlignVCenter) 291 | self.timer_table_widget.setItem(_row_, _column_, q_table_widget_item) 292 | _row_ += 1 293 | 294 | def delete_timer_button_click(self): 295 | timer_index = self.timer_table_widget.currentIndex().row() 296 | if timer_index == -1: 297 | return 298 | 299 | timer_job = self.timer_list[timer_index] # type: CronItem 300 | 
self.user_crontab.remove(timer_job) 301 | self.user_crontab.write() 302 | self.update_timer() 303 | 304 | """ 305 | 错误 306 | """ 307 | 308 | def bind_error(self): 309 | self.error_signal.connect(self.catch_error) 310 | 311 | def catch_error(self, _err_: str): 312 | log_file = os.path.join(__current_folder_path__, "../../log/main_gui.{}.log".format(datetime.datetime.now().strftime("%Y-%m-%d-%H"))) 313 | with open(log_file, 'a') as _file_: 314 | _file_.write(_err_) 315 | 316 | def check_value(self): 317 | enviro = True 318 | if python_interface is None: 319 | QMessageBox.warning(self, "Python Interface Error", "Please set the 'python_interface' in setting.py.", QMessageBox.Ok, QMessageBox.Ok) 320 | enviro = False 321 | return enviro 322 | 323 | 324 | if __name__ == '__main__': 325 | app = QApplication(sys.argv) 326 | crawler_gui = CrawlerProcess() 327 | crawler_gui.show() 328 | sys.exit(app.exec_()) 329 | -------------------------------------------------------------------------------- /codes/apks/spiders/apkpure_spider.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import logging 3 | import os 4 | import scrapy 5 | from collections import defaultdict 6 | 7 | import items 8 | import settings 9 | 10 | 11 | class ApkPureSpider(scrapy.Spider): 12 | name = "apkpure" 13 | logger = logging.getLogger("GoogleSpider") 14 | 15 | def start_requests(self): 16 | start_url = "https://apkpure.com/app" 17 | yield scrapy.Request(start_url, callback=self.parse_diff_cate) 18 | 19 | def parse_diff_cate(self, response): 20 | """ 21 | parse different category 22 | """ 23 | categories = response.css("ul.index-category.cicon li a::attr('href')").getall() 24 | 25 | for category in categories: 26 | category_url = response.urljoin(category) 27 | yield scrapy.Request(category_url, callback=self.parse_app_list) 28 | 29 | def parse_app_list(self, response): 30 | """ 31 | parse app list 32 | """ 33 | # 解析应用列表 34 | app_urls = 
response.css("ul.category-template#pagedata li div.category-template-img a::attr('href')").getall() 35 | for app_url in app_urls: 36 | app_url = response.urljoin(app_url) 37 | yield scrapy.Request(app_url, callback=self.parse_app) 38 | 39 | # 解析下一页 40 | next_url = response.css("a.loadmore::attr('href')").get() 41 | if next_url: 42 | next_url = response.urljoin(next_url) 43 | yield scrapy.Request(next_url, callback=self.parse_app_list) 44 | 45 | def parse_app(self, response): 46 | """ 47 | parse app info page 48 | """ 49 | # 解析应用信息 50 | # app title 51 | app_title = response.css("div.main div.title-like h1::text").get() 52 | if app_title is None: 53 | raise ValueError("App Title Error.") 54 | 55 | # description and update info 56 | description = "Description:\n" + "\n".join(response.css("div#describe div.content::text").getall()) 57 | description += "Update Info:\n" + "\n".join(response.css("div#whatsnew div:nth-child(3)::text").getall()) 58 | 59 | category = response.css("div.additional ul li:first-child a span::text").getall() 60 | if len(category) == 0: 61 | raise ValueError("App Type Error.") 62 | elif len(category) == 1: 63 | self.logger.info("App '{}' is a paid app. 
Can't Download it.".format(app_title)) 64 | return 65 | else: 66 | category = category[1] 67 | 68 | # latest version 69 | latest_version = response.css("div.additional ul li:nth-child(2) p:nth-child(2)::text").get() 70 | if not latest_version: 71 | raise ValueError("App Latest Version Error.") 72 | 73 | # developer 74 | publisher = response.css("div.left div.box div.details-author p a::text").get() 75 | if not publisher: 76 | raise ValueError("Developer Error.") 77 | 78 | # apk name 79 | package_name = os.path.split(response.url)[-1] 80 | 81 | # app link 82 | apkpure_url = response.url 83 | 84 | # market 85 | market = "apkpure" 86 | 87 | # picture links 88 | picture_links = response.css("div.describe div.describe-img div#slide-box img::attr(src)").getall() 89 | 90 | app_detail = items.AppDetail(app_title=app_title, apk_name=package_name, description=description, developer=publisher, app_link=apkpure_url, category=category, market=market, version=latest_version, picture_links=picture_links) 91 | 92 | # 更多版本 93 | more_version_url = response.css("div.ver-title div.more a::attr('href')").get() 94 | if more_version_url: 95 | more_version_url = response.urljoin(more_version_url) 96 | yield scrapy.Request(more_version_url, meta={"app_detail": app_detail}, callback=self.parse_multi_version) 97 | 98 | # 相似应用 & 同一厂商 99 | more_urls = response.css("div.left div.box div.title div.more a::attr('href')").getall() 100 | for more_url in more_urls: 101 | if "similar" in more_url: 102 | # 相似应用 103 | similar_url = response.urljoin(more_url) 104 | yield scrapy.Request(similar_url, callback=self.parse_similar) 105 | elif "developer" in more_url: 106 | # 同一厂商 107 | developer_url = response.urljoin(more_url) 108 | yield scrapy.Request(developer_url, callback=self.parse_developer, meta={"raw_url": apkpure_url}) 109 | 110 | def parse_multi_version(self, response): 111 | """ 112 | parse multiple version 113 | """ 114 | app_detail = response.meta['app_detail'] 115 | 116 | ver_lis = 
response.css("div.ver ul.ver-wrap li") 117 | for ver_li in ver_lis: 118 | version = ver_li.css("a div.ver-item-wrap span.ver-item-n::text").get()[1:] 119 | file_types = ver_li.css("div.ver-item div.ver-item-wrap span.ver-item-t::text").getall() 120 | if "XAPK" in file_types: 121 | file_type = "xapk" 122 | else: 123 | file_type = "apk" 124 | ver_info_dom = ver_li.css("div.ver-info") 125 | 126 | if len(ver_info_dom) > 0: # 没有多个变种 127 | # 获取版本信息 128 | p_doms = ver_info_dom.css("div.ver-info-m p") 129 | page_url = ver_li.css("li>a::attr('href')").get() 130 | page_url = response.urljoin(page_url) 131 | ext_infos = defaultdict(str) 132 | for p_dom in p_doms: 133 | try: 134 | _key = p_dom.css("strong::text").get().strip() 135 | except AttributeError: 136 | continue 137 | 138 | if _key: 139 | try: 140 | _key = _key.split(":")[0].strip() 141 | _value = p_dom.css("p::text").get().strip() 142 | except AttributeError as _err: 143 | continue 144 | 145 | if _key == "Requires Android": 146 | ext_infos["requirement"] = _value 147 | elif _key == "Signature": 148 | ext_infos["signature"] = _value 149 | elif _key == "Screen DPI": 150 | ext_infos['dpi'] = _value 151 | elif _key == "Architecture": 152 | ext_infos['architecture'] = _value 153 | elif _key == "Update on": 154 | ext_infos['update_time'] = _value 155 | elif _key == "File Size": 156 | ext_infos['size'] = _value 157 | elif _key == "File SHA1": 158 | ext_infos['hash'] = _value 159 | file_size = ext_infos['size'] or settings.DEFAULT_SIZE 160 | 161 | update_detail = items.AppDetail(app_title=app_detail['app_title'], apk_name=app_detail['apk_name'], developer=app_detail['developer'], app_link=app_detail['app_link'], category=app_detail['category'], market=app_detail['market'], version="{}.{}".format(version, file_type), size=file_size) 162 | 163 | if version == app_detail['version']: 164 | update_detail['description'] = app_detail['description'] 165 | update_detail['picture_links'] = app_detail['picture_links'] 166 | else: 167 | 
update_detail['description'] = "" 168 | update_detail['picture_links'] = [] 169 | 170 | yield scrapy.Request(page_url, meta={"update_detail": update_detail}, callback=self.parse) 171 | else: # 存在多个变种 172 | variants_url = ver_li.css("a::attr('href')").get() 173 | variants_url = response.urljoin(variants_url) 174 | 175 | yield scrapy.Request(variants_url, meta={"app_detail": app_detail}, callback=self.parse_multi_varia) 176 | 177 | def parse_similar(self, response): 178 | """ 179 | parse similar apps 180 | """ 181 | # 解析相似app列表 182 | similar_apps = response.css("div.main div.box ul#pagedata li dd.title-dd a::attr('href')").getall() 183 | for similar_app in similar_apps: 184 | app_url = response.urljoin(similar_app) 185 | yield scrapy.Request(app_url, callback=self.parse_app) 186 | 187 | def parse_developer(self, response): 188 | """ 189 | parse the same developer's apps 190 | """ 191 | # 解析app列表 192 | devel_apps = response.css("div.main div.left div.box dl.search-dl p.search-title a::attr('href')").getall() 193 | for devel_app in devel_apps: 194 | app_url = response.urljoin(devel_app) 195 | yield scrapy.Request(app_url, callback=self.parse_app) 196 | 197 | # 下一页 198 | next_page_url = response.css("div.paging ul li:last-child a::attr('href')").get() 199 | if next_page_url: 200 | next_page_url = response.urljoin(next_page_url) 201 | yield scrapy.Request(next_page_url, callback=self.parse_developer) 202 | 203 | def parse(self, response, **kwargs): 204 | """ 205 | parse the download page 206 | """ 207 | update_detail = response.meta['update_detail'] 208 | 209 | # 获取下载地址 210 | download_url = response.css("div.left div.box div.fast-download-box.fast-bottom p.down-click a::attr('href')").get() 211 | if not download_url: 212 | raise ValueError('Get download url Error.') 213 | update_detail['download_link'] = download_url 214 | 215 | yield update_detail 216 | 217 | def parse_multi_varia(self, response): 218 | """ 219 | parse the multi variants 220 | """ 221 | app_detail = 
response.meta['app_detail'] 222 | variants_dom = response.css("div.left div.table div.table-row")[1:] 223 | version = response.css("div.left div.box div.variant div.info div.tit span::text").get()[1:] 224 | app_version_list = [] 225 | 226 | for variant_dom in variants_dom: 227 | variant_number = variant_dom.css("div.table-cell div.popup span::text").get() 228 | 229 | # 解析更新的信息 230 | ver_info_dom = variant_dom.css("div.table-cell div.ver-info div.ver-info-m") 231 | file_type_str = variant_dom.css("div.table-cell.down a::text").get() 232 | if "XAPK" in file_type_str: 233 | file_type = "xapk" 234 | else: 235 | file_type = "apk" 236 | p_doms = ver_info_dom.css("p") 237 | page_url = variant_dom.css("div.table-cell.down a::attr('href')").get() 238 | page_url = response.urljoin(page_url) 239 | ext_info = defaultdict(str) 240 | for p_dom in p_doms: 241 | try: 242 | _key = p_dom.css("strong::text").get().strip() 243 | except AttributeError: 244 | continue 245 | 246 | if _key: 247 | _key = _key.split(':')[0].strip() 248 | _value = _value = p_dom.css("p::text").get().strip() 249 | 250 | if _key == "Update on": 251 | ext_info['update_time'] = _value 252 | elif _key == "Requires Android": 253 | ext_info['requirement'] = _value 254 | elif _key == "Signature": 255 | ext_info['signature'] = _value 256 | elif _key == "Screen DPI": 257 | ext_info['dpi'] = _value 258 | elif _key == "Architecture": 259 | ext_info['architecture'] = _value 260 | elif _key == "File SHA1": 261 | ext_info['hash'] = _value 262 | elif _key == "File Size": 263 | ext_info['size'] = _value 264 | 265 | app_size = ext_info['size'] or settings.DEFAULT_SIZE 266 | architecture = ext_info['architecture'] or settings.DEFAULT_ARCHITECTURE 267 | update_date = ext_info['update_time'] if ext_info['update_time'] != "" else None 268 | app_version = "{}-{}-{}-{}".format(version, variant_number, architecture, file_type) 269 | 270 | if app_version in app_version_list: 271 | continue 272 | app_version_list.append(app_version) 
273 | 274 | update_detail = items.AppDetail( 275 | app_title=app_detail['app_title'], apk_name=app_detail['apk_name'], developer=app_detail['developer'], app_link=app_detail['app_link'], category=app_detail['category'], market=app_detail['market'], version=app_version, size=app_size, update_date=update_date 276 | ) 277 | if version == app_detail['version']: 278 | update_detail['description'] = app_detail['description'] 279 | update_detail['picture_links'] = app_detail['picture_links'] 280 | else: 281 | update_detail['description'] = "" 282 | update_detail['picture_links'] = [] 283 | 284 | yield scrapy.Request(page_url, meta={"update_detail": update_detail}, callback=self.parse) 285 | -------------------------------------------------------------------------------- /codes/apks/ui_thread.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import os 3 | import time 4 | from typing import * 5 | 6 | from PyQt5 import QtCore 7 | from PyQt5.QtCore import * 8 | from database import Database 9 | import glob 10 | from pipelines.folder_path import get_file_size 11 | from utils import cal_file_hash 12 | from settings import DEFAULT_DEVELOPER, DEFAULT_CATEGORY, DEFAULT_MARKET, python_interface 13 | import traceback 14 | 15 | 16 | def catch_exception(func): 17 | def wrapper(self, *args, **kwargs): 18 | try: 19 | return func(self, *args, **kwargs) 20 | except Exception as _err_: 21 | self.communication.error_signal.emit(traceback.format_exc()) 22 | 23 | return wrapper 24 | 25 | 26 | class AutoDeleteRunnable(QRunnable): 27 | def __init__(self): 28 | super(AutoDeleteRunnable, self).__init__() 29 | self.setAutoDelete(True) 30 | 31 | 32 | class SDKLevelThread(AutoDeleteRunnable): 33 | def transfer(self, communication): 34 | self.communication = communication 35 | 36 | @catch_exception 37 | def run(self): 38 | db = Database() 39 | sdk_level_list = db.get_all_sdk_level() 40 | self.communication.sdk_level_signal.emit(sdk_level_list) 
41 | 42 | 43 | class AuthorityThread(AutoDeleteRunnable): 44 | def transfer(self, communication): 45 | self.communication = communication 46 | 47 | @catch_exception 48 | def run(self): 49 | db = Database() 50 | authority_list = db.get_all_authority() 51 | self.communication.authority_signal.emit(authority_list) 52 | 53 | 54 | class TypeThread(AutoDeleteRunnable): 55 | def transfer(self, communication): 56 | self.communication = communication 57 | 58 | @catch_exception 59 | def run(self): 60 | db = Database() 61 | type_item_list = db.get_all_app_type() 62 | self.communication.type_signal.emit(type_item_list) 63 | 64 | 65 | class ScrapyWorker(QObject): 66 | def __init__(self, communication, parent=None): 67 | super().__init__(parent) 68 | self.communication = communication 69 | self._process = QtCore.QProcess(self) 70 | self._process.setProcessChannelMode(QtCore.QProcess.MergedChannels) 71 | self._process.setProgram(python_interface) 72 | self._process.readyReadStandardOutput.connect(self.on_readyReadStandardOutput) 73 | self._process.started.connect(self.communication.scrapy_start_signal) 74 | self._process.finished.connect(self.communication.scrapy_finish_signal) 75 | 76 | @catch_exception 77 | def run(self, platform): 78 | self._process.setWorkingDirectory('./') 79 | self._process.setArguments(['./main.py', "--market_name", platform]) 80 | self._process.start() 81 | 82 | def on_readyReadStandardOutput(self): 83 | data = self._process.readAllStandardOutput().data().decode() 84 | self.communication.scrapy_log_signal.emit(data) 85 | 86 | def stop(self): 87 | self._process.kill() 88 | 89 | 90 | class AddAPKThread(AutoDeleteRunnable): 91 | def transfer(self, communication, root_folder): 92 | self.communication = communication 93 | self.root_folder = root_folder 94 | 95 | @catch_exception 96 | def run(self): 97 | file_list = glob.glob(os.path.join(self.root_folder, "**/*"), recursive=True) 98 | file_number = len(file_list) 99 | db = Database() 100 | 101 | finish_number 
= 0 102 | success_number = 0 103 | repeated_number = 0 104 | error_number = 0 105 | for file in file_list: 106 | if os.path.isfile(file): 107 | try: 108 | apk_name = os.path.splitext(os.path.basename(file))[0] 109 | apk_name = apk_name[:256] # clip the too long string 110 | app_title = apk_name 111 | developer = DEFAULT_DEVELOPER 112 | category = DEFAULT_CATEGORY 113 | market = DEFAULT_MARKET 114 | size = get_file_size(file) 115 | update_date = time.strftime("%Y-%m-%d", time.localtime()) 116 | version = update_date 117 | file_hash = cal_file_hash(file) 118 | if db.insert_app_from_file(market, file_hash, app_title, apk_name, developer, category, version, size, update_date): 119 | success_number += 1 120 | else: 121 | repeated_number += 1 122 | except Exception: 123 | error_number += 1 124 | 125 | finish_number += 1 126 | self.communication.add_progress_signal.emit(finish_number * 1.0 / file_number * 100) 127 | 128 | self.communication.add_apk_signal.emit(success_number, repeated_number, error_number) 129 | 130 | 131 | def generate_sdk_sql_str(sdk_name_list): 132 | if sdk_name_list is None: 133 | return "", [] 134 | sql_str = "" 135 | param_list = [] 136 | for sdk_name in sdk_name_list: 137 | if sdk_name == "UNKNOWN": 138 | if sql_str == "": 139 | sql_str = "sdk_level is NULL " 140 | else: 141 | sql_str += " OR sdk_level is NULL " 142 | else: 143 | param_list.append(sdk_name) 144 | if sql_str == "": 145 | sql_str = "sdk_level=%s " 146 | else: 147 | sql_str += " OR sdk_level=%s " 148 | sql_str = " (" + sql_str[:-1] + ") " 149 | return sql_str, param_list 150 | 151 | 152 | def generate_authority_sql_str(authority_id_list): 153 | if authority_id_list is None: 154 | return "", [] 155 | sql_str = "" 156 | for _ in authority_id_list: 157 | if sql_str == "": 158 | sql_str += " EXISTS (SELECT 1 FROM authority_relation WHERE authority_relation.update_id=`update`.update_id AND authority_id=%s) " 159 | else: 160 | sql_str += " AND EXISTS (SELECT 1 FROM authority_relation WHERE 
authority_relation.update_id=`update`.update_id AND authority_id=%s) " 161 | return sql_str, authority_id_list 162 | 163 | 164 | def generate_type_sql_str(app_type_list): 165 | if app_type_list is None: 166 | return "", [] 167 | sql_str = "" 168 | for _ in app_type_list: 169 | if sql_str == "": 170 | sql_str = "type_id=%s " 171 | else: 172 | sql_str += " OR type_id=%s " 173 | sql_str = " (" + sql_str[:-1] + ") " 174 | return sql_str, app_type_list 175 | 176 | 177 | class SearchPlatformThread(AutoDeleteRunnable): 178 | def transfer(self, communication, sdk_name_list: Union[None, List] = None, authority_id_list: Union[None, List] = None, type_id_list: Union[None, List] = None): 179 | self.communication = communication 180 | self.sdk_name_list = sdk_name_list 181 | self.authority_id_list = authority_id_list 182 | self.type_id_list = type_id_list 183 | 184 | @catch_exception 185 | def run(self): 186 | db = Database() 187 | print(self.communication, self.sdk_name_list, self.authority_id_list, self.type_id_list) 188 | if self.sdk_name_list is None and self.authority_id_list is None and self.type_id_list is None: 189 | # not apply any search condition 190 | market_list = db.search_platform_not_delete() 191 | else: 192 | sdk_sql_str, sdk_param_list = generate_sdk_sql_str(self.sdk_name_list) 193 | authority_sql_str, authority_param_list = generate_authority_sql_str(self.authority_id_list) 194 | type_sql_str, type_param_list = generate_type_sql_str(self.type_id_list) 195 | sql_str = "select m.market_id, market_name from `update` join app a on `update`.app_id = a.app_id join market m on m.market_id = a.market_id where is_delete=FALSE " 196 | if sdk_sql_str: 197 | sql_str += " AND " + sdk_sql_str 198 | if authority_sql_str: 199 | sql_str += " AND " + authority_sql_str 200 | if type_sql_str: 201 | sql_str += " AND " + type_sql_str 202 | sql_str += " group by market_id order by market_name;" 203 | param_list = sdk_param_list + authority_param_list + type_param_list 204 | 205 | 
cursor = db.get_cursor() 206 | cursor.execute(sql_str, param_list) 207 | results = cursor.fetchall() 208 | market_list = [] 209 | for result in results: 210 | if result and result['market_name']: 211 | market_list.append({ 212 | "market_id": result['market_id'], 213 | "market_name": result['market_name'].decode('utf-8') 214 | }) 215 | 216 | self.communication.market_signal.emit(market_list) 217 | 218 | 219 | class SearchAppThread(AutoDeleteRunnable): 220 | def transfer(self, communication, market_id, sdk_name_list: Union[None, list] = None, authority_id_list: Union[None, List] = None, type_id_list: Union[None, List] = None): 221 | self.communication = communication 222 | self.market_id = market_id 223 | self.sdk_name_list = sdk_name_list 224 | self.authority_id_list = authority_id_list 225 | self.type_id_list = type_id_list 226 | 227 | @catch_exception 228 | def run(self): 229 | db = Database() 230 | 231 | if self.sdk_name_list is None and self.authority_id_list is None and self.type_id_list is None: 232 | # not apply ant search condition 233 | app_list = db.search_app_not_delete(self.market_id) 234 | else: 235 | sdk_sql_str, sdk_param_list = generate_sdk_sql_str(self.sdk_name_list) 236 | authority_sql_str, authority_param_list = generate_authority_sql_str(self.authority_id_list) 237 | type_sql_str, type_param_list = generate_type_sql_str(self.type_id_list) 238 | sql_str = "select a.app_id, app_title from `update` join app a on `update`.app_id = a.app_id join market m on m.market_id = a.market_id where is_delete=FALSE and a.market_id=%s " 239 | if sdk_sql_str: 240 | sql_str += " AND " + sdk_sql_str 241 | if authority_sql_str: 242 | sql_str += " AND " + authority_sql_str 243 | if type_sql_str: 244 | sql_str += " AND " + type_sql_str 245 | sql_str += " group by app_id order by app_title;" 246 | param_list = [self.market_id] + sdk_param_list + authority_param_list + type_param_list 247 | 248 | cursor = db.get_cursor() 249 | cursor.execute(sql_str, param_list) 250 | 
results = cursor.fetchall() 251 | app_list = [] 252 | for result in results: 253 | if result and result['app_title']: 254 | app_list.append({ 255 | "app_id": result['app_id'], 256 | "app_title": result['app_title'].decode('utf-8') 257 | }) 258 | 259 | self.communication.app_signal.emit(app_list) 260 | 261 | 262 | class SearchUpdateThread(AutoDeleteRunnable): 263 | def transfer(self, communication, app_id, sdk_name_list: Union[None, list] = None, authority_id_list: Union[None, List] = None, type_id_list: Union[None, List] = None): 264 | self.communication = communication 265 | self.app_id = app_id 266 | self.sdk_name_list = sdk_name_list 267 | self.authority_id_list = authority_id_list 268 | self.type_id_list = type_id_list 269 | 270 | @catch_exception 271 | def run(self): 272 | db = Database() 273 | 274 | if self.sdk_name_list is None and self.authority_id_list is None and self.type_id_list is None: 275 | # not apply ant search condition 276 | update_list = db.search_update_not_delete(self.app_id) 277 | else: 278 | sdk_sql_str, sdk_param_list = generate_sdk_sql_str(self.sdk_name_list) 279 | authority_sql_str, authority_param_list = generate_authority_sql_str(self.authority_id_list) 280 | sql_str = "select update_id, version from `update` where is_delete=FALSE and app_id=%s " 281 | if sdk_sql_str: 282 | sql_str += " AND " + sdk_sql_str 283 | if authority_sql_str: 284 | sql_str += " AND " + authority_sql_str 285 | sql_str += " order by version;" 286 | param_list = [self.app_id] + sdk_param_list + authority_param_list 287 | 288 | cursor = db.get_cursor() 289 | cursor.execute(sql_str, param_list) 290 | results = cursor.fetchall() 291 | update_list = [] 292 | for result in results: 293 | if result and result['version']: 294 | update_list.append({ 295 | "update_id": result['update_id'], 296 | "version": result['version'].decode('utf-8') 297 | }) 298 | 299 | self.communication.update_signal.emit(update_list) 300 | 301 | 302 | class 
class SearchApkInfoByUpdateIdThread(AutoDeleteRunnable):
    """Fetch the detailed apk information for one update record and emit it
    through ``communication.update_information_signal``."""

    def transfer(self, communication, update_id):
        self.communication = communication
        self.update_id = update_id

    @catch_exception
    def run(self):
        db = Database()
        update_info_list = db.get_information_by_update_id(self.update_id)
        self.communication.update_information_signal.emit(update_info_list)


class DeleteApkThread(AutoDeleteRunnable):
    """Soft-delete a batch of updates by id, then notify the GUI via
    ``communication.delete_apk_signal``."""

    def transfer(self, communication, update_id_list):
        self.communication = communication
        self.update_id_list = update_id_list

    @catch_exception
    def run(self):
        db = Database()
        for update_id in self.update_id_list:
            db.delete_apk_by_update_id(update_id)
        self.communication.delete_apk_signal.emit()


class DragSearchThread(AutoDeleteRunnable):
    """Look up apk information by the SHA-256 hash of a file dropped onto
    the GUI, then emit it through ``communication.update_information_signal``."""

    def transfer(self, communication, file_url):
        self.communication = communication
        self.file_path = file_url

    @catch_exception
    def run(self):
        db = Database()
        file_hash = cal_file_hash(self.file_path)
        update_info_list = db.get_information_by_file_hash(file_hash)
        self.communication.update_information_signal.emit(update_info_list)


class MultiDeleteThread(AutoDeleteRunnable):
    """Hash every regular file under a folder (recursively) and soft-delete
    the matching updates.

    Progress is reported on a 0-100 scale via
    ``communication.delete_progress_signal``: hashing covers 0-50 and
    deleting covers 50-100.  ``communication.delete_apk_signal`` fires when
    everything is done."""

    def transfer(self, communication, folder_path):
        self.communication = communication
        self.folder_path = folder_path

    @catch_exception
    def run(self):
        db = Database()
        file_list = glob.glob(os.path.join(self.folder_path, "**/*"), recursive=True)
        file_number = len(file_list)

        # Phase 1 (0-50%): hash every regular file under the folder.
        hash_list = []
        current_number = 0
        for file in file_list:
            current_number += 1
            if os.path.isfile(file):
                hash_list.append(
                    cal_file_hash(file)
                )
            self.communication.delete_progress_signal.emit(
                current_number * 1.0 / file_number * 100 * 0.5
            )

        # Phase 2 (50-100%): soft-delete every update matching one of the hashes.
        hash_number = len(hash_list)
        current_number = 0
        for apk_hash in hash_list:
            current_number += 1
            db.delete_apk_by_hash(apk_hash)
            # BUG FIX: the offset used to be "+ 0.5", which made the progress
            # value jump back to ~0.5 at the start of the delete phase and
            # never pass ~50.5; the second half of a 0-100 bar starts at 50.
            self.communication.delete_progress_signal.emit(
                current_number * 1.0 / hash_number * 100 * 0.5 + 50
            )

        self.communication.delete_apk_signal.emit()
/*
 Navicat Premium Data Transfer

 Source Server         : local_mysql_database
 Source Server Type    : MySQL
 Source Server Version : 80019
 Source Host           : localhost:3306
 Source Schema         : apk_merge

 Target Server Type    : MySQL
 Target Server Version : 80019
 File Encoding         : 65001

 Date: 15/12/2020 19:43:43
*/

SET NAMES utf8mb4;
SET FOREIGN_KEY_CHECKS = 0;

-- Table: app -----------------------------------------------------------------
DROP TABLE IF EXISTS `app`;
CREATE TABLE `app` (
  `app_id` int UNSIGNED NOT NULL AUTO_INCREMENT COMMENT '应用id',
  `app_title` char(255) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT '应用名称',
  `apk_name` char(255) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT 'apk包名',
  `app_link_id` int UNSIGNED NULL DEFAULT NULL COMMENT 'app详解链接地址id',
  `developer_id` mediumint UNSIGNED NOT NULL COMMENT '开发者id',
  `type_id` smallint UNSIGNED NOT NULL COMMENT 'app类型id',
  `market_id` tinyint UNSIGNED NOT NULL COMMENT '应用商城id',
  PRIMARY KEY (`app_id`) USING BTREE,
  UNIQUE INDEX `app_unique_index`(`apk_name`, `market_id`) USING BTREE COMMENT '唯一app由其平台和包名确定',
  INDEX `app.app_link_id`(`app_link_id`) USING BTREE,
  INDEX `app.type_id`(`type_id`) USING BTREE,
  INDEX `app.market_id`(`market_id`) USING BTREE,
  CONSTRAINT `app.app_link_id` FOREIGN KEY (`app_link_id`) REFERENCES `link` (`link_id`) ON DELETE RESTRICT ON UPDATE RESTRICT,
  CONSTRAINT `app.market_id` FOREIGN KEY (`market_id`) REFERENCES `market` (`market_id`) ON DELETE RESTRICT ON UPDATE RESTRICT,
  CONSTRAINT `app.type_id` FOREIGN KEY (`type_id`) REFERENCES `app_type` (`type_id`) ON DELETE RESTRICT ON UPDATE RESTRICT
) ENGINE = InnoDB AUTO_INCREMENT = 1 CHARACTER SET = utf8 COLLATE = utf8_general_ci COMMENT = 'app表' ROW_FORMAT = Dynamic;

-- Table: app_type ------------------------------------------------------------
DROP TABLE IF EXISTS `app_type`;
CREATE TABLE `app_type` (
  `type_id` smallint UNSIGNED NOT NULL AUTO_INCREMENT COMMENT '类型',
  `type_name` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT '类型名称',
  PRIMARY KEY (`type_id`) USING BTREE,
  UNIQUE INDEX `name`(`type_name`) USING BTREE COMMENT '类型名称唯一'
) ENGINE = InnoDB AUTO_INCREMENT = 1 CHARACTER SET = utf8 COLLATE = utf8_general_ci COMMENT = '应用类型表' ROW_FORMAT = DYNAMIC;

-- Table: authority -----------------------------------------------------------
DROP TABLE IF EXISTS `authority`;
CREATE TABLE `authority` (
  `authority_id` smallint UNSIGNED NOT NULL AUTO_INCREMENT,
  `authority_name` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT '权限名称',
  PRIMARY KEY (`authority_id`) USING BTREE,
  UNIQUE INDEX `authority_name`(`authority_name`) USING BTREE COMMENT '权限名称唯一'
) ENGINE = InnoDB AUTO_INCREMENT = 1 CHARACTER SET = utf8 COLLATE = utf8_general_ci COMMENT = '权限名称表' ROW_FORMAT = Dynamic;

-- Table: authority_relation (update <-> authority, many-to-many) --------------
DROP TABLE IF EXISTS `authority_relation`;
CREATE TABLE `authority_relation` (
  `authority_relation_id` int UNSIGNED NOT NULL AUTO_INCREMENT COMMENT '权限关系id',
  `update_id` int UNSIGNED NOT NULL COMMENT 'update id',
  `authority_id` smallint UNSIGNED NOT NULL COMMENT 'authority id',
  PRIMARY KEY (`authority_relation_id`) USING BTREE,
  UNIQUE INDEX `unique_authority_relation`(`update_id`, `authority_id`) USING BTREE COMMENT 'update_id和authority_id唯一确定',
  INDEX `authority_relation_authority_id`(`authority_id`) USING BTREE,
  CONSTRAINT `authority_relation_authority_id` FOREIGN KEY (`authority_id`) REFERENCES `authority` (`authority_id`) ON DELETE RESTRICT ON UPDATE RESTRICT,
  CONSTRAINT `authority_relation_update_id` FOREIGN KEY (`update_id`) REFERENCES `update` (`update_id`) ON DELETE RESTRICT ON UPDATE RESTRICT
) ENGINE = InnoDB AUTO_INCREMENT = 1 CHARACTER SET = utf8 COLLATE = utf8_general_ci COMMENT = 'update 对应 authority的关系' ROW_FORMAT = Dynamic;

-- Table: developer -----------------------------------------------------------
DROP TABLE IF EXISTS `developer`;
CREATE TABLE `developer` (
  `developer_id` mediumint UNSIGNED NOT NULL AUTO_INCREMENT COMMENT '开发者id',
  `developer_name` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT '开发者名称',
  PRIMARY KEY (`developer_id`) USING BTREE,
  UNIQUE INDEX `name`(`developer_name`) USING BTREE COMMENT '开发者名称唯一'
) ENGINE = InnoDB AUTO_INCREMENT = 1 CHARACTER SET = utf8 COLLATE = utf8_general_ci COMMENT = '开发者表' ROW_FORMAT = Dynamic;

-- Table: image ---------------------------------------------------------------
DROP TABLE IF EXISTS `image`;
CREATE TABLE `image` (
  `image_id` int UNSIGNED NOT NULL AUTO_INCREMENT COMMENT '图片id',
  `image_link_id` int UNSIGNED NOT NULL COMMENT '图片链接id',
  `update_id` int UNSIGNED NOT NULL COMMENT '更新id',
  `is_download` bit(1) NOT NULL DEFAULT b'0' COMMENT '是否下载',
  PRIMARY KEY (`image_id`) USING BTREE,
  UNIQUE INDEX `image_unique_index`(`image_link_id`, `update_id`) USING BTREE COMMENT '更新id和图片链接地址唯一确定一张图片',
  INDEX `image.update_id`(`update_id`) USING BTREE,
  CONSTRAINT `image.image_link_id` FOREIGN KEY (`image_link_id`) REFERENCES `link` (`link_id`) ON DELETE RESTRICT ON UPDATE RESTRICT,
  CONSTRAINT `image.update_id` FOREIGN KEY (`update_id`) REFERENCES `update` (`update_id`) ON DELETE RESTRICT ON UPDATE RESTRICT
) ENGINE = InnoDB AUTO_INCREMENT = 1 CHARACTER SET = utf8 COLLATE = utf8_general_ci COMMENT = '图片表' ROW_FORMAT = Dynamic;

-- Table: link ----------------------------------------------------------------
DROP TABLE IF EXISTS `link`;
CREATE TABLE `link` (
  `link_id` int UNSIGNED NOT NULL AUTO_INCREMENT COMMENT '链接id',
  `href` varchar(1023) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT '网址',
  PRIMARY KEY (`link_id`) USING BTREE,
  UNIQUE INDEX `href`(`href`) USING BTREE COMMENT '网址唯一'
) ENGINE = InnoDB AUTO_INCREMENT = 1 CHARACTER SET = utf8 COLLATE = utf8_general_ci COMMENT = '网页链接地址,采用不定长,独立分表优化update表的检索' ROW_FORMAT = Dynamic;

-- Table: market --------------------------------------------------------------
DROP TABLE IF EXISTS `market`;
CREATE TABLE `market` (
  `market_id` tinyint UNSIGNED NOT NULL AUTO_INCREMENT COMMENT '应用商城id',
  `market_name` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT '应用商城名字',
  PRIMARY KEY (`market_id`) USING BTREE,
  UNIQUE INDEX `market_name_unique_index`(`market_name`) USING BTREE COMMENT '应用商城名字唯一确定一个应用商城'
) ENGINE = InnoDB AUTO_INCREMENT = 1 CHARACTER SET = utf8 COLLATE = utf8_general_ci COMMENT = '应用商城表' ROW_FORMAT = DYNAMIC;

-- Table: update --------------------------------------------------------------
DROP TABLE IF EXISTS `update`;
CREATE TABLE `update` (
  `update_id` int UNSIGNED NOT NULL AUTO_INCREMENT COMMENT '更新id',
  `app_id` int UNSIGNED NOT NULL COMMENT '应用id',
  `version` char(255) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT '版本号',
  `size` char(20) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT '包大小',
  `download_link_id` int UNSIGNED NULL DEFAULT NULL COMMENT '下载链接id',
  `is_download` bit(1) NOT NULL DEFAULT b'0' COMMENT '是否下载',
  `apk_hash` binary(32) NULL DEFAULT NULL COMMENT 'apk sha256值',
  `malware` bit(1) NULL DEFAULT b'0' COMMENT '应用是否为恶意软件',
  `obfuscation` bit(1) NULL DEFAULT b'0' COMMENT '应用是否为加固混淆应用',
  `sdk_level` char(8) CHARACTER SET ascii COLLATE ascii_general_ci NULL DEFAULT NULL COMMENT 'sdk level',
  `update_date` datetime(0) NULL DEFAULT NULL COMMENT '更新日期',
  `is_delete` bit(1) NOT NULL DEFAULT b'0' COMMENT '删除标记',
  PRIMARY KEY (`update_id`) USING BTREE,
  UNIQUE INDEX `update_unique_index`(`app_id`, `version`) USING BTREE COMMENT 'app id和版本号唯一确定一个更新',
  INDEX `update.download_link_id`(`download_link_id`) USING BTREE,
  CONSTRAINT `update.app_id` FOREIGN KEY (`app_id`) REFERENCES `app` (`app_id`) ON DELETE RESTRICT ON UPDATE RESTRICT,
  CONSTRAINT `update.download_link_id` FOREIGN KEY (`download_link_id`) REFERENCES `link` (`link_id`) ON DELETE RESTRICT ON UPDATE RESTRICT
) ENGINE = InnoDB AUTO_INCREMENT = 1 CHARACTER SET = utf8 COLLATE = utf8_general_ci COMMENT = '应用版本更新表' ROW_FORMAT = Dynamic;

-- Procedure: insert_app_from_file --------------------------------------------
-- Inserts an app + update discovered from a local file import.
DROP PROCEDURE IF EXISTS `insert_app_from_file`;
delimiter ;;
CREATE PROCEDURE `insert_app_from_file`(IN `title_in` char(255),IN `name_in` char(255),IN `developer_in` VARCHAR(255),IN `type_in` varchar(255),IN `market_in` varchar(255),IN `version_in` char(255),IN `size_in` char(20),IN `update_date_in` DATETIME,IN `apk_hash_in` CHAR(64))
BEGIN
	# declare local variables
	DECLARE local_market_id TINYINT UNSIGNED;
	DECLARE local_type_id SMALLINT UNSIGNED;
	DECLARE local_developer_id MEDIUMINT UNSIGNED;
	DECLARE local_app_id INT UNSIGNED;

	# save the market
	INSERT IGNORE INTO market(market_name) VALUES(market_in);
	SELECT market_id INTO local_market_id FROM market WHERE market_name=market_in;

	# save the type
	INSERT IGNORE INTO app_type(type_name) VALUES(type_in);
	SELECT type_id INTO local_type_id FROM app_type WHERE type_name=type_in;

	# save the developer
	INSERT IGNORE INTO developer(developer_name) VALUES(developer_in);
	SELECT developer_id INTO local_developer_id FROM developer WHERE developer_name=developer_in;

	# save the app
	INSERT IGNORE INTO app(app_title, apk_name, developer_id, type_id, market_id) VALUES(title_in, name_in, local_developer_id, local_type_id, local_market_id);
	SELECT app_id INTO local_app_id FROM app WHERE apk_name=name_in AND market_id=local_market_id;

	# save the update
	INSERT IGNORE INTO `update`(app_id, version, size, is_download, apk_hash, update_date) VALUES(local_app_id, version_in, size_in, TRUE, UNHEX(apk_hash_in), update_date_in);
END
;;
delimiter ;

-- Procedure: insert_app_update ------------------------------------------------
-- Upserts an app + update found by a crawler and returns the update_id.
DROP PROCEDURE IF EXISTS `insert_app_update`;
delimiter ;;
CREATE PROCEDURE `insert_app_update`(IN `title_in` char(255),IN `name_in` char(255),IN `app_link_in` varchar(1023),IN `developer_in` VARCHAR(255),IN `type_in` varchar(255),IN `market_in` varchar(255),IN `version_in` char(255),IN `size_in` char(20),IN `download_link_in` varchar(1023),IN `update_date_in` DATETIME)
BEGIN
	# declare local variables
	DECLARE local_app_link_id INT UNSIGNED;
	DECLARE local_download_link_id INT UNSIGNED;
	DECLARE local_market_id TINYINT UNSIGNED;
	DECLARE local_type_id SMALLINT UNSIGNED;
	DECLARE local_developer_id MEDIUMINT UNSIGNED;
	DECLARE local_app_id INT UNSIGNED;

	# save the link
	INSERT IGNORE INTO link(href) VALUES(app_link_in), (download_link_in);
	SELECT link_id INTO local_app_link_id FROM link WHERE href=app_link_in;
	SELECT link_id INTO local_download_link_id FROM link WHERE href=download_link_in;

	# save the market
	INSERT IGNORE INTO market(market_name) VALUES(market_in);
	SELECT market_id INTO local_market_id FROM market WHERE market_name=market_in;

	# save the type
	INSERT IGNORE INTO app_type(type_name) VALUES(type_in);
	SELECT type_id INTO local_type_id FROM app_type WHERE type_name=type_in;

	# save the developer
	INSERT IGNORE INTO developer(developer_name) VALUES(developer_in);
	SELECT developer_id INTO local_developer_id FROM developer WHERE developer_name=developer_in;

	# save the app
	INSERT IGNORE INTO app(app_title, apk_name, app_link_id, developer_id, type_id, market_id) VALUES(title_in, name_in, local_app_link_id, local_developer_id, local_type_id, local_market_id)
	ON DUPLICATE KEY
	UPDATE app_title=title_in, app_link_id=local_app_link_id, developer_id=local_developer_id, type_id=local_type_id;
	SELECT app_id INTO local_app_id FROM app WHERE apk_name=name_in AND market_id=local_market_id;

	# save the update
	INSERT IGNORE INTO `update`(app_id, version, size, download_link_id, update_date) VALUES(local_app_id, version_in, size_in, local_download_link_id, update_date_in)
	ON DUPLICATE KEY
	UPDATE size=size_in, download_link_id=local_download_link_id, update_date=update_date_in;
	SELECT update_id FROM `update` WHERE app_id=local_app_id AND version=version_in;
END
;;
delimiter ;

-- Procedure: insert_authority_relation ----------------------------------------
-- Links one authority to every update whose apk_hash matches hash_in.
DROP PROCEDURE IF EXISTS `insert_authority_relation`;
delimiter ;;
CREATE PROCEDURE `insert_authority_relation`(IN `hash_in` CHAR(64),IN `authority_name_in` varchar(255))
BEGIN
	# declare local variables
	DECLARE local_update_id INT UNSIGNED;
	DECLARE local_authority_id SMALLINT UNSIGNED;
	DECLARE done INT DEFAULT 0;
	DECLARE report CURSOR FOR SELECT update_id FROM `update` WHERE apk_hash=UNHEX(hash_in);
	DECLARE CONTINUE HANDLER FOR NOT FOUND SET done=1;

	# save the authority
	INSERT IGNORE INTO authority(authority_name) VALUES(authority_name_in);
	SELECT authority_id INTO local_authority_id FROM authority WHERE authority_name=authority_name_in;

	# get the update_id
	OPEN report;	# open the cursor
	FETCH report INTO local_update_id;
	WHILE done<>1 DO
		INSERT IGNORE INTO authority_relation(update_id, authority_id) VALUES(local_update_id, local_authority_id);
		FETCH report INTO local_update_id;
	END WHILE;
	CLOSE report;	# close the cursor

END
;;
delimiter ;

-- Procedure: insert_image ------------------------------------------------------
-- Registers an image link for an update and returns the image_id.
DROP PROCEDURE IF EXISTS `insert_image`;
delimiter ;;
CREATE PROCEDURE `insert_image`(IN `link_in` varchar(1023),IN `update_id_in` int unsigned)
BEGIN
	# declare the local variables
	DECLARE local_link_id INT UNSIGNED;

	# save the link
	INSERT IGNORE INTO link(href) VALUES(link_in);
	SELECT link_id INTO local_link_id FROM link WHERE href=link_in;

	# save the image
	INSERT IGNORE INTO image(image_link_id, update_id) VALUES(local_link_id, update_id_in);
	SELECT image_id FROM image WHERE image_link_id=local_link_id AND update_id=update_id_in;

END
;;
delimiter ;

-- Procedure: set_image_available -----------------------------------------------
-- Marks an image record as downloaded.
DROP PROCEDURE IF EXISTS `set_image_available`;
delimiter ;;
CREATE PROCEDURE `set_image_available`(IN `image_id_in` int unsigned)
BEGIN
	UPDATE image SET is_download=TRUE
	WHERE image_id=image_id_in;

END
;;
delimiter ;

-- Procedure: set_update_available ----------------------------------------------
-- Marks an update as downloaded and stores its size and sha256 hash.
DROP PROCEDURE IF EXISTS `set_update_available`;
delimiter ;;
CREATE PROCEDURE `set_update_available`(IN `update_id_in` int unsigned, IN `size_in` char(20), IN `hash_in` char(64))
BEGIN
	UPDATE `update` SET is_download=TRUE, size=size_in, apk_hash=UNHEX(hash_in)
	WHERE update_id=update_id_in;
END
;;
delimiter ;

SET FOREIGN_KEY_CHECKS = 1;
class Database(object):
    """
    Database Operation Superclass

    MySQL access layer for the crawler (tables app/update/market/... and the
    stored procedures defined in documents/apk_merge.sql).  The connection is
    opened with ``use_unicode=False``, so every textual column comes back as
    ``bytes`` and is decoded explicitly by the query helpers below.
    """

    def __init__(self):
        # Connection settings are read once from the project-level settings module.
        self.db_type = settings.DB_TYPE
        self.db_host = settings.DB_HOST
        self.db_database = settings.DB_DATABASE
        self.db_user = settings.DB_USER
        self.db_password = settings.DB_PASSWORD
        self.db_charset = settings.DB_CHARSET

        # init db connect
        self.db = None
        self.__connect__()

        self.logger = logging.getLogger("Database")

    def __connect__(self):
        """
        connect to database
        """
        # NOTE: this mutates pymysql's shared conversion table so that BIT(1)
        # columns are converted to Python booleans.
        orig_conv = pymysql.converters.conversions
        orig_conv[FIELD_TYPE.BIT] = lambda data: data == b'\x01'

        db_params = dict(
            host=self.db_host,
            db=self.db_database,
            user=self.db_user,
            passwd=self.db_password,
            charset=self.db_charset,
            cursorclass=pymysql.cursors.DictCursor,
            use_unicode=False,
            conv=orig_conv
        )

        self.db = pymysql.connect(**db_params)

    def get_cursor(self):
        """Return a DictCursor, reconnecting first if the connection was closed."""
        if not self.db or not self.db.open:
            self.__connect__()
        return self.db.cursor()

    def close(self):
        """
        close connection
        """
        if self.db:
            self.db.close()
            self.db = None

    def get_image_status(self, image_id):
        """Return the is_download flag of an image row (False if the row is missing)."""
        sql_str = "select is_download from `image` where image_id=%s;"
        cursor = self.get_cursor()
        cursor.execute(
            sql_str,
            (image_id,)
        )
        result = cursor.fetchone()
        if result:
            return result['is_download']
        else:
            return False

    def get_update_status(self, update_id):
        """Return the is_download flag of an update row (False if the row is missing)."""
        sql_str = "select is_download from `update` where update_id=%s;"
        cursor = self.get_cursor()
        cursor.execute(
            sql_str,
            (update_id,)
        )
        result = cursor.fetchone()
        if result:
            return result['is_download']
        else:
            return False

    def insert_app(self, item: "AppDetail"):
        """
        insert information into database, and return whether the apk file is download

        Writes ``item['update_id']`` and appends to ``item['picture_link_ids']``
        as a side effect.  Raises pymysql errors after rolling back.
        """
        try:
            cursor = self.get_cursor()
            cursor.callproc(
                "insert_app_update",
                (item['app_title'], item['apk_name'], item['app_link'], item['developer'], item['category'], item['market'], item['version'], item['size'], item['download_link'], item['update_date'])
            )
            result = cursor.fetchone()
            if result:
                item['update_id'] = result['update_id']
            else:
                raise pymysql.DatabaseError("Get update_id Error.")
        except pymysql.Error as _err:
            self.db.rollback()
            raise _err
        else:
            self.db.commit()

        # Register every screenshot link for the update we just stored.
        for picture_link in item['picture_links']:
            try:
                cursor.callproc(
                    "insert_image",
                    (picture_link, item['update_id'])
                )
                result = cursor.fetchone()
                if result:
                    item['picture_link_ids'].append(result['image_id'])
                else:
                    raise pymysql.DatabaseError("Get image_id Error.")
            except Exception as _err:
                self.db.rollback()
                raise _err
            else:
                self.db.commit()

    def set_update_available(self, update_id, file_size, file_hash):
        """Mark an update as downloaded, storing its size and sha256 hash."""
        try:
            cursor = self.get_cursor()
            cursor.callproc(
                "set_update_available",
                (update_id, file_size, file_hash)
            )
        except Exception as _err:
            self.db.rollback()
            raise _err
        else:
            self.db.commit()

    def set_image_available(self, image_id):
        """Mark an image as downloaded."""
        try:
            cursor = self.get_cursor()
            cursor.callproc(
                "set_image_available",
                (image_id,)
            )
        except Exception as _err:
            self.db.rollback()
            raise _err
        else:
            self.db.commit()

    def get_app_number(self, market_name: str):
        """Return the number of apps stored for one market."""
        sql_str = "select count(*) " \
                  "from `app` a " \
                  "inner join `market` b on a.market_id=b.market_id " \
                  "where market_name=%s;"
        cursor = self.get_cursor()
        cursor.execute(
            sql_str,
            (market_name,)
        )
        result = cursor.fetchone()
        if result:
            return result['count(*)']
        else:
            raise ValueError("Get {} app number Error.".format(market_name))

    def get_update_number(self, market_name: str):
        """Return the number of update records stored for one market."""
        sql_str = "select count(*) " \
                  "from `app` a " \
                  "inner join `market` b on a.market_id=b.market_id " \
                  "inner join `update` c on a.app_id=c.app_id " \
                  "where market_name=%s;"
        cursor = self.get_cursor()
        cursor.execute(
            sql_str,
            (market_name,)
        )
        result = cursor.fetchone()
        if result:
            return result['count(*)']
        else:
            raise ValueError("Get {} update number Error.".format(market_name))

    def get_available_update_number(self, market_name: str):
        """Return the number of already-downloaded updates for one market."""
        sql_str = "select count(*) " \
                  "from `app` a " \
                  "inner join `market` b on a.market_id=b.market_id " \
                  "inner join `update` c on a.app_id=c.app_id " \
                  "where market_name=%s and is_download=TRUE;"
        cursor = self.get_cursor()
        cursor.execute(
            sql_str,
            (market_name,)
        )
        result = cursor.fetchone()
        if result:
            return result['count(*)']
        else:
            raise ValueError("Get {} available update number Error.".format(market_name))

    def get_diff_type_update_number(self):
        """Return (type_names, type_counts): update counts grouped by app type."""
        sql_str = "select count(*), type_name " \
                  "from `app` a " \
                  "inner join `update` b on a.app_id=b.app_id " \
                  "inner join `app_type` c on a.type_id=c.type_id " \
                  "group by type_name;"
        cursor = self.get_cursor()
        cursor.execute(
            sql_str,
        )
        results = cursor.fetchall()
        if results:
            type_names = []
            type_counts = []
            for result in results:
                count = result['count(*)']
                type_name = result['type_name'].decode('utf-8')  # type: str
                type_names.append(type_name)
                type_counts.append(count)
            return type_names, type_counts
        else:
            raise ValueError("Get different type update number Error.")

    def get_app_type(self, market_name: str, apk_name: str):
        """Return the type name of one app, identified by market and package name."""
        sql_str = "select type_name " \
                  "from `app` a " \
                  "inner join `app_type` b on a.type_id=b.type_id " \
                  "inner join `market` c on a.market_id=c.market_id " \
                  "where market_name=%s and apk_name=%s;"
        cursor = self.get_cursor()
        cursor.execute(
            sql_str,
            (market_name, apk_name)
        )
        result = cursor.fetchone()
        if result:
            return result['type_name'].decode('utf-8')  # type: str
        else:
            raise ValueError("Get app type name Error.")

    def get_all_app_type(self):
        """Return all app types as a list of (type_id, type_name) tuples."""
        sql_str = "select type_id, type_name from app_type order by type_name;"
        cursor = self.get_cursor()
        cursor.execute(
            sql_str
        )
        results = cursor.fetchall()
        type_data = []
        if results:
            for result in results:
                type_data.append((result['type_id'], result['type_name'].decode('utf-8')))
        else:
            raise ValueError('Get type Error.')

        return type_data

    def get_all_market(self):
        """Return all markets as dicts with market_id and decoded market_name."""
        sql_str = "select market_id, market_name from market;"
        cursor = self.get_cursor()
        cursor.execute(
            sql_str
        )
        results = cursor.fetchall()
        results = [] if not results else results
        for result in results:
            result.update({
                "market_name": result['market_name'].decode('utf-8')
            })

        return results

    def update_information(self, apk_hash: str, malware: bool = None, obfuscation: bool = None, sdk_level: str = None, authority_list: List[str] = None):
        """
        Update the analysis results of one apk in the database.

        :param apk_hash: apk hash, is a sha256 value of apk file. See the algorithm in utils.cal_file_hash.
        :param malware: whether the apk is a malware
        :param obfuscation: whether the apk uses the obfuscation
        :param sdk_level: sdk level
        :param authority_list: authority list with data like [<authority name>, <authority name>, ....]
        """
        # BUG FIX: the original guard read
        #   "... and authority_list is None and len(authority_list) == 0"
        # which raised TypeError (len(None)) whenever every optional argument
        # was left as None.  "not authority_list" covers both None and [].
        if apk_hash is None or (malware is None and obfuscation is None and sdk_level is None and not authority_list):
            logging.info("No information for apk (hash: {}) need to update.".format(apk_hash))
            return
        if malware is not None or obfuscation is not None or sdk_level is not None:
            try:
                cursor = self.get_cursor()

                sql_str = "UPDATE `update` SET "
                params = []
                # Build the SET clause from whichever flags were supplied.
                if malware is not None:
                    sql_str += "`malware` = %s, "
                    params.append(malware)
                if obfuscation is not None:
                    sql_str += "`obfuscation` = %s, "
                    params.append(obfuscation)
                if sdk_level is not None:
                    sql_str += "`sdk_level` = %s, "
                    params.append(sdk_level)
                sql_str = sql_str[:-2] + " where apk_hash = unhex(%s);"
                params.append(apk_hash)
                cursor.execute(
                    sql_str, params
                )
            except pymysql.Error as _err_:
                self.db.rollback()
                raise _err_
            else:
                self.db.commit()

        if authority_list is not None:
            try:
                cursor = self.get_cursor()
                for authority in authority_list:
                    cursor.callproc(
                        "insert_authority_relation",
                        (apk_hash, authority)
                    )
            except pymysql.Error as _err_:
                self.db.rollback()
                raise _err_
            else:
                self.db.commit()

    def get_all_app(self, market_id):
        """Return all apps of one market as dicts with app_id and decoded app_title."""
        cursor = self.get_cursor()
        sql_str = "select app_id, app_title from app where market_id=%s;"
        cursor.execute(sql_str, (market_id,))
        results = cursor.fetchall()
        results = [] if not results else results
        for result in results:
            result.update({
                "app_title": result['app_title'].decode('utf-8')
            })
        return results

    def get_all_updates(self, app_id):
        """Return all non-deleted updates of one app with decoded version strings."""
        cursor = self.get_cursor()
        sql_str = "select update_id, version from `update` where app_id=%s and is_delete=0;"
        cursor.execute(sql_str, (app_id,))
        results = cursor.fetchall()
        results = [] if not results else results
        for result in results:  # type: dict
            result.update({
                "version": result['version'].decode('utf-8')
            })
        return results

    def get_information_by_update_id(self, update_id):
        """Return the fully-joined apk information rows for one update_id."""
        cursor = self.get_cursor()
        sql_str = "select update_id, version, `size`, b.href as download_href, is_download, hex(apk_hash) as `hash`, malware, obfuscation, sdk_level, update_date, is_delete, app_title, apk_name, d.href as app_href, developer_name, type_name, market_name" \
                  " from `update` a" \
                  " left join link b on b.link_id=a.download_link_id" \
                  " left join app c on c.app_id=a.app_id" \
                  " left join link d on c.app_link_id=d.link_id" \
                  " left join developer e on e.developer_id=c.developer_id" \
                  " left join app_type f on f.type_id=c.type_id" \
                  " left join market g on g.market_id=c.market_id" \
                  " where update_id=%s;"
        cursor.execute(sql_str, (update_id,))
        results = cursor.fetchall()
        results = [] if not results else results
        # Decode every textual column; nullable columns are guarded.
        for result in results:  # type: dict
            result.update({
                "version": result['version'].decode('utf-8'),
                "size": result['size'].decode('utf-8') if result['size'] is not None else None,
                "download_href": result['download_href'].decode('utf-8') if result['download_href'] is not None else None,
                "hash": result['hash'].decode('utf-8') if result['hash'] is not None else None,
                "sdk_level": result['sdk_level'].decode('utf-8') if result['sdk_level'] is not None else None,
                "update_date": result['update_date'].strftime("%Y-%m-%d") if result['update_date'] is not None else None,
                "app_title": result['app_title'].decode('utf-8'),
                "apk_name": result['apk_name'].decode('utf-8'),
                "app_href": result['app_href'].decode('utf-8') if result['app_href'] is not None else None,
                "developer_name": result['developer_name'].decode('utf-8') if result['developer_name'] is not None else None,
                "type_name": result['type_name'].decode('utf-8'),
                "market_name": result['market_name'].decode('utf-8')
            })
        return results

    def get_information_by_file_hash(self, file_hash):
        """Return the fully-joined apk information rows matching one sha256 hash."""
        cursor = self.get_cursor()
        sql_str = "select update_id, version, `size`, b.href as download_href, is_download, hex(apk_hash) as `hash`, malware, obfuscation, sdk_level, update_date, is_delete, app_title, apk_name, d.href as app_href, developer_name, type_name, market_name" \
                  " from `update` a" \
                  " left join link b on b.link_id=a.download_link_id" \
                  " left join app c on c.app_id=a.app_id" \
                  " left join link d on c.app_link_id=d.link_id" \
                  " left join developer e on e.developer_id=c.developer_id" \
                  " left join app_type f on f.type_id=c.type_id" \
                  " left join market g on g.market_id=c.market_id" \
                  " where apk_hash=unhex(%s);"
        cursor.execute(sql_str, (file_hash,))
        results = cursor.fetchall()
        results = [] if not results else results
        # Decode every textual column; nullable columns are guarded.
        for result in results:  # type: dict
            result.update({
                "version": result['version'].decode('utf-8'),
                "size": result['size'].decode('utf-8') if result['size'] is not None else None,
                "download_href": result['download_href'].decode('utf-8') if result['download_href'] is not None else None,
                "hash": result['hash'].decode('utf-8') if result['hash'] is not None else None,
                "sdk_level": result['sdk_level'].decode('utf-8') if result['sdk_level'] is not None else None,
                "update_date": result['update_date'].strftime("%Y-%m-%d") if result['update_date'] is not None else None,
                "app_title": result['app_title'].decode('utf-8'),
                "apk_name": result['apk_name'].decode('utf-8'),
                "app_href": result['app_href'].decode('utf-8') if result['app_href'] is not None else None,
                "developer_name": result['developer_name'].decode('utf-8') if result['developer_name'] is not None else None,
                "type_name": result['type_name'].decode('utf-8'),
                "market_name": result['market_name'].decode('utf-8')
            })
        return results

    def delete_apk_by_update_id(self, update_id):
        """Soft-delete one update (sets is_delete; the row is kept)."""
        cursor = self.get_cursor()
        sql_str = "update `update` set is_delete=TRUE where update_id=%s;"
        try:
            cursor.execute(sql_str, (update_id,))
        except pymysql.Error as _err_:
            self.db.rollback()
            raise _err_
        else:
            self.db.commit()

    def delete_apk_by_hash(self, apk_hash):
        """Soft-delete every update whose apk_hash matches the given sha256 hash."""
        cursor = self.get_cursor()
        sql_str = "update `update` set is_delete=TRUE where apk_hash=unhex(%s);"
        try:
            cursor.execute(sql_str, (apk_hash,))
        except pymysql.Error as _err_:
            self.db.rollback()
            raise _err_
        else:
            self.db.commit()

    def get_all_sdk_level(self):
        """Return the distinct, sorted sdk_level values present in `update`."""
        cursor = self.get_cursor()
        sql_str = "select `sdk_level` from `update` group by sdk_level order by sdk_level;"
        cursor.execute(sql_str)
        results = cursor.fetchall()
        sdk_level_list = []
        for result in results:
            if result and result['sdk_level']:
                sdk_level_list.append(result['sdk_level'].decode('utf-8'))
        return sdk_level_list

    def get_all_authority(self):
        """Return all authorities as (authority_id, authority_name) tuples."""
        cursor = self.get_cursor()
        sql_str = "select authority_id, `authority_name` from `authority` order by authority_name;"
        cursor.execute(sql_str)
        results = cursor.fetchall()
        authority_list = []
        for result in results:
            if result and result['authority_name']:
                authority_list.append((result['authority_id'], result['authority_name'].decode('utf-8')))
        return authority_list

    def search_platform_not_delete(self):
        """Return the markets that still own at least one non-deleted update."""
        cursor = self.get_cursor()
        sql_str = "select market_id, `market_name` from `update` join app using(app_id) join market using(market_id) where is_delete=FALSE group by market_id order by market_name;"
        cursor.execute(sql_str)
def insert_app_from_file(self, market_name, apk_hash, app_title, apk_name, developer, app_type, version, size, update_date):
    """Insert an APK record imported from a local file.

    :return: False when an update with the same hash already exists for
        the given market, True after a successful insert.
    :raises pymysql.Error: when the stored procedure fails (the
        transaction is rolled back first).
    """
    cursor = self.get_cursor()
    # check the app whether in the market
    check_sql_str = "select update_id from `update` join app using(app_id) join market using(market_id) where market_name=%s and apk_hash=unhex(%s);"
    cursor.execute(check_sql_str, (market_name, apk_hash))
    if cursor.fetchall():
        return False

    # insert app
    try:
        cursor.callproc(
            "insert_app_from_file",
            (app_title, apk_name, developer, app_type, market_name, version, size, update_date, apk_hash)
        )
    except pymysql.Error as _err_:
        # BUG FIX: was `except pymysql.err`, which is a *module*, not an
        # exception class — an actual DB error would raise TypeError here
        # instead of being handled.  Siblings use pymysql.Error.
        self.db.rollback()
        raise _err_
    else:
        self.db.commit()

    return True
def insert_local_update_info(self, update_info: dict):
    """Write per-update local analysis flags back to the `update` table.

    :param update_info: mapping of update_id -> row dict holding the
        'malware', 'obfuscation', 'sdk_level' and 'is_delete' values
        (the shape produced by get_local_update_info).
    :raises Exception: re-raised after rollback when any UPDATE fails.
    """
    cursor = self.get_cursor()
    try:
        # BUG FIX: the original iterated `enumerate(update_info)`, which for
        # a dict yields (index, key) pairs — item_info was the update_id int
        # and item_info['malware'] crashed.  Iterate the mapping's items.
        for update_id, item_info in update_info.items():
            cursor.execute(
                "update `update` set malware=%s, obfuscation=%s, sdk_level=%s, is_delete=%s where update_id=%s;",
                (item_info['malware'], item_info['obfuscation'], item_info['sdk_level'], item_info['is_delete'], update_id)
            )
    except Exception as _err:
        self.db.rollback()
        raise _err
    else:
        self.db.commit()
def __init__(self):
    """Build the main crawler window: thread pool plus all signal wiring."""
    super(MainGUI, self).__init__()
    pool = QThreadPool()
    pool.globalInstance()  # NOTE(review): return value unused — looks like a no-op; confirm intent
    pool.setMaxThreadCount(8)
    self.thread_pool = pool

    # wire every sub-system in the same order the original used
    for setup_step in (
        self.bind_error,
        self.load_data,
        self.bind_scrapy,
        self.bind_add_apk,
        self.bind_search,
        self.bind_delete,
        self.bind_timer,
        self.check_value,
    ):
        setup_step()
def update_type(self, type_list):
    """Fill the APP-type combobox from (type_id, type_name) pairs.

    Keeps the ids in self.type_id_list so a combobox index can later be
    mapped back to a type_id.  Does nothing when the list is empty.
    """
    if not type_list:
        return
    ids, names = zip(*type_list)
    self.type_id_list = list(ids)
    self.type_combobox.addItems(names)
def add_apk_button_click(self):
    """Ask the user for a folder and launch the APK-import worker."""
    chosen_dir = QFileDialog.getExistingDirectory(
        self, "Choose APK Directory", os.path.join(__current_folder_path__, "../../")
    )
    if not chosen_dir:
        return  # dialog cancelled

    # swap the button for a progress bar while the import runs
    self.add_apk_button.setVisible(False)
    self.add_apk_progress_bar.setValue(0)
    self.add_apk_progress_bar.setVisible(True)

    worker = AddAPKThread()
    worker.transfer(self, chosen_dir)
    self.thread_pool.start(worker)
def bind_search(self):
    """Connect the search button, the three list-view columns and the info pane."""
    self.search_button.clicked.connect(self.search_click)

    # each column: result signal -> model refresh, plus a click handler
    tree_bindings = (
        (self.market_signal, self.update_market, self.first_file_tree, self.first_tree_click),
        (self.app_signal, self.update_app, self.second_file_tree, self.second_tree_click),
        (self.update_signal, self.update_update, self.third_file_tree, self.third_tree_click),
    )
    for result_signal, refresh_slot, tree_view, click_slot in tree_bindings:
        result_signal.connect(refresh_slot)
        tree_view.setEditTriggers(QAbstractItemView.NoEditTriggers)
        tree_view.clicked.connect(click_slot)

    self.update_information_signal.connect(self.update_information)

    self.bind_drag_search()
def update_market(self, market_list):
    """Show search results in the platform column and reset the other panes."""
    folder_icon = QIcon(os.path.join(__current_folder_path__, "./images/folder.png"))
    platform_model = QStandardItemModel()
    for market in market_list:
        platform_model.appendRow(QStandardItem(folder_icon, market['market_name']))
    self.market_list = market_list
    if self.market_model:
        self.market_model.deleteLater()
    self.market_model = platform_model
    self.first_file_tree.setModel(platform_model)
    self.first_file_tree.scrollTo(platform_model.index(0, 0))

    # swapping in fresh empty models clears the app and version columns
    empty_app_model = QStandardItemModel()
    self.app_list = []
    if self.app_model:
        self.app_model.deleteLater()
    self.app_model = empty_app_model
    self.second_file_tree.setModel(empty_app_model)

    empty_update_model = QStandardItemModel()
    self.update_list = []
    if self.update_model:
        self.update_model.deleteLater()
    self.update_model = empty_update_model
    self.third_file_tree.setModel(empty_update_model)

    # the detail pane always starts empty after a fresh search
    self.clear_apk_info_layout()
    self.inbox_update_id_list = []

    # not find any data
    if not market_list:
        QMessageBox().warning(self, "Not Found", "Not found any apk in database.", QMessageBox.Ok)
def second_tree_click(self):
    """Query the versions of the app selected in the second column."""
    row = self.second_file_tree.currentIndex().row()

    worker = SearchUpdateThread()
    worker.transfer(
        self,
        self.app_list[row]['app_id'],
        self.selected_sdk_name_list,
        self.selected_authority_id_list,
        self.selected_type_id_list,
    )
    if self.search_update_thread:
        # drop the previous, still-pending query
        try:
            self.thread_pool.cancel(self.search_update_thread)
        except RuntimeError:
            pass
    self.search_update_thread = worker
    self.thread_pool.start(worker)
def third_tree_click(self):
    """Load the detail information of the version selected in the third column."""
    row = self.third_file_tree.currentIndex().row()

    worker = SearchApkInfoByUpdateIdThread()
    worker.transfer(self, self.update_list[row]['update_id'])
    if self.search_update_info_thread:
        # drop the previous, still-pending query
        try:
            self.thread_pool.cancel(self.search_update_info_thread)
        except RuntimeError:
            pass
    self.search_update_info_thread = worker
    self.thread_pool.start(worker)

    # the detail pane is refilled when the result signal arrives
    self.clear_apk_info_layout()
    self.inbox_update_id_list = []
def drag_search(self, file_url):
    """Search the database for the APK file dropped onto the info pane."""
    worker = DragSearchThread()
    worker.transfer(self, file_url)
    if self.search_app_thread:
        # drop the previous, still-pending search
        try:
            self.thread_pool.cancel(self.search_app_thread)
        except RuntimeError:
            pass
    self.search_app_thread = worker
    self.thread_pool.start(worker)

    # deselect whatever version row was highlighted
    if self.third_file_tree.currentIndex().row() != -1:
        self.third_file_tree.setCurrentIndex(self.update_model.index(-1, -1))

    # empty the detail pane until the result arrives
    self.clear_apk_info_layout()
    self.inbox_update_id_list = []
def delete_from_folder_button_click(self):
    """Bulk-delete every APK found under a user-chosen folder.

    Asks for confirmation, then hands the folder to a MultiDeleteThread
    while the progress bar replaces the button.
    """
    dir_choose = QFileDialog.getExistingDirectory(self, "Choose APK Directory", os.path.join(__current_folder_path__, "../../"))
    if not dir_choose:
        return

    # FIX: confirmation text was ungrammatical ("Do confirm to delete folder '{}'?.")
    user_choose = QMessageBox.question(self, "Delete Confirm", "Do you confirm deleting folder '{}'?".format(dir_choose), QMessageBox.Yes | QMessageBox.Cancel, QMessageBox.Cancel)
    if user_choose == QMessageBox.Cancel:
        return

    # show the progress bar while the worker runs
    self.delete_from_folder_button.setVisible(False)
    self.delete_progress_bar.setValue(0)
    self.delete_progress_bar.setVisible(True)

    # start the delete thread
    delete_thread = MultiDeleteThread()
    delete_thread.transfer(self, dir_choose)
    self.thread_pool.start(delete_thread)
def add_new_timer(self, month, day, hour, minute, crawler_name):
    """Register a crontab entry that launches *crawler_name* periodically.

    A field of -1 means "every" and is mapped to '*'; all four fields at
    -1 is treated as a cancelled dialog and ignored.  Crontab scheduling
    is not implemented on Windows.
    """
    fields = [month, day, hour, minute]
    if fields == [-1, -1, -1, -1]:
        return
    month, day, hour, minute = ("*" if value == -1 else value for value in fields)

    if platform.system() == "Windows":
        print("Not imply in windows.")
    else:
        crawler_script_path = os.path.join(__current_folder_path__, "main.py")
        crontab_command = "{} {} --market_name {}".format(python_interface, crawler_script_path, crawler_name)
        crontab_time = "{} {} {} {} *".format(minute, hour, day, month)
        user_crontab = CronTab(user=True)
        job = user_crontab.new(command=crontab_command, comment="apk crawler job")
        job.setall(crontab_time)
        job.enable()
        user_crontab.write()

    self.update_timer()
def check_value(self):
    """Verify the runtime configuration.

    :return: True when the environment is usable, False after warning the
        user about a missing setting.
    """
    enviro = True
    if python_interface is None:
        # FIX: the module is settings.py (`from settings import python_interface`);
        # the old message pointed users at a non-existent 'setting.py'.
        QMessageBox.warning(self, "Python Interface Error", "Please set the 'python_interface' in settings.py.", QMessageBox.Ok, QMessageBox.Ok)
        enviro = False
    return enviro
def layout_init(self):
    """Build the fixed-size two-pane (search condition / file database) layout."""
    # overall window geometry
    height = 600
    width = 1400
    self.setFixedSize(width, height)
    self.setWindowTitle("Data GUI")

    # FIX: QFont's pointSize constructor parameter is an int; the original
    # passed 8.5 directly, which PyQt5's int overload may reject.  Route the
    # fractional size through setPointSizeF to keep the intended 8.5pt.
    app_font = QFont("Microsoft Yahei")
    app_font.setPointSizeF(8.5)
    self.setFont(app_font)

    # search condition on top, file browser below (stretch 1 : 15)
    root_layout = QVBoxLayout()
    body_widget = QGroupBox(title="Search Condition")
    bottom_widget = QGroupBox(title="File Database")
    root_layout.addWidget(body_widget)
    root_layout.addWidget(bottom_widget)
    root_layout.setStretch(0, 1)
    root_layout.setStretch(1, 15)

    # save the value
    self.body_widget = body_widget
    self.bottom_widget = bottom_widget

    # init the sub layout
    self.body_widget_init()
    self.bottom_widget_init()

    # show
    self.setLayout(root_layout)
def add_apk_layout_init(self):
    """Create the 'Import APK From Folder' button and its progress bar."""
    import_icon = QIcon(os.path.join(__current_folder_path__, "./images/folder_import.png"))
    add_apk_button = QPushButton(import_icon, "Import APK From Folder")
    add_apk_button.setFixedHeight(BUTTON_HEIGHT)

    add_apk_progress_bar = QProgressBar()
    add_apk_progress_bar.setObjectName("BlueProgressBar")
    add_apk_progress_bar.setStyleSheet(ProgressBarStyleSheet)
    add_apk_progress_bar.setVisible(False)  # only shown while an import runs

    for widget in (add_apk_button, add_apk_progress_bar):
        self.add_apk_layout.addWidget(widget)
    self.add_apk_layout.addStretch()

    # save the value
    self.add_apk_button = add_apk_button
    self.add_apk_progress_bar = add_apk_progress_bar
def type_layout_init(self):
    """Create the APP-type label and multi-select combobox."""
    label = QLabel("APP Type : ")
    combobox = ComboCheckBox()
    combobox.setFixedHeight(BUTTON_HEIGHT)
    combobox.setFixedWidth(180)
    self.type_layout.addWidget(label)
    self.type_layout.addWidget(combobox)

    # save the value
    self.type_label = label
    self.type_combobox = combobox
file_system_layout.setStretch(3, 12) 179 | self.bottom_widget.setLayout(file_system_layout) 180 | 181 | # save the value 182 | self.file_system_layout = file_system_layout 183 | self.first_file_tree = first_file_tree 184 | self.second_file_tree = second_file_tree 185 | self.third_file_layout = third_file_layout 186 | self.apk_info_scroll_area = apk_info_scroll_area 187 | 188 | # init the sub layout 189 | self.third_file_layout_init() 190 | self.apk_info_scroll_area_init() 191 | 192 | def third_file_layout_init(self): 193 | # set layout 194 | third_file_tree = QListView() 195 | delete_apk_button = QPushButton("Delete APK") 196 | delete_apk_button.setFixedHeight(BUTTON_HEIGHT) 197 | delete_from_folder_button = QPushButton("Delete APKs From Folder") 198 | delete_progress_bar = QProgressBar() 199 | delete_progress_bar.setObjectName("RedProgressBar") 200 | delete_progress_bar.setStyleSheet(ProgressBarStyleSheet) 201 | delete_progress_bar.setVisible(False) 202 | delete_from_folder_button.setFixedHeight(BUTTON_HEIGHT) 203 | self.third_file_layout.addWidget(third_file_tree) 204 | self.third_file_layout.addWidget(delete_apk_button) 205 | self.third_file_layout.addWidget(delete_from_folder_button) 206 | self.third_file_layout.addWidget(delete_progress_bar) 207 | 208 | # save the value 209 | self.third_file_tree = third_file_tree 210 | self.delete_apk_button = delete_apk_button 211 | self.delete_from_folder_button = delete_from_folder_button 212 | self.delete_progress_bar = delete_progress_bar 213 | 214 | def apk_info_scroll_area_init(self): 215 | # set the layout 216 | apk_info_widget = QWidget() 217 | apk_info_layout = QVBoxLayout() 218 | apk_info_widget.setLayout(apk_info_layout) 219 | widget_style = QPalette() 220 | widget_style.setColor(QPalette.Background, QtCore.Qt.white) 221 | apk_info_widget.setPalette(widget_style) 222 | apk_info_widget.setContentsMargins(0, 0, 0, 0) 223 | self.apk_info_scroll_area.setWidget(apk_info_widget) 224 | 
self.apk_info_scroll_area.setWidgetResizable(True) 225 | 226 | # save the value 227 | self.apk_info_widget = apk_info_widget 228 | self.apk_info_layout = apk_info_layout 229 | 230 | def clear_apk_info_layout(self): 231 | for _i_ in range(self.apk_info_layout.count()): 232 | tmp_widget = self.apk_info_layout.itemAt(_i_).widget() 233 | tmp_widget.deleteLater() 234 | sip.delete(tmp_widget) 235 | 236 | 237 | class DataProcess(DataGUI): 238 | # Main GUI for data module 239 | sdk_level_signal = QtCore.pyqtSignal(list) 240 | authority_signal = QtCore.pyqtSignal(list) 241 | type_signal = QtCore.pyqtSignal(list) 242 | add_apk_signal = pyqtSignal(int, int, int) 243 | add_progress_signal = pyqtSignal(float) 244 | market_signal = pyqtSignal(list) 245 | app_signal = pyqtSignal(list) 246 | update_signal = pyqtSignal(list) 247 | update_information_signal = pyqtSignal(list) 248 | delete_apk_signal = pyqtSignal() 249 | delete_progress_signal = pyqtSignal(float) 250 | error_signal = pyqtSignal(str) 251 | 252 | market_model = None 253 | app_model = None 254 | update_model = None 255 | 256 | search_app_thread = None 257 | search_platform_thread = None 258 | search_update_thread = None 259 | search_update_info_thread = None 260 | 261 | inbox_update_id_list = [] 262 | 263 | def __init__(self): 264 | super(DataProcess, self).__init__() 265 | self.thread_pool = QThreadPool() 266 | self.thread_pool.globalInstance() 267 | self.thread_pool.setMaxThreadCount(8) 268 | 269 | self.bind_error() 270 | self.load_data() 271 | self.bind_add_apk() 272 | self.bind_search() 273 | self.bind_delete() 274 | self.check_value() 275 | 276 | """ 277 | 加载ComboBox数据 278 | """ 279 | 280 | def load_data(self): 281 | # sdk level combobox 282 | sdk_thread = SDKLevelThread() 283 | sdk_thread.transfer(self) 284 | self.sdk_level_signal.connect(self.update_sdk) 285 | self.thread_pool.start(sdk_thread) 286 | 287 | # authority combobox 288 | authority_thread = AuthorityThread() 289 | authority_thread.transfer(self) 290 | 
self.authority_signal.connect(self.update_authority) 291 | self.thread_pool.start(authority_thread) 292 | 293 | # type combobox 294 | type_thread = TypeThread() 295 | type_thread.transfer(self) 296 | self.type_signal.connect(self.update_type) 297 | self.thread_pool.start(type_thread) 298 | 299 | def update_sdk(self, sdk_list): 300 | sdk_list = ["UNKNOWN"] + sdk_list 301 | self.sdk_list = sdk_list 302 | self.sdk_combobox.addItems(sdk_list) 303 | 304 | def update_authority(self, authority_list): 305 | if not authority_list: 306 | return 307 | authority_index_list, authority_name_list = zip(*authority_list) 308 | authority_name_list = list(authority_name_list) 309 | self.authority_id_list = list(authority_index_list) 310 | self.authority_combobox.addItems(authority_name_list) 311 | 312 | def update_type(self, type_list): 313 | if not type_list: 314 | return 315 | type_index_list, type_name_list = zip(*type_list) 316 | self.type_id_list = list(type_index_list) 317 | self.type_combobox.addItems(type_name_list) 318 | 319 | """ 320 | 查找 321 | """ 322 | 323 | def bind_search(self): 324 | self.search_button.clicked.connect(self.search_click) 325 | 326 | self.market_signal.connect(self.update_market) 327 | self.first_file_tree.setEditTriggers(QAbstractItemView.NoEditTriggers) 328 | self.first_file_tree.clicked.connect(self.first_tree_click) 329 | 330 | self.app_signal.connect(self.update_app) 331 | self.second_file_tree.setEditTriggers(QAbstractItemView.NoEditTriggers) 332 | self.second_file_tree.clicked.connect(self.second_tree_click) 333 | 334 | self.update_signal.connect(self.update_update) 335 | self.third_file_tree.setEditTriggers(QAbstractItemView.NoEditTriggers) 336 | self.third_file_tree.clicked.connect(self.third_tree_click) 337 | 338 | self.update_information_signal.connect(self.update_information) 339 | 340 | self.bind_drag_search() 341 | 342 | def search_click(self): 343 | # get the combobox value 344 | sdk_name_list = self.sdk_combobox.get_select_text() 345 | 
authority_select_list = self.authority_combobox.get_select_index() 346 | app_type_select_list = self.type_combobox.get_select_index() 347 | 348 | # 选中全部sdk 或 未选中任何sdk时,无需筛选 349 | if len(sdk_name_list) == len(self.sdk_list) or len(sdk_name_list) == 0: 350 | sdk_name_list = None 351 | self.selected_sdk_name_list = sdk_name_list 352 | 353 | # 未选中任何authority时,无需筛选 354 | if len(authority_select_list) == 0: 355 | authority_id_list = None 356 | else: 357 | authority_id_list = [self.authority_id_list[_index_] for _index_ in authority_select_list] 358 | self.selected_authority_id_list = authority_id_list 359 | 360 | # 未选中任何type 或 选中全部type时, 无需筛选 361 | if len(app_type_select_list) == 0 or len(app_type_select_list) == len(self.type_id_list): 362 | type_id_list = None 363 | else: 364 | type_id_list = [self.type_id_list[_index_] for _index_ in app_type_select_list] 365 | self.selected_type_id_list = type_id_list 366 | 367 | search_platform_thread = SearchPlatformThread() 368 | search_platform_thread.transfer(self, sdk_name_list, authority_id_list, type_id_list) 369 | if self.search_platform_thread: 370 | # 取消上一个请求 371 | try: 372 | self.thread_pool.cancel(self.search_platform_thread) 373 | except RuntimeError: 374 | pass 375 | self.search_platform_thread = search_platform_thread 376 | self.thread_pool.start(search_platform_thread) 377 | 378 | def update_market(self, market_list): 379 | platform_model = QStandardItemModel() 380 | icon_path = os.path.join(__current_folder_path__, "./images/folder.png") 381 | for market in market_list: 382 | platform_model.appendRow(QStandardItem(QIcon(icon_path), market['market_name'])) 383 | self.market_list = market_list 384 | if self.market_model: 385 | self.market_model.deleteLater() 386 | self.market_model = platform_model 387 | self.first_file_tree.setModel(platform_model) 388 | self.first_file_tree.scrollTo(platform_model.index(0, 0)) 389 | 390 | # clear the second tree 391 | app_model = QStandardItemModel() 392 | self.app_list = [] 393 | 
if self.app_model: 394 | self.app_model.deleteLater() 395 | self.app_model = app_model 396 | self.second_file_tree.setModel(app_model) 397 | 398 | # clear the third tree 399 | update_model = QStandardItemModel() 400 | self.update_list = [] 401 | if self.update_model: 402 | self.update_model.deleteLater() 403 | self.update_model = update_model 404 | self.third_file_tree.setModel(update_model) 405 | 406 | # clear the information box 407 | self.clear_apk_info_layout() 408 | self.inbox_update_id_list = [] 409 | 410 | # not find any data 411 | if len(market_list) == 0: 412 | QMessageBox().warning(self, "Not Found", "Not found any apk in database.", QMessageBox.Ok) 413 | 414 | def first_tree_click(self): 415 | current_row_index = self.first_file_tree.currentIndex().row() 416 | 417 | search_app_thread = SearchAppThread() 418 | search_app_thread.transfer(self, self.market_list[current_row_index]['market_id'], self.selected_sdk_name_list, self.selected_authority_id_list, self.selected_type_id_list) 419 | if self.search_app_thread: 420 | # 取消上一个请求 421 | try: 422 | self.thread_pool.cancel(self.search_app_thread) 423 | except RuntimeError: 424 | pass 425 | self.search_app_thread = search_app_thread 426 | self.thread_pool.start(search_app_thread) 427 | 428 | def update_app(self, app_list): 429 | app_model = QStandardItemModel() 430 | icon_path = os.path.join(__current_folder_path__, "./images/android.png") 431 | for app in app_list: 432 | app_model.appendRow(QStandardItem(QIcon(icon_path), app['app_title'])) 433 | self.app_list = app_list 434 | if self.app_model: 435 | self.app_model.deleteLater() 436 | self.app_model = app_model 437 | self.second_file_tree.setModel(app_model) 438 | self.second_file_tree.scrollTo(app_model.index(0, 0)) 439 | 440 | def second_tree_click(self): 441 | current_row_index = self.second_file_tree.currentIndex().row() 442 | 443 | search_update_thread = SearchUpdateThread() 444 | search_update_thread.transfer(self, 
self.app_list[current_row_index]['app_id'], self.selected_sdk_name_list, self.selected_authority_id_list, self.selected_type_id_list) 445 | if self.search_update_thread: 446 | # 取消上一个请求 447 | try: 448 | self.thread_pool.cancel(self.search_update_thread) 449 | except RuntimeError: 450 | pass 451 | self.search_update_thread = search_update_thread 452 | self.thread_pool.start(search_update_thread) 453 | 454 | def update_update(self, update_list): 455 | update_model = QStandardItemModel() 456 | icon_path = os.path.join(__current_folder_path__, "./images/version.png") 457 | for update in update_list: 458 | version = update['version'].split('.apk')[0] if update['version'].endswith('.apk') else update['version'] 459 | version = update['version'].split('.xapk')[0] if update['version'].endswith('.xapk') else version 460 | update_model.appendRow(QStandardItem(QIcon(icon_path), version)) 461 | self.update_list = update_list 462 | if self.update_model: 463 | self.update_model.deleteLater() 464 | self.update_model = update_model 465 | self.third_file_tree.setModel(update_model) 466 | self.third_file_tree.scrollTo(update_model.index(0, 0)) 467 | 468 | def third_tree_click(self): 469 | current_third_tree_row_index = self.third_file_tree.currentIndex().row() 470 | 471 | search_apk_info_by_update_id_thread = SearchApkInfoByUpdateIdThread() 472 | search_apk_info_by_update_id_thread.transfer(self, self.update_list[current_third_tree_row_index]['update_id']) 473 | if self.search_update_info_thread: 474 | # 取消上一个请求 475 | try: 476 | self.thread_pool.cancel(self.search_update_info_thread) 477 | except RuntimeError: 478 | pass 479 | self.search_update_info_thread = search_apk_info_by_update_id_thread 480 | self.thread_pool.start(search_apk_info_by_update_id_thread) 481 | 482 | # clear the information box 483 | self.clear_apk_info_layout() 484 | self.inbox_update_id_list = [] 485 | 486 | def update_information(self, information_list): 487 | self.clear_apk_info_layout() 488 | 
inbox_update_id_list = [] 489 | for information in information_list: # add the new information widget 490 | information['market'] = information['market_name'] 491 | update_folder = get_app_folder(information) 492 | image_file_list = glob.glob(os.path.join(update_folder, "*.jpg")) 493 | description_file = os.path.join(update_folder, "description.txt") 494 | if os.path.exists(description_file): 495 | with open(description_file, 'r') as _file_: 496 | description = _file_.read().strip() 497 | if description != "": 498 | information['description'] = description 499 | information['image_file_list'] = image_file_list 500 | new_information_widget = InformationWidget() 501 | new_information_widget.load_data(information) 502 | self.apk_info_layout.addWidget(new_information_widget) 503 | inbox_update_id_list.append(information['update_id']) 504 | self.inbox_update_id_list = inbox_update_id_list 505 | 506 | """ 507 | 拖动查找 508 | """ 509 | 510 | def bind_drag_search(self): 511 | self.apk_info_scroll_area.file_signal.connect(self.drag_search) 512 | 513 | def drag_search(self, file_url): 514 | drag_search_thread = DragSearchThread() 515 | drag_search_thread.transfer(self, file_url) 516 | if self.search_app_thread: 517 | # 取消上一个进程 518 | try: 519 | self.thread_pool.cancel(self.search_app_thread) 520 | except RuntimeError: 521 | pass 522 | self.search_app_thread = drag_search_thread 523 | self.thread_pool.start(drag_search_thread) 524 | 525 | # uncheck the third tree 526 | row_index = self.third_file_tree.currentIndex().row() 527 | if row_index != -1: 528 | self.third_file_tree.setCurrentIndex(self.update_model.index(-1, -1)) 529 | 530 | # clear the information box 531 | self.clear_apk_info_layout() 532 | self.inbox_update_id_list = [] 533 | 534 | """ 535 | 删除 536 | """ 537 | 538 | def bind_delete(self): 539 | self.delete_apk_button.clicked.connect(self.delete_apk_button_click) 540 | self.delete_apk_signal.connect(self.delete_apk_success) 541 | 
self.delete_from_folder_button.clicked.connect(self.delete_from_folder_button_click) 542 | 543 | def delete_apk_button_click(self): 544 | if not self.inbox_update_id_list: 545 | return 546 | 547 | # clear the information box 548 | self.clear_apk_info_layout() 549 | inbox_update_id_list = self.inbox_update_id_list 550 | self.inbox_update_id_list = [] 551 | 552 | # check the third file system tree 553 | in_third_tree = False 554 | reserved_update_list = [] # type: List[Dict] 555 | for update in self.update_list: 556 | if update['update_id'] in inbox_update_id_list: 557 | in_third_tree = True 558 | else: 559 | reserved_update_list.append(update) 560 | if in_third_tree: 561 | new_update_model = QStandardItemModel() 562 | icon_path = os.path.join(__current_folder_path__, "./images/version.png") 563 | for update in reserved_update_list: 564 | version = update['version'].split('.apk')[0] if update['version'].endswith('.apk') else update['version'] 565 | version = update['version'].split('.xapk')[0] if update['version'].endswith('.xapk') else version 566 | new_update_model.appendRow(QStandardItem(QIcon(icon_path), version)) 567 | self.update_list = reserved_update_list 568 | if self.update_model: 569 | self.update_model.deleteLater() 570 | self.update_model = new_update_model 571 | self.third_file_tree.setModel(new_update_model) 572 | self.third_file_tree.scrollTo(new_update_model.index(0, 0)) 573 | 574 | # start the delete thread 575 | delete_apk_thread = DeleteApkThread() 576 | delete_apk_thread.transfer(self, inbox_update_id_list) 577 | self.thread_pool.start(delete_apk_thread) 578 | 579 | def delete_apk_success(self): 580 | QMessageBox.information(self, "Delete successfully", "Successfully delete APK(s).", QMessageBox.Yes) 581 | 582 | self.delete_from_folder_button.setVisible(True) 583 | self.delete_progress_bar.setValue(0) 584 | self.delete_progress_bar.setVisible(False) 585 | 586 | def delete_from_folder_button_click(self): 587 | dir_choose = 
QFileDialog.getExistingDirectory(self, "Choose APK Directory", os.path.join(__current_folder_path__, "../../")) 588 | if not dir_choose: 589 | return 590 | 591 | user_choose = QMessageBox.question(self, "Delete Confirm", "Do confirm to delete folder '{}'?.".format(dir_choose), QMessageBox.Yes | QMessageBox.Cancel, QMessageBox.Cancel) 592 | if user_choose == QMessageBox.Cancel: 593 | return 594 | 595 | # set the layout 596 | self.delete_from_folder_button.setVisible(False) 597 | self.delete_progress_bar.setValue(0) 598 | self.delete_progress_bar.setVisible(True) 599 | 600 | # start the delete thread 601 | delete_thread = MultiDeleteThread() 602 | delete_thread.transfer(self, dir_choose) 603 | self.thread_pool.start(delete_thread) 604 | 605 | """ 606 | 错误 607 | """ 608 | 609 | def bind_error(self): 610 | self.error_signal.connect(self.catch_error) 611 | 612 | def catch_error(self, _err_: str): 613 | log_file = os.path.join(__current_folder_path__, "../../log/main_gui.{}.log".format(datetime.datetime.now().strftime("%Y-%m-%d-%H"))) 614 | with open(log_file, 'a') as _file_: 615 | _file_.write(_err_) 616 | 617 | def check_value(self): 618 | enviro = True 619 | if python_interface is None: 620 | QMessageBox.warning(self, "Python Interface Error", "Please set the 'python_interface' in setting.py.", QMessageBox.Ok, QMessageBox.Ok) 621 | enviro = False 622 | return enviro 623 | 624 | """ 625 | 批量添加APK 626 | """ 627 | 628 | def bind_add_apk(self): 629 | self.add_progress_signal.connect(self.add_apk_progress_bar.setValue) 630 | self.add_apk_signal.connect(self.add_apk_success) 631 | self.add_apk_button.clicked.connect(self.add_apk_button_click) 632 | 633 | def add_apk_button_click(self): 634 | dir_choose = QFileDialog.getExistingDirectory(self, "Choose APK Directory", os.path.join(__current_folder_path__, "../../")) 635 | if not dir_choose: 636 | return 637 | self.add_apk_button.setVisible(False) 638 | self.add_apk_progress_bar.setValue(0) 639 | 
self.add_apk_progress_bar.setVisible(True) 640 | add_apk_thread = AddAPKThread() 641 | add_apk_thread.transfer(self, dir_choose) 642 | self.thread_pool.start(add_apk_thread) 643 | 644 | def add_apk_success(self, success_number, repeated_number, error_number): 645 | QMessageBox.information(self, "Add APK", "Successfully add APKs. {} success, {} Repeated and {} error.".format(success_number, repeated_number, error_number), QMessageBox.Ok, QMessageBox.Ok) 646 | 647 | self.add_apk_button.setVisible(True) 648 | self.add_apk_progress_bar.setVisible(False) 649 | self.add_apk_progress_bar.setValue(0) 650 | 651 | 652 | if __name__ == '__main__': 653 | app = QApplication(sys.argv) 654 | data_gui = DataProcess() 655 | data_gui.show() 656 | sys.exit(app.exec_()) 657 | --------------------------------------------------------------------------------