├── .gitignore
├── README.md
├── crawler
│   ├── android_apps_crawler
│   │   ├── __init__.py
│   │   ├── custom_parser.py
│   │   ├── items.py
│   │   ├── middlewares.py
│   │   ├── pipelines.py
│   │   ├── settings.py
│   │   └── spiders
│   │       ├── __init__.py
│   │       └── android_apps_spider.py
│   ├── crawl.sh
│   └── scrapy.cfg
├── downloader
│   └── downloader.py
└── repo
    ├── apps
    │   └── README.md
    └── databases
        └── README.md

/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | .scrapy
3 | repo/databases
4 | repo/apps
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Android Apps Crawler
2 | ====================
3 | 
4 | Overview
5 | --------
6 | Android Apps Crawler is an extensible crawler for downloading Android applications from third-party markets.
7 | It crawls the download URLs of applications and automatically downloads the applications
8 | into a repository.
9 | 
10 | Requirements
11 | ------------
12 | * Python 2.6 or up
13 | * Scrapy 0.22 or up: http://scrapy.org (not fully tested on lower versions)
14 | * Works on Linux, Windows, Mac OS X, BSD
15 | * Currently, the downloader does not work on Windows.
16 | * For Ubuntu users, "Don't use the python-scrapy package provided by Ubuntu,
17 | they are typically too old and slow to catch up with latest Scrapy.
18 | Instead, use the official [Ubuntu Packages](http://doc.scrapy.org/en/latest/topics/ubuntu.html#topics-ubuntu)."
19 | 
20 | Usage
21 | -----
22 | * Set the third-party markets you want to crawl in settings.py
23 | * Set the proxy if you have one
24 | * Start the crawler:
25 | ```
26 | ./crawl.sh <market name> [database output directory]
27 | ```
28 | * Start the downloader:
29 | ```
30 | ./downloader.py <database file> <output directory>
31 | ```
32 | 
33 | Settings
34 | --------
35 | You can set the proxy, user agent, database name, etc. in the ```crawler/android_apps_crawler/settings.py``` file.
36 | 
37 | Supported Third-party Markets (market names used in crawl.sh)
38 | -----------------------------
39 | * AppChina: http://www.appchina.com (appchina.com)
40 | * Hiapk: http://apk.hiapk.com (hiapk.com)
41 | * Anzhi: http://www.anzhi.com (anzhi.com)
42 | * android.d.cn: http://android.d.cn (android.d.cn)
43 | * mumayi: http://www.mumayi.com (mumayi.com)
44 | * gfan: http://apk.gfan.com (gfan.com)
45 | * nduoa: http://www.nduoa.com (nduoa.com)
46 | * 3gyu: http://www.3gyu.com (3gyu.com)
47 | * angeeks: http://apk.angeeks.com (angeeks.com)
48 | * appfun: http://www.appfun.cn (appfun.cn)
49 | * jimi168: http://www.jimi168.com (jimi168.com)
50 | * More markets to be added...
51 | 
52 | More Android Markets
53 | --------------------
54 | See: https://github.com/mssun/android-markets-list
55 | 
56 | TODO
57 | ----
58 | * Windows support for the downloader.
59 | * Crawl apps from shared cloud storage links (e.g., pan.baidu.com, dbank.com).
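Example
-------
A minimal end-to-end session (a sketch assuming the default repository layout; the appchina.com
market is only an illustrative choice):
```
cd crawler
./crawl.sh appchina.com
# crawled download URLs accumulate in ../repo/databases/appchina.com.db
cd ../downloader
./downloader.py ../repo/databases/appchina.com.db ../repo/apps
# finished downloads are saved as <md5>.apk under ../repo/apps
```
Because crawl.sh passes -s JOBDIR=job_<market name> to Scrapy, an interrupted crawl can be resumed
by running the same command again.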
60 | -------------------------------------------------------------------------------- /crawler/android_apps_crawler/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mssun/android-apps-crawler/77a88c31e93fcf50fc24e01f27911622214fc820/crawler/android_apps_crawler/__init__.py -------------------------------------------------------------------------------- /crawler/android_apps_crawler/custom_parser.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from scrapy.selector import Selector 4 | from android_apps_crawler.items import AppItem 5 | 6 | def parse_anzhi(response): 7 | xpath = "//div[@class='detail_down']/a/@onclick" 8 | appItemList = [] 9 | sel = Selector(response) 10 | for script in sel.xpath(xpath).extract(): 11 | id = re.search(r"\d+", script).group() 12 | url = "http://www.anzhi.com/dl_app.php?s=%s&n=5" % (id,) 13 | appItem = AppItem() 14 | appItem['url'] = url 15 | appItemList.append(appItem) 16 | return appItemList 17 | 18 | -------------------------------------------------------------------------------- /crawler/android_apps_crawler/items.py: -------------------------------------------------------------------------------- 1 | from scrapy.item import Item, Field 2 | 3 | class AppItem(Item): 4 | url = Field() 5 | -------------------------------------------------------------------------------- /crawler/android_apps_crawler/middlewares.py: -------------------------------------------------------------------------------- 1 | from scrapy.exceptions import IgnoreRequest 2 | 3 | class DownloaderMiddleware(object): 4 | def process_request(self, request, spider): 5 | if (spider.settings['PROXIES']): 6 | request.meta["proxy"] = spider.settings['PROXIES']['http'] 7 | if request.url[-3:].lower() in ["apk", "png", "jpg", "exe", "doc", 8 | "zip", "rar"]: 9 | print "Ignore request!" 
10 | raise IgnoreRequest 11 | 12 | -------------------------------------------------------------------------------- /crawler/android_apps_crawler/pipelines.py: -------------------------------------------------------------------------------- 1 | from scrapy import signals 2 | from scrapy.xlib.pydispatch import dispatcher 3 | from scrapy import log 4 | 5 | import sqlite3 6 | from os import path 7 | from android_apps_crawler import settings 8 | 9 | 10 | class AppPipeline(object): 11 | def process_item(self, item, spider): 12 | log.msg("Catch an AppItem", level=log.INFO) 13 | return item 14 | 15 | class SQLitePipeline(object): 16 | filename = '' 17 | conn = None 18 | def __init__(self): 19 | self.filename += settings.MARKET_NAME 20 | self.filename += ".db" 21 | self.filename = path.join(settings.DATABASE_DIR, self.filename) 22 | print self.filename 23 | self.conn = None 24 | dispatcher.connect(self.initialize, signals.engine_started) 25 | dispatcher.connect(self.finalize, signals.engine_stopped) 26 | 27 | def process_item(self, item, spider): 28 | try: 29 | self.conn.execute('insert into apps(url) values(?)', 30 | (item['url'],) 31 | ) 32 | self.conn.commit() 33 | log.msg("Inserting into database"); 34 | except sqlite3.IntegrityError: 35 | print "Duplicated" 36 | return item 37 | 38 | def initialize(self): 39 | if path.exists(self.filename): 40 | self.conn = sqlite3.connect(self.filename) 41 | else: 42 | self.create_table() 43 | self.conn.execute("PRAGMA journal_mode=WAL;") 44 | self.conn.commit() 45 | 46 | def finalize(self): 47 | if self.conn is not None: 48 | self.conn.commit() 49 | self.conn.close() 50 | self.conn = None 51 | 52 | def create_table(self): 53 | self.conn = sqlite3.connect(self.filename) 54 | self.conn.execute("create table apps( \ 55 | id integer primary key autoincrement, \ 56 | url varchar(100) not null unique, \ 57 | downloaded int default 0)" 58 | ) 59 | self.conn.commit() 60 | -------------------------------------------------------------------------------- /crawler/android_apps_crawler/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for android_apps_crawler project 2 | # 3 | # For simplicity, this file contains only the most important settings by 4 | # default. All the other settings are documented here: 5 | # 6 | # http://doc.scrapy.org/topics/settings.html 7 | # 8 | 9 | BOT_NAME = 'android_apps_crawler' 10 | 11 | SPIDER_MODULES = ['android_apps_crawler.spiders'] 12 | NEWSPIDER_MODULE = 'android_apps_crawler.spiders' 13 | USER_AGENT = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11(KHTML, like Gecko) Chrome/23.0.1271.97 Safari/537.11" 14 | ITEM_PIPELINES = { 15 | 'android_apps_crawler.pipelines.AppPipeline': 1, 16 | 'android_apps_crawler.pipelines.SQLitePipeline': 2, 17 | } 18 | LOG_LEVEL = 'INFO' 19 | DOWNLOADER_MIDDLEWARES = { 20 | 'android_apps_crawler.middlewares.DownloaderMiddleware': 1, 21 | } 22 | 23 | # Uncomment following statement to use proxy. 
24 | # PROXIES = { 25 | # 'http' : '', 26 | # } 27 | 28 | ALLOWED_DOMAINS = { 29 | "appchina.com" : ["appchina.com",], 30 | "hiapk.com" : ["hiapk.com",], 31 | "anzhi.com" : ["anzhi.com",], 32 | "android.d.cn" : ["android.d.cn",], 33 | "mumayi.com" : ["mumayi.com",], 34 | "gfan.com" : ["apk.gfan.com",], 35 | "nduoa.com" : ["nduoa.com",], 36 | "3gyu.com" : ["3gyu.com",], 37 | "angeeks.com" : ["angeeks.com",], 38 | "appfun.cn" : ["appfun.cn",], 39 | "jimi168.com" : ["jimi168.com",], 40 | "7723.com" : ["7723.com",], 41 | "777ccc.com" : ["777ccc.com",], 42 | "anruan.com" : ["anruan.com",], 43 | } 44 | START_URLS = { 45 | "appchina.com" : ["http://www.appchina.com",], 46 | "hiapk.com" : ["http://apk.hiapk.com",], 47 | "anzhi.com" : ["http://www.anzhi.com",], 48 | "android.d.cn" : ["http://android.d.cn",], 49 | "mumayi.com" : ["http://www.mumayi.com",], 50 | "gfan.com" : ["http://apk.gfan.com",], 51 | "nduoa.com" : ["http://www.nduoa.com",], 52 | "3gyu.com" : ["http://www.3gyu.com",], 53 | "angeeks.com" : ["http://www.angeeks.com",], 54 | "appfun.cn" : ["http://www.appfun.cn",], 55 | "jimi168.com" : ["http://www.jimi168.com/",], 56 | "7723.com" : ["http://www.7723.com",], 57 | "777ccc.com" : ["http://www.777ccc.com",], 58 | "anruan.com" : ["http://www.anruan.com",], 59 | } 60 | SCRAPE_RULES = { 61 | "xpath" : { 62 | "appchina" : "//a[@class='download-pc fl']/@href", 63 | "hiapk" : "//a[@class='link_btn']/@href", 64 | "android.d.cn" : "//a[@class='localDownload']/@href", 65 | "mumayi" : "//a[@class='download fl']/@href", 66 | "gfan" : "//a[@id='computerLoad']/@href", 67 | "nduoa" : "//a[@class='d_pc_normal']/@href", 68 | "3gyu" : "//a[@class='ldownload']/@href", 69 | "angeeks" : "//div[@class='rgmainsrimg'][1]/a/@href", 70 | "appfun" : "//a[@class='downcp']/@href", 71 | "jimi168" : "//a[@class='a_sign2']/@href", 72 | "7723" : "//ul[@class='download_list']/li/h5/a/@href", 73 | "777ccc" : "//a[@class='downtopc']/@href", 74 | "anruan" : "//a[@class='ldownload']/@href", 75 | }, 76 | "custom_parser" : { 77 | "anzhi" : "parse_anzhi", 78 | }, 79 | } 80 | DATABASE_DIR = "../repo/databases/" 81 | MARKET_NAME = "" 82 | -------------------------------------------------------------------------------- /crawler/android_apps_crawler/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # To create the first spider for your project use this command: 4 | # 5 | # scrapy genspider myspider myspider-domain.com 6 | # 7 | # For more info see: 8 | # http://doc.scrapy.org/topics/spiders.html 9 | -------------------------------------------------------------------------------- /crawler/android_apps_crawler/spiders/android_apps_spider.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from scrapy.spider import Spider 4 | from scrapy.selector import Selector 5 | from scrapy.http import Request 6 | from scrapy.http import HtmlResponse 7 | from scrapy import log 8 | 9 | from urlparse import urlparse 10 | from urlparse import urljoin 11 | 12 | from android_apps_crawler.items import AppItem 13 | from android_apps_crawler import settings 14 | from android_apps_crawler import custom_parser 15 | 16 | 17 | class AndroidAppsSpider(Spider): 18 | name = "android_apps_spider" 19 | scrape_rules = settings.SCRAPE_RULES 20 | 21 | def __init__(self, market=None, database_dir="../repo/databases/", *args, **kwargs): 22 | super(AndroidAppsSpider, 
self).__init__(*args, **kwargs)
23 |         self.allowed_domains = settings.ALLOWED_DOMAINS[market]
24 |         self.start_urls = settings.START_URLS[market]
25 |         settings.MARKET_NAME = market
26 |         settings.DATABASE_DIR = database_dir
27 | 
28 |     def parse(self, response):
29 |         response_domain = urlparse(response.url).netloc
30 |         appItemList = []
31 |         cookie = {}
32 |         xpath_rule = self.scrape_rules['xpath']
33 |         for key in xpath_rule.keys():
34 |             if key in response_domain:
35 |                 appItemList.extend(
36 |                     self.parse_xpath(response, xpath_rule[key]))
37 |                 break
38 |         custom_parser_rule = self.scrape_rules['custom_parser']
39 |         for key in custom_parser_rule.keys():
40 |             if key in response_domain:
41 |                 appItemList.extend(
42 |                     getattr(custom_parser, custom_parser_rule[key])(response))
43 |                 break
44 |         #if "appchina" in response_domain:
45 |         #    xpath = "//a[@id='pc-download' and @class='free']/@href"
46 |         #    appItemList.extend(self.parse_xpath(response, xpath))
47 |         #elif "hiapk" in response_domain:
48 |         #    xpath = "//a[@class='linkbtn d1']/@href"
49 |         #    appItemList.extend(self.parse_xpath(response, xpath))
50 |         #elif "android.d.cn" in response_domain:
51 |         #    xpath = "//a[@class='down']/@href"
52 |         #    appItemList.extend(self.parse_xpath(response, xpath))
53 |         #elif "anzhi" in response_domain:
54 |         #    xpath = "//div[@id='btn']/a/@onclick"
55 |         #    appItemList.extend(self.parse_anzhi(response, xpath))
56 |         #else:
57 |         #    pass
58 |         sel = Selector(response)
59 |         for url in sel.xpath('//a/@href').extract():
60 |             url = urljoin(response.url, url)
61 |             yield Request(url, meta=cookie, callback=self.parse)
62 | 
63 |         for item in appItemList:
64 |             yield item
65 | 
66 | 
67 |     #def parse_appchina(self, response):
68 |     #    appItemList = []
69 |     #    hxs = HtmlXPathSelector(response)
70 |     #    for url in hxs.select(
71 |     #            "//a[@id='pc-download' and @class='free']/@href"
72 |     #            ).extract():
73 |     #        url = urljoin(response.url, url)
74 |     #        log.msg("Catch an application: %s" % url, level=log.INFO)
75 |     #        appItem = AppItem()
76 |     #        appItem['url'] = url
77 |     #        appItemList.append(appItem)
78 |     #    return appItemList
79 | 
80 |     def parse_xpath(self, response, xpath):
81 |         appItemList = []
82 |         sel = Selector(response)
83 |         for url in sel.xpath(xpath).extract():
84 |             url = urljoin(response.url, url)
85 |             log.msg("Catch an application: %s" % url, level=log.INFO)
86 |             appItem = AppItem()
87 |             appItem['url'] = url
88 |             appItemList.append(appItem)
89 |         return appItemList
90 | 
91 |     #def parse_anzhi(self, response, xpath):
92 |     #    appItemList = []
93 |     #    hxs = HtmlXPathSelector(response)
94 |     #    for script in hxs.select(xpath).extract():
95 |     #        id = re.search(r"\d+", script).group()
96 |     #        url = "http://www.anzhi.com/dl_app.php?s=%s&n=5" % (id,)
97 |     #        appItem = AppItem()
98 |     #        appItem['url'] = url
99 |     #        appItemList.append(appItem)
100 |     #    return appItemList
101 | 
102 | 
103 | 
--------------------------------------------------------------------------------
/crawler/crawl.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | if [ $# -le 0 ]
3 | then
4 |     echo "Usage: $0 <market name> [database output directory]"
5 |     echo "  market name:"
6 |     echo "    appchina.com"
7 |     echo "    hiapk.com"
8 |     echo "    anzhi.com"
9 |     echo "    android.d.cn"
10 |     echo "    mumayi.com"
11 |     echo "    gfan.com"
12 |     echo "    nduoa.com"
13 |     echo "    3gyu.com"
14 |     echo "    angeeks.com"
15 |     echo "    appfun.cn"
16 |     echo "    jimi168.com"
17 |     echo "  database output directory:"
18 |     echo "    default: ../repo/databases/"
19 |     exit 2
20 | fi
21 | 
22 | if [ $# -eq 1 ]
23 | then
24 | 
scrapy crawl android_apps_spider -s JOBDIR=job_$1 -a market=$1 25 | else 26 | scrapy crawl android_apps_spider -s JOBDIR=job_$1 -a market=$1 -a database_dir=$2 27 | fi 28 | -------------------------------------------------------------------------------- /crawler/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/topics/scrapyd.html 5 | 6 | [settings] 7 | default = android_apps_crawler.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = android_apps_crawler 12 | -------------------------------------------------------------------------------- /downloader/downloader.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | import sys 4 | import threading 5 | import sqlite3 6 | import time 7 | import urllib2 8 | import os 9 | import hashlib 10 | import signal 11 | import Queue 12 | 13 | NUM_THREAD = 20 14 | work_queue_lock = threading.Lock() 15 | update_database_lock = threading.Lock() 16 | 17 | class Downloader(threading.Thread): 18 | def __init__(self, work_queue, output_dir, database_filepath): 19 | threading.Thread.__init__(self) 20 | self.exit_event = threading.Event() 21 | self.work_queue = work_queue 22 | self.proxies = None 23 | #Define proxy below 24 | #self.proxies = {"http": ""} 25 | self.output_dir = output_dir 26 | self.current_file_size = 0 27 | self.file_size = 0 28 | self.database_filepath = database_filepath 29 | 30 | def exit(self): 31 | print("%s: asked to exit." % self.getName()) 32 | self.exit_event.set() 33 | self.join() 34 | return self.report() 35 | 36 | def report(self): 37 | if self.file_size == 0: 38 | return 0 39 | return float(self.current_file_size) / self.file_size 40 | 41 | def run(self): 42 | while not self.exit_event.isSet(): 43 | work_queue_lock.acquire() 44 | if not self.work_queue.empty(): 45 | self.url = self.work_queue.get() 46 | work_queue_lock.release() 47 | try: 48 | self.download() 49 | self.save() 50 | self.update_database() 51 | except urllib2.HTTPError: 52 | self.update_database(-1) 53 | else: 54 | work_queue_lock.release() 55 | print("%s: received exit event." 
% self.getName())
56 | 
57 |     def download(self):
58 |         print("%s: downloading %s" % (self.getName(), self.url))
59 |         self.current_file_size = 0
60 |         self.file_size = 0
61 |         proxy_handler = urllib2.ProxyHandler()
62 |         if (self.proxies):
63 |             proxy_handler = urllib2.ProxyHandler(self.proxies)
64 |         opener = urllib2.build_opener(proxy_handler)
65 |         opener.addheaders = [
66 |             ('User-Agent', r"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 "
67 |                 "(KHTML, like Gecko) Chrome/23.0.1271.97 Safari/537.11"),
68 |             ('Referer', self.url)
69 |         ]
70 |         urllib2.install_opener(opener)
71 |         opening = urllib2.urlopen(self.url)
72 |         meta = opening.info()
73 |         self.file_size = int(meta.getheaders("Content-Length")[0])
74 |         temp_file_name = "%d.apk" % (time.time() * 1000000)
75 |         temp_dir = self.output_dir + os.sep + "temp"
76 |         self.temp_output_path = temp_dir + os.sep + temp_file_name
77 |         with open(self.temp_output_path, 'wb') as fil:
78 |             block_size = 10240
79 |             while True:
80 |                 buf = opening.read(block_size)
81 |                 self.current_file_size += len(buf)
82 |                 fil.write(buf)
83 |                 if not buf:
84 |                     break
85 | 
86 |     def save(self):
87 |         with open(self.temp_output_path, 'rb') as fil:
88 |             m = hashlib.md5()
89 |             m.update(fil.read())
90 |             md5_digest = m.hexdigest()
91 |         new_output_path = self.output_dir + os.sep + md5_digest + ".apk"
92 |         if os.path.isfile(new_output_path):
93 |             os.remove(new_output_path)
94 |         os.rename(self.temp_output_path, new_output_path)
95 |         print("%s: %s.apk is completed." % (self.getName(), md5_digest))
96 | 
97 |     def update_database(self, result=1):
98 |         update_database_lock.acquire()
99 |         try:
100 |             connection = sqlite3.connect(self.database_filepath)
101 |             cursor = connection.cursor()
102 |             cursor.execute('update apps set downloaded = ? where url = ?',
103 |                 (result, self.url,))
104 |             connection.commit()
105 |         except sqlite3.OperationalError:
106 |             print("%s: Operational Error" % (self.getName()))
107 |         finally:
108 |             connection.close()
109 |             update_database_lock.release()
110 | 
111 | class Monitor(threading.Thread):
112 |     def __init__(self, threads):
113 |         threading.Thread.__init__(self)
114 |         self.threads = threads
115 |         self.exit_event = threading.Event()
116 |     def exit(self):
117 |         self.exit_event.set()
118 |         self.join()
119 |     def run(self):
120 |         while not self.exit_event.isSet():
121 |             for t in self.threads:
122 |                 if t.report() == 0:
123 |                     print(" new"),
124 |                 else:
125 |                     print("%3.0f%%" % (t.report()*100)),
126 |             print("")
127 |             time.sleep(1)
128 | 
129 | def get_undownloaded_url(database_filepath):
130 |     undownloaded_urls = []
131 |     try:
132 |         connection = sqlite3.connect(database_filepath)
133 |         cursor = connection.cursor()
134 |         sql = "select * from apps where downloaded = 0"
135 |         cursor.execute(sql)
136 |         records = cursor.fetchall()
137 |         undownloaded_urls = [r[1] for r in records]
138 |     except sqlite3.OperationalError:
139 |         print("get_undownloaded_url(): Operational Error.")
140 |     finally:
141 |         connection.close()
142 |     return undownloaded_urls
143 | 
144 | def fill_work_queue(work_queue, undownloaded_urls):
145 |     for u in undownloaded_urls:
146 |         work_queue.put(u)
147 | 
148 | def import_work(work_queue, database_filepath):
149 |     undownloaded_urls = get_undownloaded_url(database_filepath)
150 |     fill_work_queue(work_queue, undownloaded_urls)
151 |     return len(undownloaded_urls)
152 | 
153 | class Watcher:
154 |     """this class solves two problems with multithreaded
155 |     programs in Python, (1) a signal might be delivered
156 |     to any thread (which is just a malfeature) and (2) if
157 |     the thread that gets the signal is waiting, the signal
158 |     is ignored (which is a bug).
159 | 
160 |     The watcher is a concurrent process (not thread) that
161 |     waits for a signal and the process that contains the
162 |     threads. See Appendix A of The Little Book of Semaphores.
163 |     http://greenteapress.com/semaphores/
164 | 
165 |     I have only tested this on Linux. I would expect it to
166 |     work on the Macintosh and not work on Windows.
167 | 
168 |     Refer to: http://code.activestate.com/recipes/496735-workaround-for-missed-sigint-in-multithreaded-prog/
169 |     """
170 | 
171 |     def __init__(self):
172 |         """ Creates a child thread, which returns. The parent
173 |             thread waits for a KeyboardInterrupt and then kills
174 |             the child thread.
175 |         """
176 |         self.child = os.fork()
177 |         if self.child == 0:
178 |             return
179 |         else:
180 |             self.watch()
181 | 
182 |     def watch(self):
183 |         try:
184 |             os.wait()
185 |         except KeyboardInterrupt:
186 |             # I put the capital B in KeyBoardInterrupt so I can
187 |             # tell when the Watcher gets the SIGINT
188 |             print("KeyBoardInterrupt")
189 |             self.kill()
190 |             sys.exit()
191 | 
192 |     def kill(self):
193 |         try:
194 |             os.kill(self.child, signal.SIGKILL)
195 |         except OSError: pass
196 | 
197 | def main():
198 |     if len(sys.argv) < 3:
199 |         print("Usage: %s <database file> <output directory>" % (sys.argv[0]))
200 |         sys.exit(1)
201 |     else:
202 |         database_filepath = sys.argv[1]
203 |         output_dir = sys.argv[2]
204 | 
205 |     if not os.path.exists(output_dir):
206 |         os.makedirs(output_dir)
207 |     temp_dir = output_dir + os.sep + "temp"
208 |     if not os.path.exists(temp_dir):
209 |         os.makedirs(temp_dir)
210 |     Watcher()
211 |     threads = []
212 |     work_queue = Queue.Queue()
213 |     for i in range(NUM_THREAD):
214 |         t = Downloader(work_queue, output_dir, database_filepath)
215 |         t.daemon = True
216 |         t.start()
217 |         threads.append(t)
218 |     monitor_thread = Monitor(threads)
219 |     monitor_thread.daemon = True
220 |     monitor_thread.start()
221 | 
222 |     exit_flag = 0
223 |     while exit_flag < 2:
224 |         import_work(work_queue, database_filepath)
225 |         if work_queue.empty():
226 |             exit_flag += 1
227 |         else:
228 |             exit_flag = 0
229 |         while not work_queue.empty():
230 |             time.sleep(10)
231 |     for t in threads:
232 |         t.exit()
233 |     monitor_thread.exit()
234 | 
235 | if __name__ == '__main__':
236 |     main()
237 | 
--------------------------------------------------------------------------------
/repo/apps/README.md:
--------------------------------------------------------------------------------
1 | Applications will be downloaded into this directory by default.
2 | 
--------------------------------------------------------------------------------
/repo/databases/README.md:
--------------------------------------------------------------------------------
1 | SQLite database files will be stored in this directory.
2 | 
--------------------------------------------------------------------------------
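Note: a crawled database can be inspected directly with the sqlite3 command-line tool. A quick
sketch (appchina.com.db is an illustrative filename; the apps table schema is created in
crawler/android_apps_crawler/pipelines.py, and downloader.py marks rows with downloaded = 1 on
success and -1 on an HTTP error):
```
# count the URLs the downloader has not fetched yet
sqlite3 appchina.com.db "select count(*) from apps where downloaded = 0;"
# peek at a few pending download URLs
sqlite3 appchina.com.db "select url from apps where downloaded = 0 limit 10;"
```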