├── .gitignore
├── README.md
├── crawler
│   ├── android_apps_crawler
│   │   ├── __init__.py
│   │   ├── custom_parser.py
│   │   ├── items.py
│   │   ├── middlewares.py
│   │   ├── pipelines.py
│   │   ├── settings.py
│   │   └── spiders
│   │       ├── __init__.py
│   │       └── android_apps_spider.py
│   ├── crawl.sh
│   └── scrapy.cfg
├── downloader
│   └── downloader.py
└── repo
    ├── apps
    │   └── README.md
    └── databases
        └── README.md

/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | .scrapy
3 | repo/databases
4 | repo/apps
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Android Apps Crawler
2 | ====================
3 | 
4 | Overview
5 | --------
6 | Android Apps Crawler is an extensible crawler for downloading Android applications from third-party markets.
7 | It crawls the download URLs of applications and automatically downloads the applications
8 | into a repository.
9 | 
10 | Requirements
11 | ------------
12 | * Python 2.6 or up
13 | * Scrapy 0.22 or up: http://scrapy.org (not fully tested on lower versions)
14 | * Works on Linux, Windows, Mac OS X, BSD
15 | * Currently, the downloader does not work on Windows.
16 | * For Ubuntu users, "Don't use the python-scrapy package provided by Ubuntu,
17 | they are typically too old and slow to catch up with latest Scrapy.
18 | Instead, use the official [Ubuntu Packages](http://doc.scrapy.org/en/latest/topics/ubuntu.html#topics-ubuntu)."
19 | 
20 | Usage
21 | -----
22 | * Set the third-party markets you want to crawl in settings.py
23 | * Set the proxy if you have one
24 | * Start the crawler:
25 | ```
26 | ./crawl.sh <market name> [database output directory]
27 | ```
28 | * Start the downloader:
29 | ```
30 | ./downloader.py <database file> <output directory>
31 | ```
32 | 
33 | Settings
34 | --------
35 | You can set the proxy, user agent, database name, etc. in the ```crawler/android_apps_crawler/settings.py``` file.
36 | 
37 | Supported Third-party Markets (market names used in crawl.sh)
38 | -----------------------------
39 | * AppChina: http://www.appchina.com (appchina.com)
40 | * Hiapk: http://apk.hiapk.com (hiapk.com)
41 | * Anzhi: http://www.anzhi.com (anzhi.com)
42 | * android.d.cn: http://android.d.cn (android.d.cn)
43 | * mumayi: http://www.mumayi.com (mumayi.com)
44 | * gfan: http://apk.gfan.com (gfan.com)
45 | * nduoa: http://www.nduoa.com (nduoa.com)
46 | * 3gyu: http://www.3gyu.com (3gyu.com)
47 | * angeeks: http://apk.angeeks.com (angeeks.com)
48 | * appfun: http://www.appfun.cn (appfun.cn)
49 | * jimi168: http://www.jimi168.com (jimi168.com)
50 | * More markets to be added...
51 | 
52 | More Android Markets
53 | --------------------
54 | See: https://github.com/mssun/android-markets-list
55 | 
56 | TODO
57 | ----
58 | * Windows support for the downloader.
59 | * Crawl apps from shared cloud storage links (e.g., pan.baidu.com, dbank.com).
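Example
-------
A minimal end-to-end session (a sketch assuming the default repository layout; the appchina.com
market is only an illustrative choice):
```
cd crawler
./crawl.sh appchina.com
# crawled download URLs accumulate in ../repo/databases/appchina.com.db
cd ../downloader
./downloader.py ../repo/databases/appchina.com.db ../repo/apps
# finished downloads are saved as <md5>.apk under ../repo/apps
```
Because crawl.sh passes -s JOBDIR=job_<market name> to Scrapy, an interrupted crawl can be resumed
by running the same command again.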
60 | -------------------------------------------------------------------------------- /crawler/android_apps_crawler/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mssun/android-apps-crawler/77a88c31e93fcf50fc24e01f27911622214fc820/crawler/android_apps_crawler/__init__.py -------------------------------------------------------------------------------- /crawler/android_apps_crawler/custom_parser.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from scrapy.selector import Selector 4 | from android_apps_crawler.items import AppItem 5 | 6 | def parse_anzhi(response): 7 | xpath = "//div[@class='detail_down']/a/@onclick" 8 | appItemList = [] 9 | sel = Selector(response) 10 | for script in sel.xpath(xpath).extract(): 11 | id = re.search(r"\d+", script).group() 12 | url = "http://www.anzhi.com/dl_app.php?s=%s&n=5" % (id,) 13 | appItem = AppItem() 14 | appItem['url'] = url 15 | appItemList.append(appItem) 16 | return appItemList 17 | 18 | -------------------------------------------------------------------------------- /crawler/android_apps_crawler/items.py: -------------------------------------------------------------------------------- 1 | from scrapy.item import Item, Field 2 | 3 | class AppItem(Item): 4 | url = Field() 5 | -------------------------------------------------------------------------------- /crawler/android_apps_crawler/middlewares.py: -------------------------------------------------------------------------------- 1 | from scrapy.exceptions import IgnoreRequest 2 | 3 | class DownloaderMiddleware(object): 4 | def process_request(self, request, spider): 5 | if (spider.settings['PROXIES']): 6 | request.meta["proxy"] = spider.settings['PROXIES']['http'] 7 | if request.url[-3:].lower() in ["apk", "png", "jpg", "exe", "doc", 8 | "zip", "rar"]: 9 | print "Ignore request!" 
10 | raise IgnoreRequest 11 | 12 | -------------------------------------------------------------------------------- /crawler/android_apps_crawler/pipelines.py: -------------------------------------------------------------------------------- 1 | from scrapy import signals 2 | from scrapy.xlib.pydispatch import dispatcher 3 | from scrapy import log 4 | 5 | import sqlite3 6 | from os import path 7 | from android_apps_crawler import settings 8 | 9 | 10 | class AppPipeline(object): 11 | def process_item(self, item, spider): 12 | log.msg("Catch an AppItem", level=log.INFO) 13 | return item 14 | 15 | class SQLitePipeline(object): 16 | filename = '' 17 | conn = None 18 | def __init__(self): 19 | self.filename += settings.MARKET_NAME 20 | self.filename += ".db" 21 | self.filename = path.join(settings.DATABASE_DIR, self.filename) 22 | print self.filename 23 | self.conn = None 24 | dispatcher.connect(self.initialize, signals.engine_started) 25 | dispatcher.connect(self.finalize, signals.engine_stopped) 26 | 27 | def process_item(self, item, spider): 28 | try: 29 | self.conn.execute('insert into apps(url) values(?)', 30 | (item['url'],) 31 | ) 32 | self.conn.commit() 33 | log.msg("Inserting into database"); 34 | except sqlite3.IntegrityError: 35 | print "Duplicated" 36 | return item 37 | 38 | def initialize(self): 39 | if path.exists(self.filename): 40 | self.conn = sqlite3.connect(self.filename) 41 | else: 42 | self.create_table() 43 | self.conn.execute("PRAGMA journal_mode=WAL;") 44 | self.conn.commit() 45 | 46 | def finalize(self): 47 | if self.conn is not None: 48 | self.conn.commit() 49 | self.conn.close() 50 | self.conn = None 51 | 52 | def create_table(self): 53 | self.conn = sqlite3.connect(self.filename) 54 | self.conn.execute("create table apps( \ 55 | id integer primary key autoincrement, \ 56 | url varchar(100) not null unique, \ 57 | downloaded int default 0)" 58 | ) 59 | self.conn.commit() 60 | -------------------------------------------------------------------------------- /crawler/android_apps_crawler/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for android_apps_crawler project 2 | # 3 | # For simplicity, this file contains only the most important settings by 4 | # default. All the other settings are documented here: 5 | # 6 | # http://doc.scrapy.org/topics/settings.html 7 | # 8 | 9 | BOT_NAME = 'android_apps_crawler' 10 | 11 | SPIDER_MODULES = ['android_apps_crawler.spiders'] 12 | NEWSPIDER_MODULE = 'android_apps_crawler.spiders' 13 | USER_AGENT = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11(KHTML, like Gecko) Chrome/23.0.1271.97 Safari/537.11" 14 | ITEM_PIPELINES = { 15 | 'android_apps_crawler.pipelines.AppPipeline': 1, 16 | 'android_apps_crawler.pipelines.SQLitePipeline': 2, 17 | } 18 | LOG_LEVEL = 'INFO' 19 | DOWNLOADER_MIDDLEWARES = { 20 | 'android_apps_crawler.middlewares.DownloaderMiddleware': 1, 21 | } 22 | 23 | # Uncomment following statement to use proxy. 
24 | # PROXIES = { 25 | # 'http' : '', 26 | # } 27 | 28 | ALLOWED_DOMAINS = { 29 | "appchina.com" : ["appchina.com",], 30 | "hiapk.com" : ["hiapk.com",], 31 | "anzhi.com" : ["anzhi.com",], 32 | "android.d.cn" : ["android.d.cn",], 33 | "mumayi.com" : ["mumayi.com",], 34 | "gfan.com" : ["apk.gfan.com",], 35 | "nduoa.com" : ["nduoa.com",], 36 | "3gyu.com" : ["3gyu.com",], 37 | "angeeks.com" : ["angeeks.com",], 38 | "appfun.cn" : ["appfun.cn",], 39 | "jimi168.com" : ["jimi168.com",], 40 | "7723.com" : ["7723.com",], 41 | "777ccc.com" : ["777ccc.com",], 42 | "anruan.com" : ["anruan.com",], 43 | } 44 | START_URLS = { 45 | "appchina.com" : ["http://www.appchina.com",], 46 | "hiapk.com" : ["http://apk.hiapk.com",], 47 | "anzhi.com" : ["http://www.anzhi.com",], 48 | "android.d.cn" : ["http://android.d.cn",], 49 | "mumayi.com" : ["http://www.mumayi.com",], 50 | "gfan.com" : ["http://apk.gfan.com",], 51 | "nduoa.com" : ["http://www.nduoa.com",], 52 | "3gyu.com" : ["http://www.3gyu.com",], 53 | "angeeks.com" : ["http://www.angeeks.com",], 54 | "appfun.cn" : ["http://www.appfun.cn",], 55 | "jimi168.com" : ["http://www.jimi168.com/",], 56 | "7723.com" : ["http://www.7723.com",], 57 | "777ccc.com" : ["http://www.777ccc.com",], 58 | "anruan.com" : ["http://www.anruan.com",], 59 | } 60 | SCRAPE_RULES = { 61 | "xpath" : { 62 | "appchina" : "//a[@class='download-pc fl']/@href", 63 | "hiapk" : "//a[@class='link_btn']/@href", 64 | "android.d.cn" : "//a[@class='localDownload']/@href", 65 | "mumayi" : "//a[@class='download fl']/@href", 66 | "gfan" : "//a[@id='computerLoad']/@href", 67 | "nduoa" : "//a[@class='d_pc_normal']/@href", 68 | "3gyu" : "//a[@class='ldownload']/@href", 69 | "angeeks" : "//div[@class='rgmainsrimg'][1]/a/@href", 70 | "appfun" : "//a[@class='downcp']/@href", 71 | "jimi168" : "//a[@class='a_sign2']/@href", 72 | "7723" : "//ul[@class='download_list']/li/h5/a/@href", 73 | "777ccc" : "//a[@class='downtopc']/@href", 74 | "anruan" : "//a[@class='ldownload']/@href", 75 | }, 76 | "custom_parser" : { 77 | "anzhi" : "parse_anzhi", 78 | }, 79 | } 80 | DATABASE_DIR = "../repo/databases/" 81 | MARKET_NAME = "" 82 | -------------------------------------------------------------------------------- /crawler/android_apps_crawler/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # To create the first spider for your project use this command: 4 | # 5 | # scrapy genspider myspider myspider-domain.com 6 | # 7 | # For more info see: 8 | # http://doc.scrapy.org/topics/spiders.html 9 | -------------------------------------------------------------------------------- /crawler/android_apps_crawler/spiders/android_apps_spider.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from scrapy.spider import Spider 4 | from scrapy.selector import Selector 5 | from scrapy.http import Request 6 | from scrapy.http import HtmlResponse 7 | from scrapy import log 8 | 9 | from urlparse import urlparse 10 | from urlparse import urljoin 11 | 12 | from android_apps_crawler.items import AppItem 13 | from android_apps_crawler import settings 14 | from android_apps_crawler import custom_parser 15 | 16 | 17 | class AndroidAppsSpider(Spider): 18 | name = "android_apps_spider" 19 | scrape_rules = settings.SCRAPE_RULES 20 | 21 | def __init__(self, market=None, database_dir="../repo/databases/", *args, **kwargs): 22 | super(AndroidAppsSpider, 
self).__init__(*args, **kwargs)
23 |         self.allowed_domains = settings.ALLOWED_DOMAINS[market]
24 |         self.start_urls = settings.START_URLS[market]
25 |         settings.MARKET_NAME = market
26 |         settings.DATABASE_DIR = database_dir
27 | 
28 |     def parse(self, response):
29 |         response_domain = urlparse(response.url).netloc
30 |         appItemList = []
31 |         cookie = {}
32 |         xpath_rule = self.scrape_rules['xpath']
33 |         for key in xpath_rule.keys():
34 |             if key in response_domain:
35 |                 appItemList.extend(
36 |                     self.parse_xpath(response, xpath_rule[key]))
37 |                 break
38 |         custom_parser_rule = self.scrape_rules['custom_parser']
39 |         for key in custom_parser_rule.keys():
40 |             if key in response_domain:
41 |                 appItemList.extend(
42 |                     getattr(custom_parser, custom_parser_rule[key])(response))
43 |                 break
44 |         #if "appchina" in response_domain:
45 |         #    xpath = "//a[@id='pc-download' and @class='free']/@href"
46 |         #    appItemList.extend(self.parse_xpath(response, xpath))
47 |         #elif "hiapk" in response_domain:
48 |         #    xpath = "//a[@class='linkbtn d1']/@href"
49 |         #    appItemList.extend(self.parse_xpath(response, xpath))
50 |         #elif "android.d.cn" in response_domain:
51 |         #    xpath = "//a[@class='down']/@href"
52 |         #    appItemList.extend(self.parse_xpath(response, xpath))
53 |         #elif "anzhi" in response_domain:
54 |         #    xpath = "//div[@id='btn']/a/@onclick"
55 |         #    appItemList.extend(self.parse_anzhi(response, xpath))
56 |         #else:
57 |         #    pass
58 |         sel = Selector(response)
59 |         for url in sel.xpath('//a/@href').extract():
60 |             url = urljoin(response.url, url)
61 |             yield Request(url, meta=cookie, callback=self.parse)
62 | 
63 |         for item in appItemList:
64 |             yield item
65 | 
66 | 
67 |     #def parse_appchina(self, response):
68 |     #    appItemList = []
69 |     #    hxs = HtmlXPathSelector(response)
70 |     #    for url in hxs.select(
71 |     #            "//a[@id='pc-download' and @class='free']/@href"
72 |     #            ).extract():
73 |     #        url = urljoin(response.url, url)
74 |     #        log.msg("Catch an application: %s" % url, level=log.INFO)
75 |     #        appItem = AppItem()
76 |     #        appItem['url'] = url
77 |     #        appItemList.append(appItem)
78 |     #    return appItemList
79 | 
80 |     def parse_xpath(self, response, xpath):
81 |         appItemList = []
82 |         sel = Selector(response)
83 |         for url in sel.xpath(xpath).extract():
84 |             url = urljoin(response.url, url)
85 |             log.msg("Catch an application: %s" % url, level=log.INFO)
86 |             appItem = AppItem()
87 |             appItem['url'] = url
88 |             appItemList.append(appItem)
89 |         return appItemList
90 | 
91 |     #def parse_anzhi(self, response, xpath):
92 |     #    appItemList = []
93 |     #    hxs = HtmlXPathSelector(response)
94 |     #    for script in hxs.select(xpath).extract():
95 |     #        id = re.search(r"\d+", script).group()
96 |     #        url = "http://www.anzhi.com/dl_app.php?s=%s&n=5" % (id,)
97 |     #        appItem = AppItem()
98 |     #        appItem['url'] = url
99 |     #        appItemList.append(appItem)
100 |     #    return appItemList
101 | 
102 | 
103 | 
--------------------------------------------------------------------------------
/crawler/crawl.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | if [ $# -le 0 ]
3 | then
4 |     echo "Usage: $0 <market name> [database output directory]"
5 |     echo "  market name:"
6 |     echo "    appchina.com"
7 |     echo "    hiapk.com"
8 |     echo "    anzhi.com"
9 |     echo "    android.d.cn"
10 |     echo "    mumayi.com"
11 |     echo "    gfan.com"
12 |     echo "    nduoa.com"
13 |     echo "    3gyu.com"
14 |     echo "    angeeks.com"
15 |     echo "    appfun.cn"
16 |     echo "    jimi168.com"
17 |     echo "  database output directory:"
18 |     echo "    default: ../repo/databases/"
19 |     exit 2
20 | fi
21 | 
22 | if [ $# -eq 1 ]
23 | then
24 | 
scrapy crawl android_apps_spider -s JOBDIR=job_$1 -a market=$1 25 | else 26 | scrapy crawl android_apps_spider -s JOBDIR=job_$1 -a market=$1 -a database_dir=$2 27 | fi 28 | -------------------------------------------------------------------------------- /crawler/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/topics/scrapyd.html 5 | 6 | [settings] 7 | default = android_apps_crawler.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = android_apps_crawler 12 | -------------------------------------------------------------------------------- /downloader/downloader.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | import sys 4 | import threading 5 | import sqlite3 6 | import time 7 | import urllib2 8 | import os 9 | import hashlib 10 | import signal 11 | import Queue 12 | 13 | NUM_THREAD = 20 14 | work_queue_lock = threading.Lock() 15 | update_database_lock = threading.Lock() 16 | 17 | class Downloader(threading.Thread): 18 | def __init__(self, work_queue, output_dir, database_filepath): 19 | threading.Thread.__init__(self) 20 | self.exit_event = threading.Event() 21 | self.work_queue = work_queue 22 | self.proxies = None 23 | #Define proxy below 24 | #self.proxies = {"http": ""} 25 | self.output_dir = output_dir 26 | self.current_file_size = 0 27 | self.file_size = 0 28 | self.database_filepath = database_filepath 29 | 30 | def exit(self): 31 | print("%s: asked to exit." % self.getName()) 32 | self.exit_event.set() 33 | self.join() 34 | return self.report() 35 | 36 | def report(self): 37 | if self.file_size == 0: 38 | return 0 39 | return float(self.current_file_size) / self.file_size 40 | 41 | def run(self): 42 | while not self.exit_event.isSet(): 43 | work_queue_lock.acquire() 44 | if not self.work_queue.empty(): 45 | self.url = self.work_queue.get() 46 | work_queue_lock.release() 47 | try: 48 | self.download() 49 | self.save() 50 | self.update_database() 51 | except urllib2.HTTPError: 52 | self.update_database(-1) 53 | else: 54 | work_queue_lock.release() 55 | print("%s: received exit event." 
% self.getName())
56 | 
57 |     def download(self):
58 |         print("%s: downloading %s" % (self.getName(), self.url))
59 |         self.current_file_size = 0
60 |         self.file_size = 0
61 |         proxy_handler = urllib2.ProxyHandler()
62 |         if (self.proxies):
63 |             proxy_handler = urllib2.ProxyHandler(self.proxies)
64 |         opener = urllib2.build_opener(proxy_handler)
65 |         opener.addheaders = [
66 |             ('User-Agent', r"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 "
67 |                 "(KHTML, like Gecko) Chrome/23.0.1271.97 Safari/537.11"),
68 |             ('Referer', self.url)
69 |         ]
70 |         urllib2.install_opener(opener)
71 |         opening = urllib2.urlopen(self.url)
72 |         meta = opening.info()
73 |         self.file_size = int(meta.getheaders("Content-Length")[0])
74 |         temp_file_name = "%d.apk" % (time.time() * 1000000)
75 |         temp_dir = self.output_dir + os.sep + "temp"
76 |         self.temp_output_path = temp_dir + os.sep + temp_file_name
77 |         with open(self.temp_output_path, 'wb') as fil:
78 |             block_size = 10240
79 |             while True:
80 |                 buf = opening.read(block_size)
81 |                 self.current_file_size += len(buf)
82 |                 fil.write(buf)
83 |                 if not buf:
84 |                     break
85 | 
86 |     def save(self):
87 |         with open(self.temp_output_path, 'rb') as fil:
88 |             m = hashlib.md5()
89 |             m.update(fil.read())
90 |             md5_digest = m.hexdigest()
91 |         new_output_path = self.output_dir + os.sep + md5_digest + ".apk"
92 |         if os.path.isfile(new_output_path):
93 |             os.remove(new_output_path)
94 |         os.rename(self.temp_output_path, new_output_path)
95 |         print("%s: %s.apk is completed." % (self.getName(), md5_digest))
96 | 
97 |     def update_database(self, result=1):
98 |         update_database_lock.acquire()
99 |         try:
100 |             connection = sqlite3.connect(self.database_filepath)
101 |             cursor = connection.cursor()
102 |             cursor.execute('update apps set downloaded = ? where url = ?',
103 |                 (result, self.url,))
104 |             connection.commit()
105 |         except sqlite3.OperationalError:
106 |             print("%s: Operational Error" % (self.getName()))
107 |         finally:
108 |             connection.close()
109 |             update_database_lock.release()
110 | 
111 | class Monitor(threading.Thread):
112 |     def __init__(self, threads):
113 |         threading.Thread.__init__(self)
114 |         self.threads = threads
115 |         self.exit_event = threading.Event()
116 |     def exit(self):
117 |         self.exit_event.set()
118 |         self.join()
119 |     def run(self):
120 |         while not self.exit_event.isSet():
121 |             for t in self.threads:
122 |                 if t.report() == 0:
123 |                     print(" new"),
124 |                 else:
125 |                     print("%3.0f%%" % (t.report()*100)),
126 |             print("")
127 |             time.sleep(1)
128 | 
129 | def get_undownloaded_url(database_filepath):
130 |     undownloaded_urls = []
131 |     try:
132 |         connection = sqlite3.connect(database_filepath)
133 |         cursor = connection.cursor()
134 |         sql = "select * from apps where downloaded = 0"
135 |         cursor.execute(sql)
136 |         records = cursor.fetchall()
137 |         undownloaded_urls = [r[1] for r in records]
138 |     except sqlite3.OperationalError:
139 |         print("get_undownloaded_url(): Operational Error.")
140 |     finally:
141 |         connection.close()
142 |     return undownloaded_urls
143 | 
144 | def fill_work_queue(work_queue, undownloaded_urls):
145 |     for u in undownloaded_urls:
146 |         work_queue.put(u)
147 | 
148 | def import_work(work_queue, database_filepath):
149 |     undownloaded_urls = get_undownloaded_url(database_filepath)
150 |     fill_work_queue(work_queue, undownloaded_urls)
151 |     return len(undownloaded_urls)
152 | 
153 | class Watcher:
154 |     """this class solves two problems with multithreaded
155 |     programs in Python, (1) a signal might be delivered
156 |     to any thread (which is just a malfeature) and (2) if
157 |     the thread that gets the signal is waiting, the signal
158 |     is ignored (which is a bug).
159 | 
160 |     The watcher is a concurrent process (not thread) that
161 |     waits for a signal and the process that contains the
162 |     threads. See Appendix A of The Little Book of Semaphores.
163 |     http://greenteapress.com/semaphores/
164 | 
165 |     I have only tested this on Linux. I would expect it to
166 |     work on the Macintosh and not work on Windows.
167 | 
168 |     Refer to: http://code.activestate.com/recipes/496735-workaround-for-missed-sigint-in-multithreaded-prog/
169 |     """
170 | 
171 |     def __init__(self):
172 |         """ Creates a child thread, which returns. The parent
173 |             thread waits for a KeyboardInterrupt and then kills
174 |             the child thread.
175 |         """
176 |         self.child = os.fork()
177 |         if self.child == 0:
178 |             return
179 |         else:
180 |             self.watch()
181 | 
182 |     def watch(self):
183 |         try:
184 |             os.wait()
185 |         except KeyboardInterrupt:
186 |             # I put the capital B in KeyBoardInterrupt so I can
187 |             # tell when the Watcher gets the SIGINT
188 |             print("KeyBoardInterrupt")
189 |             self.kill()
190 |             sys.exit()
191 | 
192 |     def kill(self):
193 |         try:
194 |             os.kill(self.child, signal.SIGKILL)
195 |         except OSError: pass
196 | 
197 | def main():
198 |     if len(sys.argv) < 3:
199 |         print("Usage: %s <database file> <output directory>" % (sys.argv[0]))
200 |         sys.exit(1)
201 |     else:
202 |         database_filepath = sys.argv[1]
203 |         output_dir = sys.argv[2]
204 | 
205 |     if not os.path.exists(output_dir):
206 |         os.makedirs(output_dir)
207 |     temp_dir = output_dir + os.sep + "temp"
208 |     if not os.path.exists(temp_dir):
209 |         os.makedirs(temp_dir)
210 |     Watcher()
211 |     threads = []
212 |     work_queue = Queue.Queue()
213 |     for i in range(NUM_THREAD):
214 |         t = Downloader(work_queue, output_dir, database_filepath)
215 |         t.daemon = True
216 |         t.start()
217 |         threads.append(t)
218 |     monitor_thread = Monitor(threads)
219 |     monitor_thread.daemon = True
220 |     monitor_thread.start()
221 | 
222 |     exit_flag = 0
223 |     while exit_flag < 2:
224 |         import_work(work_queue, database_filepath)
225 |         if work_queue.empty():
226 |             exit_flag += 1
227 |         else:
228 |             exit_flag = 0
229 |         while not work_queue.empty():
230 |             time.sleep(10)
231 |     for t in threads:
232 |         t.exit()
233 |     monitor_thread.exit()
234 | 
235 | if __name__ == '__main__':
236 |     main()
237 | 
--------------------------------------------------------------------------------
/repo/apps/README.md:
--------------------------------------------------------------------------------
1 | Applications will be downloaded into this directory by default.
2 | 
--------------------------------------------------------------------------------
/repo/databases/README.md:
--------------------------------------------------------------------------------
1 | SQLite database files will be stored in this directory.
2 | 
--------------------------------------------------------------------------------
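Note: a crawled database can be inspected directly with the sqlite3 command-line tool. A quick
sketch (appchina.com.db is an illustrative filename; the apps table schema is created in
crawler/android_apps_crawler/pipelines.py, and downloader.py marks rows with downloaded = 1 on
success and -1 on an HTTP error):
```
# count the URLs the downloader has not fetched yet
sqlite3 appchina.com.db "select count(*) from apps where downloaded = 0;"
# peek at a few pending download URLs
sqlite3 appchina.com.db "select url from apps where downloaded = 0 limit 10;"
```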