├── bitcointalk_ANN
│   ├── __init__.py
│   ├── spiders
│   │   ├── old_files
│   │   │   ├── merge.py
│   │   │   ├── get_urls.py
│   │   │   ├── ANN_info.py
│   │   │   ├── urls_spider.py
│   │   │   ├── spider_bitcointalk.py
│   │   │   ├── tests_spider.py
│   │   │   ├── spider_base_url.py
│   │   │   ├── ANN_runfile.py
│   │   │   ├── runfile.py
│   │   │   ├── posts_spider.py
│   │   │   ├── bitcointalk_spider.py
│   │   │   ├── bitcointalk_spider_test.py
│   │   │   ├── add_css.py
│   │   │   └── urls.py
│   │   ├── urls.pickle
│   │   ├── __init__.py
│   │   └── bitcointalk_spider.py
│   ├── proxy_list.txt
│   ├── items.py
│   ├── pipelines.py
│   ├── middlewares.py
│   ├── helper.py
│   ├── settings.py
│   ├── .idea
│   │   └── workspace.xml
│   ├── style.html
│   └── pages
│       └── 7.html
├── .idea
│   ├── vcs.xml
│   ├── dictionaries
│   │   └── Shasa.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── bitcointalk_ANN.iml
│   └── workspace.xml
├── scrapy.cfg
├── runfile.py
└── README.md
/bitcointalk_ANN/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/bitcointalk_ANN/spiders/old_files/merge.py:
--------------------------------------------------------------------------------
1 |
2 |
3 |
--------------------------------------------------------------------------------
/bitcointalk_ANN/spiders/old_files/get_urls.py:
--------------------------------------------------------------------------------
1 | BASE_URL = r'https://bitcointalk.org/index.php?topic=421615.0'
--------------------------------------------------------------------------------
/bitcointalk_ANN/spiders/urls.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shasafoster/bitcointalk-ANN/HEAD/bitcointalk_ANN/spiders/urls.pickle
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/bitcointalk_ANN/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/.idea/dictionaries/Shasa.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | pinkcoin
5 | shasa
6 |
7 |
8 |
--------------------------------------------------------------------------------
/bitcointalk_ANN/proxy_list.txt:
--------------------------------------------------------------------------------
1 | http://5.196.189.50:8080
2 | http://54.36.182.96:3128
3 | http://89.236.17.106:3128
4 | http://163.172.217.103:31288
5 | http://203.74.4.7:80
6 | http://203.74.4.6:80
7 |
8 |
9 |
10 |
11 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/bitcointalk_ANN/spiders/old_files/ANN_info.py:
--------------------------------------------------------------------------------
1 | # This file stores information that may often change
2 |
3 | BASE_URL = r'https://bitcointalk.org/index.php?topic=421615.0'
4 | CRYPTO_NAME = r'Dash'
5 | BASE_DIR = r'C:/Users/Shasa/Documents/Projects/bitcointalk_ANN/'
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = bitcointalk_ANN.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = bitcointalk_ANN
12 |
--------------------------------------------------------------------------------
/bitcointalk_ANN/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class BitcointalkAnnItem(scrapy.Item):
12 | # define the fields for your item here like:
13 | # name = scrapy.Field()
14 | pass
15 |
16 |
17 | class PostsItem(scrapy.Item):
18 | page_number = scrapy.Field()
19 | posts = scrapy.Field()
20 |
--------------------------------------------------------------------------------
/.idea/bitcointalk_ANN.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
--------------------------------------------------------------------------------
/runfile.py:
--------------------------------------------------------------------------------
1 | from scrapy.crawler import CrawlerProcess
2 | from scrapy.utils.project import get_project_settings
3 | from bitcointalk_ANN.helper import *
4 |
5 |
6 | def script():
7 | num_of_thread_pages, crypto_currency = get_urls()
8 |
9 | process = CrawlerProcess(get_project_settings())
10 | process.crawl('bitcointalk')
11 | process.start() # the script will block here until the crawling is finished
12 |
13 | num_of_scraped_pages = merge(crypto_currency)
14 |
15 | print_log(crypto_currency, num_of_thread_pages, num_of_scraped_pages)
16 |
17 |
18 | script()
19 |
20 |
--------------------------------------------------------------------------------
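runfile.py drives the whole pipeline through three functions imported from bitcointalk_ANN/helper.py (get_urls, merge, print_log), but helper.py's source is not reproduced in this dump. The sketch below is only a guess at what those functions might look like, pieced together from how runfile.py calls them, from pipelines.py, and from the scripts in spiders/old_files; the paths, parsing details, and Python 3 idioms are all assumptions rather than the real implementation.

```python
# Hypothetical sketch of bitcointalk_ANN/helper.py (the real file is not shown here).
import os
import pickle
import urllib.request
from bs4 import BeautifulSoup

PAGES_DIR = './bitcointalk_ANN/pages'                     # where PostPipeline saves pages (assumed)
URLS_PICKLE = './bitcointalk_ANN/spiders/urls.pickle'     # read by BitcointalkSpider at import time


def get_urls():
    """Prompt for a currency, locate its [ANN] thread and pickle the page URLs."""
    crypto_currency = input("Enter the name of the crypto economic protocol: ").lower()

    # The 'Announcement' link on coinmarketcap.com points at the bitcointalk thread
    page = urllib.request.urlopen('https://coinmarketcap.com/currencies/' + crypto_currency)
    base_url = BeautifulSoup(page, 'lxml').find('a', href=True, text='Announcement')['href']

    # Each thread page shows 20 posts; page i starts at offset 20*(i-1)
    thread = BeautifulSoup(urllib.request.urlopen(base_url), 'lxml')
    page_links = thread.select('#bodyarea table a')
    num_pages = max(int(a.get_text(strip=True)) for a in page_links
                    if a.get_text(strip=True).isdigit())
    urls = [base_url] + [base_url[:-1] + str(20 * (i - 1)) for i in range(2, num_pages + 1)]

    with open(URLS_PICKLE, 'wb') as handle:
        pickle.dump(urls, handle)
    return num_pages, crypto_currency


def merge(crypto_currency):
    """Concatenate the per-page html files written by PostPipeline into one document."""
    pages = sorted(os.listdir(PAGES_DIR), key=lambda name: int(name.split('.')[0]))
    with open(crypto_currency + '.html', 'w', encoding='utf-8') as out:
        for name in pages:
            with open(os.path.join(PAGES_DIR, name), encoding='utf-8') as f:
                out.write(f.read())
    return len(pages)


def print_log(crypto_currency, num_of_thread_pages, num_of_scraped_pages):
    print('%s: scraped %d of %d thread pages'
          % (crypto_currency, num_of_scraped_pages, num_of_thread_pages))
```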
/bitcointalk_ANN/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 | from bs4 import BeautifulSoup
9 |
10 |
11 | class PostPipeline(object):
12 |
13 | def process_item(self, item, spider):
14 |
15 | filename = r'C:/Users/Shasa/PycharmProjects/bitcointalk/bitcointalk_ANN/bitcointalk_ANN/pages' \
16 | + '/' + str(item['page_number']) + '.html'
17 | with open(filename, 'wb') as f:
18 | soup = BeautifulSoup(item['posts'], 'lxml')  # specify a parser explicitly
19 | f.write(soup.prettify(encoding='utf-8'))
20 | print('Saving page ' + str(item['page_number']))
21 | f.close()
22 |
23 |
24 |
--------------------------------------------------------------------------------
/bitcointalk_ANN/spiders/old_files/urls_spider.py:
--------------------------------------------------------------------------------
1 | from ANN_info import BASE_URL
2 | import scrapy
3 | import pickle
4 |
5 |
6 | class UrlsSpider(scrapy.Spider):
7 | name = "urls"
8 |
9 | def start_requests(self):
10 |
11 | yield scrapy.Request(url=BASE_URL, callback=self.parse)
12 |
13 | def parse(self, response):
14 |
15 | # Extract the number of pages in the bitcointalk thread
16 | table = response.xpath('//div[@id="bodyarea"]/table')[0]
17 | num_pages = max([int(x) for x in table.xpath('./tr/td/a/text()').extract()])
18 |
19 |
20 | # Create list of pages in thread for spider to parse and then pickle the list
21 | urls = [BASE_URL] + [BASE_URL[:-1] + str(int(20 * (i - 1))) for i in range(2, num_pages + 1)]
22 | urls = urls[:2]
23 | output = open('urls.pkl', 'wb')
24 | pickle.dump(urls, output)
25 | output.close()
--------------------------------------------------------------------------------
/bitcointalk_ANN/spiders/old_files/spider_bitcointalk.py:
--------------------------------------------------------------------------------
1 | import scrapy
2 | import os
3 | import pickle
4 | from ANN_info import BASE_DIR, CRYPTO_NAME
5 | from bs4 import BeautifulSoup
6 | class BitcointalkSpider(scrapy.Spider):
7 | name = "bitcointalk"
8 |
9 | def start_requests(self):
10 | # Delete text file if exists
11 | try:
12 | path = os.path.join(BASE_DIR, (CRYPTO_NAME + r'.html'))
13 | os.remove(path)
14 | except OSError:
15 | pass
16 |
17 | pkl_file = open('urls.pkl','rb')
18 | urls = pickle.load(pkl_file)
19 | urls = [urls[0]]
20 | pkl_file.close()
21 |
22 | # Parse urls
23 | for url in urls:
24 | yield scrapy.Request(url=url, callback=self.parse)
25 |
26 | def parse(self, response):
27 |
28 | # The posts from the webpage
29 | table = response.xpath('//div[@id="bodyarea"]/form[@id="quickModForm"]/table')[0]
30 | posts = table.xpath('./tr')
31 |
32 |
33 | filename = CRYPTO_NAME + '.html'
34 | with open(filename, 'a') as f:
35 | f.write(BeautifulSoup(table.extract(), 'lxml').encode('utf8'))
36 | #for post in posts:
37 | # f.write(BeautifulSoup(post.extract(),'lxml').encode('utf8'))
38 | f.close()
39 | self.log('Saved file %s' % filename)
--------------------------------------------------------------------------------
/bitcointalk_ANN/spiders/old_files/tests_spider.py:
--------------------------------------------------------------------------------
1 | from ANN_info import BASE_URL
2 | from ANN_info import CRYPTO_NAME
3 | from ANN_info import BASE_DIR
4 | import scrapy
5 | from bs4 import BeautifulSoup
6 | import os
7 | import pickle
8 |
9 |
10 | class TestsSpider(scrapy.Spider):
11 | name = "tests"
12 |
13 | def start_requests(self):
14 | # Delete text file if exists
15 | try:
16 | path = os.path.join(BASE_DIR, (CRYPTO_NAME + r'.html'))
17 | os.remove(path)
18 | except OSError:
19 | pass
20 |
21 | pkl_file = open('urls.pkl','rb')
22 | urls = pickle.load(pkl_file)
23 | urls = [urls[0]]
24 | pkl_file.close()
25 |
26 | # Parse urls
27 | for url in urls:
28 | yield scrapy.Request(url=url, callback=self.parse)
29 |
30 | def parse(self, response):
31 |
32 | # The posts from the webpage
33 | table = response.xpath('//div[@id="bodyarea"]/form[@id="quickModForm"]/table')[0]
34 | posts = table.xpath('./tr')
35 |
36 |
37 | filename = CRYPTO_NAME + '.html'
38 | with open(filename, 'a') as f:
39 | f.write(BeautifulSoup(table.extract(), 'lxml').encode('utf8'))
40 | #for post in posts:
41 | # f.write(BeautifulSoup(post.extract(),'lxml').encode('utf8'))
42 | f.close()
43 | self.log('Saved file %s' % filename)
--------------------------------------------------------------------------------
/bitcointalk_ANN/spiders/old_files/spider_base_url.py:
--------------------------------------------------------------------------------
1 | import pickle
2 | from bs4 import BeautifulSoup
3 | import urllib2
4 | from lxml import etree
5 | import scrapy
6 | from spider_bitcointalk import BitcointalkSpider
7 | from scrapy.crawler import CrawlerProcess
8 |
9 | # Prompt the user for input (via command prompt)
10 | name = raw_input("Enter the name of the crypto economic protocol: ")
11 |
12 | # Get base url from coinmarketcap.com
13 | url = r'https://coinmarketcap.com/currencies/' + name
14 | response = urllib2.urlopen(url)
15 | soup = BeautifulSoup(response, 'lxml')
16 | base_url = soup.find('a', href=True, text='Announcement')['href']
17 |
18 | # Extract the number of pages in the bitcointalk thread
19 | response = urllib2.urlopen(base_url)
20 | html_parser = etree.HTMLParser()
21 | tree = etree.parse(response, html_parser)
22 | table = tree.xpath('//div[@id="bodyarea"]/table')[0]
23 | num_pages = max([int(x) for x in table.xpath('./tr/td/a/text()')])
24 |
25 | # Create list of page urls in thread for spider to parse and then pickle the list
26 | urls = [base_url] + [base_url[:-1] + str(int(20 * (i - 1))) for i in range(2, num_pages + 1)]
27 | urls = urls[:2]
28 | output = open('urls.pkl', 'wb')
29 | pickle.dump(urls, output)
30 | output.close()
31 |
32 | process = CrawlerProcess({
33 | 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
34 | })
35 |
36 | # Run spider
37 | process.crawl(BitcointalkSpider)
38 | process.start()
--------------------------------------------------------------------------------
/bitcointalk_ANN/spiders/old_files/ANN_runfile.py:
--------------------------------------------------------------------------------
1 | import pickle
2 | from bs4 import BeautifulSoup
3 | import urllib2
4 | from lxml import etree
5 | import scrapy
6 | from spider_bitcointalk import BitcointalkSpider
7 | from scrapy.crawler import CrawlerProcess
8 |
9 | # Prompt the user for input (via command prompt)
10 | name = raw_input("Enter the name of the crypto economic protocol: ")
11 |
12 | # Get base url from coinmarketcap.com
13 | url = r'https://coinmarketcap.com/currencies/' + name
14 | response = urllib2.urlopen(url)
15 | soup = BeautifulSoup(response, 'lxml')
16 | base_url = soup.find('a', href=True, text='Announcement')['href']
17 |
18 | # Extract the number of pages in the bitcointalk thread
19 | response = urllib2.urlopen(base_url)
20 | html_parser = etree.HTMLParser()
21 | tree = etree.parse(response, html_parser)
22 | table = tree.xpath('//div[@id="bodyarea"]/table')[0]
23 | num_pages = max([int(x) for x in table.xpath('./tr/td/a/text()')])
24 |
25 | # Create list of page urls in thread for spider to parse and then pickle the list
26 | name_urls = [base_url] + [base_url[:-1] + str(int(20 * (i - 1))) for i in range(2, num_pages + 1)]
27 | path = r'C:/Users/Shasa/Documents/Projects/bitcointalk_ANN/bitcointalk_ANN/spiders/name_urls.pkl'
28 | output = open(path, 'wb')
29 | pickle.dump(name_urls, output)
30 | output.close()
31 |
32 | process = CrawlerProcess({
33 | 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
34 | })
35 |
36 | # Run spider
37 | process.crawl(BitcointalkSpider)
38 | process.start()
--------------------------------------------------------------------------------
/bitcointalk_ANN/spiders/bitcointalk_spider.py:
--------------------------------------------------------------------------------
1 | import scrapy
2 | import pickle
3 | import re
4 | from collections import Counter
5 | from bitcointalk_ANN.items import PostsItem
6 |
7 |
8 | with open('./bitcointalk_ANN/spiders/urls.pickle', 'rb') as handle:
9 | urls = pickle.load(handle)
10 |
11 |
12 | class BitcointalkSpider(scrapy.Spider):
13 | name = "bitcointalk"
14 |
15 | def start_requests(self):
16 | # Parse urls
17 | for i, url in enumerate(urls):
18 | yield scrapy.Request(url=url, callback=self.parse, meta={'page_number': i}, dont_filter=True)
19 |
20 | def parse(self, response):
21 |
22 | # We only want user posts (no ads, deleted posts etc)
23 | table = response.xpath('//div[@id="bodyarea"]/form[@id="quickModForm"]/table')[0]
24 | rows = list(table.xpath('./tr'))
25 | joined = ''.join([str(row) for row in rows])
26 | results = re.findall(r'<tr class="[\w]+">', joined)  # collect each row's class attribute
27 | most_common_result = Counter(results).most_common()[0][0]
28 | most_common_class = re.findall(r'"[\w]+"', most_common_result)[0].replace('"', '')
29 | x_path = './tr[@class="' + most_common_class + '"]'
30 | post_list = table.xpath(x_path)
31 | posts = ''.join([post.extract() for post in post_list])
32 |
33 | # Create PostsItem item and assign variables
34 | posts_item = PostsItem()
35 | posts_item['page_number'] = response.request.meta['page_number']
36 | posts_item['posts'] = posts
37 | yield posts_item
38 |
39 |
--------------------------------------------------------------------------------
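BitcointalkSpider.parse above keeps only the rows whose class attribute is the most frequent one in the table, on the assumption that genuine user posts all share a single row class while ads, separators and deleted posts do not. The following tiny standalone illustration of that heuristic is not part of the repository; the HTML string and class names are made up.

```python
import re
from collections import Counter

# Three rows: two genuine posts sharing one class, one ad row with another
joined = ('<tr class="windowbg">post 1</tr>'
          '<tr class="ad">advert</tr>'
          '<tr class="windowbg">post 2</tr>')

results = re.findall(r'<tr class="[\w]+">', joined)         # opening tags with their class
most_common = Counter(results).most_common()[0][0]          # '<tr class="windowbg">'
post_class = re.findall(r'"[\w]+"', most_common)[0].strip('"')
print(post_class)  # windowbg -> used to build the XPath './tr[@class="windowbg"]'
```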
/bitcointalk_ANN/spiders/old_files/runfile.py:
--------------------------------------------------------------------------------
1 | import pickle
2 | from lxml import etree
3 | from fetch_css import *
4 | import urllib2
5 | from bs4 import BeautifulSoup
6 | import subprocess
7 |
8 | # Prompt the user for input (via command prompt)
9 | crypto_currency = raw_input("Enter the name of the crypto economic protocol: ")
10 |
11 | # Get base url for the bitcointalk [ANN] from coinmarketcap.com
12 | url = r'https://coinmarketcap.com/currencies/' + crypto_currency
13 | response = urllib2.urlopen(url)
14 | soup = BeautifulSoup(response, 'lxml')
15 | base_url = soup.find('a', href=True, text='Announcement')['href']
16 |
17 | # Extract the number of pages in the bitcointalk thread
18 | response = urllib2.urlopen(base_url)
19 | html_parser = etree.HTMLParser()
20 | tree = etree.parse(response, html_parser)
21 | table = tree.xpath('//div[@id="bodyarea"]/table')[0]
22 | num_pages = max([int(x) for x in table.xpath('./tr/td/a/text()')])
23 |
24 | # Create list of page urls in thread for spider to parse and then pickle the list
25 | name_urls = [crypto_currency] + [base_url] + [base_url[:-1] + str(int(20 * (i - 1))) for i in range(2, num_pages + 1)]
26 | path = r'C:\Users\Shasa\Documents\Projects\bitcointalk_ANN\bitcointalk_ANN\name_urls.pkl'
27 | output = open(path, 'wb')
28 | pickle.dump(name_urls, output)
29 | output.close()
30 |
31 | # Extract the CSS of the bitcointalk webpage and write to file
32 | print('---------------------------')
33 | print('Extracting CSS...')
34 | print('---------------------------')
35 | write_css(crypto_currency, base_url)
36 |
37 | # python 3.5+
38 | # subprocess.run(['scrapy crawl bitcointalk'])
39 |
--------------------------------------------------------------------------------
/bitcointalk_ANN/spiders/old_files/posts_spider.py:
--------------------------------------------------------------------------------
1 | import scrapy
2 | import os
3 |
4 |
5 | class PostsSpider(scrapy.Spider):
6 | name = "posts"
7 |
8 | def start_requests(self):
9 | try:
10 | path = r"C:\Users\Shasa\Documents\Projects\bitcointalk_ANN\posts.txt"
11 | os.remove(path)
12 | except OSError:
13 | pass
14 |
15 | urls = [
16 | 'https://bitcointalk.org/index.php?topic=421615.0'
17 | ]
18 | for url in urls:
19 | yield scrapy.Request(url=url, callback=self.parse)
20 |
21 | def parse(self, response):
22 |
23 | filename = 'post.txt'
24 | with open(filename, 'a') as f:
25 |
26 | # get title of ANN thread and write
27 | title = response.xpath('//title/text()').extract_first()
28 | f.write(title.encode('utf8'))
29 | f.write("\n")
30 |
31 | # extract table of posts
32 | table = response.xpath('//div[@id="bodyarea"]/form[@id="quickModForm"]/table')[0]
33 |
34 | # Get the info on the posters
35 | poster_info = table.css('.poster_info')
36 |
37 | # Get the info on the post
38 | post_info = table.css('.td_headerandpost .post')
39 |
40 | for s in poster_info:
41 | username = s.css('a::text').extract()[0]
42 | user_info = s.css('.smalltext').xpath('./text()').re('[ \w . \w ]+')[:3]
43 |
44 |
45 | # write username, rank level, user activity
46 | if any(c.isalpha() for c in username):
47 | f.write(username.encode('utf8'))
48 | f.write(',')
49 |
50 | for ss in user_info:
51 | f.write(ss.encode('utf8'))
52 | f.write(',')
53 | f.write("\n")
54 |
55 | self.log('Saved file %s' % filename)
56 |
--------------------------------------------------------------------------------
/bitcointalk_ANN/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class BitcointalkAnnSpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 | def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 | def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 | def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 | def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # bitcointalk-ANN
2 | Aim: to scrape a 1000+ page bitcointalk [ANN] thread into a single, highly readable html document for better reading and analysis.
3 | An example of such a thread: https://bitcointalk.org/index.php?topic=421615.20
4 |
5 | ## Introduction
6 | Reading the bitcointalk [ANN] thread for a crypto-currency is a useful tool for (investment) analysis of that crypto-currency.
7 | The issues faced by a reader of a bitcointalk [ANN] thread are:
8 | 1. There are often 1000+ pages in the [ANN] thread, so you have to click the 'next' button 1000+ times
9 | 2. There are ads, user footers/mottos, icons, etc. that affect readability
10 | 3. The styling is unappealing
11 |
12 | ## Timeline
13 | The three issues outlined above define the timeline of the project.
14 | The first challenge has been addressed and completed.
15 |
16 | ### To Do.
17 | 1. Remove the ads, annoying icons, user footers and mottos from the document
18 | 2. Make the styling attractive and highly readable (think medium.com)
19 |
20 |
21 | ## Install / Use
22 |
23 | #### Install packages
24 | * Scrapy (https://scrapy.org/): `pip install scrapy`
25 | * BeautifulSoup (https://www.crummy.com/software/BeautifulSoup/bs4/doc/): `pip install beautifulsoup4`
26 | * lxml (http://lxml.de/installation.html): `pip install lxml`
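Or install all three in one go (the same packages as above, combined into a single command):
```
$ pip install scrapy beautifulsoup4 lxml
```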
27 |
28 | #### Step 1.
29 | Create a new directory (folder) on your computer
30 |
31 | #### Step 2.
32 | Clone the repository into this new directory on your computer
33 |
34 | #### Step 3.
35 | Open the command prompt in this new directory
36 |
37 | #### Step 4.
38 | Enter:
39 | ```
40 | $ python runfile.py
41 | ```
42 | * *The command prompt will ask you to enter the name of the crypto-currency you want to create the [ANN] document for.*
43 | * *This command should take 1-3 seconds to run*
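For example, a run might look like the following (the prompt text is taken from the scripts in `spiders/old_files`; the current helper.py may word it differently, and "pinkcoin" is just an illustrative currency name):
```
$ python runfile.py
Enter the name of the crypto economic protocol: pinkcoin
```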
44 |
45 | #### Step 5.
46 | Enter:
47 | ```
48 | $scrapy crawl bitcointalk
49 | ```
50 | * *This command will run the spider*
51 | * *This command will take much longer to run (it depends highly on the number of webpages the spider has to parse)*
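Among Scrapy's usual log output you should see one line per saved page, printed by `PostPipeline` in pipelines.py (the page numbers here are illustrative and may not arrive in order):
```
Saving page 0
Saving page 1
Saving page 2
```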
52 |
53 | #### Step 6.
54 | After the spider has finished running, provided there were no errors, an .html document will have been created in the top-level directory you created in Step 1.
55 |
--------------------------------------------------------------------------------
/bitcointalk_ANN/spiders/old_files/bitcointalk_spider.py:
--------------------------------------------------------------------------------
1 | from bs4 import BeautifulSoup
2 | import os
3 | import urllib2
4 | from lxml import etree
5 | import scrapy
6 | import add_css
7 |
8 | # Prompt the user for input (via command prompt)
9 | crypto_currency = raw_input("Enter the name of the crypto economic protocol: ").lower()
10 | print(r'Parsing ' + r'"https://coinmarketcap.com/currencies/' + crypto_currency + r'"...')
11 |
12 | # Get base url from coinmarketcap.com
13 | url = r'https://coinmarketcap.com/currencies/' + crypto_currency
14 | response = urllib2.urlopen(url)
15 | soup = BeautifulSoup(response, 'lxml')
16 | base_url = soup.find('a', href=True, text='Announcement')['href']
17 |
18 | # Extract the number of pages in the bitcointalk.com thread
19 | forum_response = urllib2.urlopen(base_url)
20 | html_parser = etree.HTMLParser()
21 | tree = etree.parse(forum_response, html_parser)
22 | table = tree.xpath('//div[@id="bodyarea"]/table')[0]
23 |
24 | num_pages = []
25 | for x in table.xpath('./tr/td/a/text()'):
26 | try:
27 | num_pages.append(int(x))
28 | except ValueError:
29 | pass
30 | num_pages = max(num_pages)
31 | urls = [base_url] + [base_url[:-1] + str(int(20 * (i - 1))) for i in range(2, num_pages + 1)]
32 | urls = urls[:2]
33 |
34 | # Extract the CSS pages
35 | add_css.write_css(crypto_currency, base_url)
36 |
37 | class BitcointalkSpider(scrapy.Spider):
38 | name = "bitcointalk"
39 |
40 | def start_requests(self):
41 |
42 | # Delete html file for the crypto-currency if exists
43 | try:
44 | base = r'C:\Users\Shasa\PycharmProjects\bitcointalk\bitcointalk_ANN'
45 | path = os.path.join(base, (crypto_currency + r'.html'))
46 | os.remove(path)
47 | except OSError:
48 | pass
49 |
50 | # Parse urls
51 | for url in urls:
52 | yield scrapy.Request(url=url, callback=self.parse)
53 |
54 | def parse(self, response):
55 |
56 | # The posts from the webpage
57 | table = response.xpath('//div[@id="bodyarea"]/form[@id="quickModForm"]/table')[0]
58 | # posts = table.xpath('./tr')
59 |
60 | filename = crypto_currency + '.html'
61 | with open(filename, 'a') as f:
62 | #f.write(BeautifulSoup(table.extract(), 'lxml').encode('utf8'))
63 |
64 | posts = table.xpath('./tr')
65 | for post in posts:
66 | f.write(BeautifulSoup(post.extract(), 'lxml').encode('utf8'))
67 |
68 | # x = list(table.xpath('./tr'))
69 | # re.findall(r'<tr class="[\w]+">', x)
70 |
71 | f.close()
72 | self.log('Saved file %s' % filename)
73 |
--------------------------------------------------------------------------------
/bitcointalk_ANN/spiders/old_files/bitcointalk_spider_test.py:
--------------------------------------------------------------------------------
1 | from bs4 import BeautifulSoup
2 | import os
3 | import urllib2
4 | from lxml import etree
5 | import scrapy
6 | import add_css
7 | import re
8 | from collections import Counter
9 |
10 | # Prompt the user for input (via command prompt)
11 | #crypto_currency = raw_input("Enter the name of the crypto economic protocol: ").lower()
12 | #print(r'Parsing ' + r'"https://coinmarketcap.com/currencies/' + crypto_currency + r'"...')
13 |
14 | # Get base url from coinmarketcap.com
15 | crypto_currency = 'pinkcoin'
16 |
17 | response = urllib2.urlopen(r'https://coinmarketcap.com/currencies/' + crypto_currency)
18 | soup = BeautifulSoup(response, 'lxml')
19 | base_url = soup.find('a', href=True, text='Announcement')['href']
20 |
21 | # Extract the number of pages in the bitcointalk.com thread
22 | forum_response = urllib2.urlopen(base_url)
23 | html_parser = etree.HTMLParser()
24 | tree = etree.parse(forum_response, html_parser)
25 | index_table = tree.xpath('//div[@id="bodyarea"]/table')[0]
26 |
27 | num_pages = []
28 | for x in index_table.xpath('./tr/td/a/text()'):
29 | try:
30 | num_pages.append(int(x))
31 | except ValueError:
32 | pass
33 | num_pages = max(num_pages)
34 | urls = [base_url] + [base_url[:-1] + str(int(20 * (i - 1))) for i in range(2, num_pages + 1)]
35 |
36 |
37 | class BitcointalkSpider(scrapy.Spider):
38 | name = "bitcointalkTest"
39 |
40 | def start_requests(self):
41 |
42 | # Delete html file for the crypto-currency if exists
43 | try:
44 | base = r'C:\Users\Shasa\PycharmProjects\bitcointalk\bitcointalk_ANN'
45 | path = os.path.join(base, (crypto_currency + r'.html'))
46 | os.remove(path)
47 | except OSError:
48 | pass
49 |
50 | with open(r'./style.html', 'r') as f:
51 | style = f.read()
52 | f.close()
53 |
54 | with open(crypto_currency + '.html', 'a') as f:
55 | f.write(style)
56 | f.close()
57 |
58 | # Parse urls
59 | for i, url in enumerate(urls):
60 | yield scrapy.Request(url=url, meta={'priority': i}, callback=self.parse, )
61 |
62 | def parse(self, response):
63 |
64 | # We only want user posts (no ads, deleted posts etc)
65 | table = response.xpath('//div[@id="bodyarea"]/form[@id="quickModForm"]/table')[0]
66 | rows = list(table.xpath('./tr'))
67 | joined = ''.join([str(row) for row in rows])
68 | results = re.findall(r'<tr class="[\w]+">', joined)