├── .gitignore ├── LICENSE ├── README.md ├── quora ├── Project_Quora │ ├── Project_Quora │ │ ├── __init__.py │ │ ├── items.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── quora.py │ │ │ └── topic │ │ │ ├── Computer-Programming.txt │ │ │ └── topic.py │ └── scrapy.cfg ├── README.md ├── quora.bat ├── requirements.txt └── topic.bat ├── scn ├── README.md ├── RUN.bat ├── discretizer │ ├── RUN.bat │ ├── discretization.py │ └── scn_discretizer.py ├── requirements.txt └── scnscraper │ ├── dataStoring.py │ ├── items.py │ ├── main.py │ └── scraper.py └── yahoo-answers ├── README.md ├── discretizer ├── discretizer.py └── main.py ├── requirements.txt ├── yahooscraper.sh ├── yahooscraper └── yahooscraper │ ├── scrapy.cfg │ └── yahooscraper │ ├── __init__.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ ├── YahooScraper.py │ ├── __init__.py │ └── example_database.pdl ├── yahoourlextractor.sh └── yahoourlextractor ├── YahooUrlSearcher ├── __init__.py ├── items.py ├── pipelines.py ├── settings.py └── spiders │ ├── __init__.py │ └── yahoourlspider.py └── scrapy.cfg /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | .idea/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | .idea/ 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *,cover 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ 60 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Collaborative Development Group 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # qa-scrapers 2 | 3 | A collection of Python scripts that leverage Selenium and/or Scrapy to scrape content from Question Answering sites, such as, and other than, Stack Overflow. 4 | 5 | So far, three scrapers are available: 6 | * [Yahoo! Answers](yahoo-answers/README.md) 7 | * [Quora](quora/README.md) 8 | * [SAP Community Network](scn/README.md) 9 | 10 | Please refer to the README.md files within each subfolder for more details. 11 | 12 | ## Fair use policy 13 | Please cite the following paper if you decide to use these scripts for your own research purposes. 14 | 15 | > F. Calefato, F. Lanubile, N. Novielli. “[Moving to Stack Overflow: Best-Answer Prediction in Legacy Developer Forums](http://collab.di.uniba.it/fabio/wp-content/uploads/sites/5/2014/05/a13-calefato.pdf).” In *Proc. 10th Int'l Symposium on Empirical Softw. Eng. and Measurement (ESEM'16)*, Ciudad Real, Spain, Sept. 8-9, 2016, DOI:[10.1145/2961111.2962585](http://doi.acm.org/10.1145/2961111.2962585). 16 | 17 | ```latex 18 | @inproceedings{calefato_2016_esem, 19 | author = {Calefato, Fabio and Lanubile, Filippo and Novielli, Nicole}, 20 | title = {Moving to Stack Overflow: Best-Answer Prediction in Legacy Developer Forums}, 21 | booktitle = {Proc. of the 10th ACM/IEEE Int'l Symposium on Empirical Software Engineering and Measurement}, 22 | series = {ESEM '16}, 23 | year = {2016}, 24 | isbn = {978-1-4503-4427-2}, 25 | location = {Ciudad Real, Spain}, 26 | pages = {13:1--13:10}, 27 | articleno = {13}, 28 | numpages = {10}, 29 | url = {http://doi.acm.org/10.1145/2961111.2962585}, 30 | doi = {10.1145/2961111.2962585}, 31 | acmid = {2962585}, 32 | publisher = {ACM}, 33 | address = {New York, NY, USA}, 34 | keywords = {Best-answer prediction, Developer forums, Q\&A sites, Stack Overflow}, 35 | } 36 | ``` 37 | 38 | -------------------------------------------------------------------------------- /quora/Project_Quora/Project_Quora/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collab-uniba/qa-scrapers/b26ece3f210d3dcdfd7f2045193e3258cae5b4b4/quora/Project_Quora/Project_Quora/__init__.py -------------------------------------------------------------------------------- /quora/Project_Quora/Project_Quora/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class ProjectQuoraItem(scrapy.Item): 12 | uid = scrapy.Field() # Id of a question (e.g., 1), Id of an answer (e.g., 1.1) 13 | type = scrapy.Field() # question, answer 14 | author = scrapy.Field() # author of a question or an answer 15 | title = scrapy.Field() # title of a question, null for an answer 16 | text = scrapy.Field() # text of a question or an answer 17 | date_time = scrapy.Field() # when a question or an answer was written 18 | tags = scrapy.Field() # topics associated with the question, null for an answer 19 |
views = scrapy.Field() # views of a questions or an answer 20 | answers = scrapy.Field() # number of answers for a question, 0 for answers 21 | resolve = scrapy.Field() # always null 22 | upvotes = scrapy.Field() # likes for a question (null) or an answers 23 | url = scrapy.Field() # url of a question or an answer 24 | -------------------------------------------------------------------------------- /quora/Project_Quora/Project_Quora/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | from pydblite import Base 9 | import os 10 | import json 11 | from scrapy.xlib.pydispatch import dispatcher 12 | from scrapy import signals 13 | 14 | 15 | class DBPipeline(object): 16 | # Pipeline to write an Item in the database 17 | def open_spider(self, spider): 18 | # Creation of DB 19 | self.db = Base(spider.database) 20 | self.db.create('uid', 'type', 'author', 'title', 'text', 'date_time', 21 | 'tags', 'views', 'answers', 'resolve', 'upvotes', 'url', 22 | mode="override") 23 | dispatcher.connect(self.spider_closed, signals.spider_closed) 24 | 25 | def process_item(self, item, spider): 26 | # Writing of the item 27 | self.db.insert(uid=item['uid'], 28 | type=item['type'], 29 | author=item['author'], 30 | title=item['title'], 31 | text=item['text'], 32 | date_time=item['date_time'], 33 | tags=item['tags'], 34 | views=item['views'], 35 | answers=item['answers'], 36 | resolve=item['resolve'], 37 | upvotes=item['upvotes'], 38 | url=item['url'] 39 | ) 40 | 41 | self.db.commit() 42 | return item 43 | 44 | def spider_closed(self, spider): 45 | # Number of items saved, shown at the end 46 | i = 0 47 | j = 0 48 | for r in self.db: 49 | 50 | if r["type"] == "question": 51 | i += 1 52 | else: 53 | j += 1 54 | 55 | print ('Number of questions and answers found:') 56 | print (str(i) + ' questions \n') 57 | print (str(j) + ' answers \n') 58 | 59 | 60 | class JsonWriterPipeline(object): 61 | # Pipeline to write an Item in Json File 62 | def __init__(self): 63 | if os.path.exists('items.json'): 64 | os.remove('items.json') 65 | 66 | self.file = open('items.json', 'wb') 67 | dispatcher.connect(self.spider_closed, signals.spider_closed) 68 | 69 | def process_item(self, item, spider): 70 | line = json.dumps(dict(item)) + "\n" 71 | self.file.write(line) 72 | return item 73 | 74 | def spider_closed(self, spider): 75 | self.file.close() 76 | -------------------------------------------------------------------------------- /quora/Project_Quora/Project_Quora/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for Project_Quora project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'Project_Quora' 13 | 14 | SPIDER_MODULES = ['Project_Quora.spiders'] 15 | NEWSPIDER_MODULE = 'Project_Quora.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'Project_Quora (+http://www.yourdomain.com)' 20 | 21 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 22 | CONCURRENT_REQUESTS=1 23 | 24 | # Configure a delay for requests for the same website (default: 0) 25 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 26 | # See also autothrottle settings and docs 27 | DOWNLOAD_DELAY=0.5 28 | # The download delay setting will honor only one of: 29 | #CONCURRENT_REQUESTS_PER_DOMAIN=16 30 | #CONCURRENT_REQUESTS_PER_IP=16 31 | 32 | # Disable cookies (enabled by default) 33 | COOKIES_ENABLED=True 34 | 35 | # Disable Telnet Console (enabled by default) 36 | #TELNETCONSOLE_ENABLED=False 37 | 38 | # Override the default request headers: 39 | #DEFAULT_REQUEST_HEADERS = { 40 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 41 | # 'Accept-Language': 'en', 42 | #} 43 | 44 | # Enable or disable spider middlewares 45 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 46 | #SPIDER_MIDDLEWARES = { 47 | # 'Project_Quora.middlewares.MyCustomSpiderMiddleware': 543, 48 | #} 49 | 50 | # Enable or disable downloader middlewares 51 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 52 | #DOWNLOADER_MIDDLEWARES = { 53 | # 'Project_Quora.middlewares.MyCustomDownloaderMiddleware': 543, 54 | #} 55 | 56 | # Enable or disable extensions 57 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 58 | #EXTENSIONS = { 59 | # 'scrapy.telnet.TelnetConsole': None, 60 | #} 61 | 62 | # Configure item pipelines 63 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 64 | ITEM_PIPELINES = { 65 | 'Project_Quora.pipelines.DBPipeline': 300, 66 | 'Project_Quora.pipelines.JsonWriterPipeline': 800, 67 | } 68 | 69 | # Enable and configure the AutoThrottle extension (disabled by default) 70 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 71 | # NOTE: AutoThrottle will honour the standard settings for concurrency and delay 72 | #AUTOTHROTTLE_ENABLED=True 73 | # The initial download delay 74 | #AUTOTHROTTLE_START_DELAY=5 75 | # The maximum download delay to be set in case of high latencies 76 | #AUTOTHROTTLE_MAX_DELAY=60 77 | # Enable showing throttling stats for every response received: 78 | #AUTOTHROTTLE_DEBUG=False 79 | 80 | # Enable and configure HTTP caching (disabled by default) 81 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 82 | #HTTPCACHE_ENABLED=True 83 | #HTTPCACHE_EXPIRATION_SECS=0 84 | #HTTPCACHE_DIR='httpcache' 85 | #HTTPCACHE_IGNORE_HTTP_CODES=[] 86 | #HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage' 87 | 88 | -------------------------------------------------------------------------------- /quora/Project_Quora/Project_Quora/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of 
your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /quora/Project_Quora/Project_Quora/spiders/quora.py: -------------------------------------------------------------------------------- 1 | from random import randint 2 | import time 3 | import platform 4 | import scrapy 5 | import glob 6 | import html2text 7 | import parsedatetime as pdt 8 | from selenium.webdriver import DesiredCapabilities 9 | from selenium.webdriver.common.by import By 10 | from selenium.webdriver.support.ui import WebDriverWait 11 | from selenium.webdriver.support import expected_conditions as ec 12 | from selenium import webdriver 13 | from selenium.common.exceptions import NoSuchElementException 14 | from selenium.common.exceptions import TimeoutException 15 | import codecs 16 | from ..items import ProjectQuoraItem 17 | import re 18 | from scrapy import signals 19 | from scrapy.xlib.pydispatch import dispatcher 20 | 21 | 22 | class QuoraSpider(scrapy.Spider): 23 | name = "quora" # Name of Spider 24 | allowed_domains = ["quora.com"] 25 | uid = 0 # Id of question-thread 26 | list_topic = [] 27 | database = '' 28 | 29 | # Creation of the list of topics 30 | if "Windows" == platform.system(): 31 | list_of_files = glob.glob('Topic/*.txt') 32 | else: 33 | list_of_files = glob.glob('Topic\*.txt') 34 | 35 | for filename in list_of_files: 36 | lines = open(filename, "r").readlines() 37 | for line in lines: 38 | list_topic.append("<" + line.rstrip('\n') + ">") 39 | 40 | list_topic = set(list_topic) 41 | 42 | def __init__(self, *args, **kwargs): 43 | super(QuoraSpider, self).__init__(*args, **kwargs) 44 | # Arguments passed through the batch file quora.bat 45 | self.database = kwargs.get('database') + '.pdl' 46 | email = kwargs.get('email') 47 | passw = kwargs.get('password') 48 | 49 | # When Spider quits will call the function spider_closed() 50 | dispatcher.connect(self.spider_closed, signals.spider_closed) 51 | 52 | # Opening PhantomJS webdriver with certain settings 53 | options = ['--proxy-type=none', '--load-images=false'] 54 | if platform.system() == "Windows": 55 | self.driver = webdriver.PhantomJS(service_args=options) 56 | else: 57 | self.driver = webdriver.PhantomJS(executable_path='./phantomjs', 58 | service_args=options) 59 | self.driver.set_window_size(1920, 1080) 60 | self.wait = WebDriverWait(self.driver, 60) 61 | 62 | # Access to Quora and Login 63 | self.driver.get("http://www.quora.com/") 64 | self.driver.refresh() 65 | time.sleep(2) 66 | 67 | print ('Login to Quora..') 68 | while True: 69 | # Entering your username and password 70 | form = self.driver.find_element_by_class_name('login') 71 | 72 | username = form.find_element_by_name('email') 73 | username.send_keys(email) 74 | time.sleep(2) 75 | password = form.find_element_by_name('password') 76 | password.send_keys(passw) 77 | 78 | time.sleep(2) 79 | form.find_element_by_xpath( 80 | ".//input[contains(@value, 'Login')]").click() 81 | time.sleep(2) 82 | 83 | try: 84 | if self.driver.find_element_by_css_selector( 85 | 'div[id*="_error"]').is_displayed(): 86 | self.driver.refresh() 87 | print ('Login Error.Retry') 88 | email = raw_input("Insert username: ") 89 | passw = raw_input("Insert password: ") 90 | except NoSuchElementException: 91 | break 92 | 93 | def start_requests(self): 94 | # Request for parsing the '/all-questions' section of a topic 95 | 96 | for filename in self.list_of_files: 97 | 
filename = filename.replace('\\', '') 98 | filename = filename.replace('/', '') 99 | filename = filename.replace('Topic', '') 100 | filename = filename.replace('.txt', '') 101 | yield scrapy.Request('https://www.quora.com/topic/' + 102 | filename + '/all_questions', self.parse) 103 | 104 | def spider_closed(self, spider): 105 | self.driver.close() 106 | 107 | def parse(self, response): 108 | # Opening the '/all-questions' section of a topic 109 | self.driver.get(response.url) 110 | 111 | old_position = self.driver.execute_script( 112 | "return document.body.scrollHeight") 113 | 114 | # Scroll-down with with Selenium 115 | while True: 116 | self.driver.execute_script( 117 | "window.scrollTo(0, document.body.scrollHeight);") 118 | 119 | # Visibility of feedback at the bottom of the page after the scroll 120 | # Wait until is visible 121 | if self.driver.find_element_by_xpath( 122 | '//div[contains(@class,"pager_next")]').is_displayed(): 123 | try: 124 | self.wait.until(ec.invisibility_of_element_located( 125 | (By.CLASS_NAME, "pager_next"))) 126 | except TimeoutException: 127 | self.driver.refresh() 128 | 129 | time.sleep(1) 130 | new_pos = self.driver.execute_script( 131 | "return document.body.scrollHeight") 132 | 133 | # Check the size of the page 134 | # If the dimensions are the same, stop the scroll-down 135 | if new_pos == old_position: 136 | sleep = 0 137 | self.driver.execute_script( 138 | "$('html,body').animate({scrollTop: 0}, 2000);") 139 | time.sleep(randint(4, 9)) 140 | 141 | while self.driver.execute_script( 142 | "return document.body.scrollHeight") == old_position \ 143 | and sleep != 100: 144 | self.driver.execute_script( 145 | "window.scrollTo(0, document.body.scrollHeight);") 146 | time.sleep(1) 147 | sleep += 1 148 | 149 | if sleep == 100: 150 | break 151 | 152 | old_position = self.driver.execute_script( 153 | "return document.body.scrollHeight") 154 | post_elems = self.driver.find_elements_by_class_name( 155 | "pagedlist_item") 156 | print ('Question found: ' + str(len(post_elems))) 157 | 158 | # Extraction of urls questions with selectors 159 | post_elems = self.driver.find_elements_by_class_name("pagedlist_item") 160 | url_list = [] 161 | for post in post_elems: 162 | url_list.append(post.find_element_by_xpath( 163 | './/a[contains(@class,"question_link")]') 164 | .get_attribute('href')) 165 | url_list = set(url_list) 166 | 167 | # Request for parsing the question-thread 168 | for url in url_list: 169 | url_scrapy = response.urljoin(url) 170 | yield scrapy.Request(url_scrapy, callback=self.parse_question) 171 | 172 | def parse_question(self, response): 173 | # Creation of the list of tags of the question 174 | tag_string = "" 175 | tags = response.xpath('//div[contains(@class,' + 176 | '"QuestionTopicHorizontalList TopicList")]' + 177 | '//span[contains(@class,' + 178 | ' "TopicNameSpan TopicName")]/text()').extract() 179 | for tag in tags: 180 | tag_string = tag_string + "<" + tag.encode('utf8') + "> " 181 | 182 | found = False 183 | for topic in self.list_topic: 184 | if topic in tag_string: 185 | found = True 186 | break 187 | ''' 188 | The question will be scanned if it has at least one topic in list_topic 189 | ''' 190 | if found: 191 | # Related questions 192 | url_related = response.xpath('//li[contains(@class,' + 193 | '"related_question")]' + 194 | '//a[contains(@class, ' + 195 | '"question_link")]/@href').extract() 196 | # Request for parsing the related question-threads 197 | for url in url_related: 198 | url_scrapy = response.urljoin(url) 199 | yield 
scrapy.Request(url_scrapy, callback=self.parse_question) 200 | 201 | # Page loading of question-thread 202 | self.driver.get(response.url) 203 | right_content = self.driver. \ 204 | find_element_by_xpath('//div[contains(@class,' + 205 | '"HighlightsSection SimpleToggle ' + 206 | 'Toggle")]') 207 | # Show the content of a Rigth Side bar 208 | try: 209 | if right_content.find_element_by_xpath( 210 | './/span/a[contains(@class,"expand_link")]') \ 211 | .is_displayed(): 212 | 213 | more_btn = right_content.find_element_by_xpath( 214 | './/span/a[contains(@class,"expand_link")]') 215 | 216 | while True: 217 | try: 218 | self.wait.until(ec.element_to_be_clickable( 219 | (By.XPATH, 220 | '//span/a[contains(@class,"expand_link")]'))) 221 | break 222 | except TimeoutException: 223 | self.driver.refresh() 224 | 225 | webdriver.ActionChains(self.driver).move_to_element( 226 | more_btn).click(more_btn).perform() 227 | 228 | self.wait.until(ec.invisibility_of_element_located( 229 | (By.XPATH, more_btn))) 230 | time.sleep(1) 231 | 232 | right_content = self.driver.find_element_by_xpath( 233 | '//div[contains(@class,' + 234 | '"QuestionPageRightLoggedInSidebar")]') 235 | right_content = right_content.find_element_by_css_selector( 236 | 'div[id*="_expanded"]') 237 | 238 | except NoSuchElementException: 239 | right_content = self.driver.find_element_by_xpath( 240 | '//div[contains(@class,' + 241 | '"QuestionPageRightLoggedInSidebar")]') 242 | right_content = right_content.find_element_by_css_selector( 243 | 'div[id*="__truncated"]') 244 | 245 | # Set the properties of Html2text 246 | item_list = [] 247 | h = html2text.HTML2Text() 248 | h.emphasis = True 249 | h.bypass_tables = False 250 | h.ignore_emphasis = False 251 | h.body_width = 0 252 | h.single_line_break = True 253 | h.bypass_tables = False 254 | h.ignore_images = False 255 | h.images_with_size = True 256 | h.inline_links = True 257 | h.protect_links = True 258 | 259 | # Set the properties Parsedatetime 260 | c = pdt.Constants() 261 | c.YearParseStyle = 0 262 | c.DOWParseStyle = 0 263 | c.CurrentDOWParseStyle = True 264 | p = pdt.Calendar(c) 265 | f = '%Y-%m-%d %H:%M:%S' 266 | 267 | self.uid += 1 268 | try: 269 | answers = self.driver.find_elements_by_xpath( 270 | '//div[contains(@class, "Answer AnswerBase")]') 271 | except NoSuchElementException: 272 | answers = [] 273 | 274 | if len(answers) > 0: 275 | old_position = self.driver.execute_script( 276 | "return document.body.scrollHeight") 277 | 278 | # Scroll the page of question-thread 279 | while True: 280 | self.driver.execute_script( 281 | "window.scrollTo(0, document.body.scrollHeight);") 282 | if self.driver.find_element_by_xpath( 283 | '//div[contains(@class,"pager_next")]') \ 284 | .is_displayed(): 285 | try: 286 | self.wait.until(ec.invisibility_of_element_located( 287 | (By.CLASS_NAME, "pager_next"))) 288 | except TimeoutException: 289 | self.driver.refresh() 290 | 291 | time.sleep(1) 292 | new_pos = self.driver.execute_script( 293 | "return document.body.scrollHeight") 294 | if new_pos == old_position: 295 | break 296 | old_position = self.driver.execute_script( 297 | "return document.body.scrollHeight") 298 | 299 | grid = self.driver.find_element_by_class_name('AnswerListDiv') 300 | answers = grid.find_elements_by_xpath( 301 | './/div[contains(@class, "Answer AnswerBase")]') 302 | try: 303 | self.wait.until(ec.invisibility_of_element_located( 304 | (By.CLASS_NAME, "toggled_spinner"))) 305 | except TimeoutException: 306 | pass 307 | time.sleep(0.5) 308 | 309 | # Creation of ITEM QUESTION 
310 | itemquest = ProjectQuoraItem() 311 | question = self.driver.find_element_by_class_name('QuestionArea') 312 | 313 | itemquest['uid'] = str(self.uid) 314 | itemquest['type'] = "question" 315 | try: 316 | author = right_content.find_element_by_xpath( 317 | './/div[contains(@class, "FollowerFacepile clearfix")]' + 318 | '//img[contains(@class, "profile_photo_img")]') 319 | itemquest['author'] = author.get_attribute('alt').encode( 320 | 'utf8', 'ignore') 321 | except NoSuchElementException: 322 | itemquest['author'] = "Anonymous" 323 | pass 324 | 325 | try: 326 | for elem in right_content.find_elements_by_xpath( 327 | './/div[contains(@class, "HighlightRow")]'): 328 | if " View" in elem.text.encode('utf8'): 329 | view = elem.text.encode('utf8') 330 | view = re.match(r'(.*) View.*', view) 331 | itemquest['views'] = int( 332 | view.group(1).replace(',', '')) 333 | except NoSuchElementException: 334 | itemquest['views'] = 0 335 | pass 336 | 337 | try: 338 | date_time = right_content.find_element_by_xpath( 339 | './/div[contains(@class, "HighlightRow AskedRow")]') \ 340 | .text.encode('utf8') 341 | date_time = re.sub(re.compile('Last asked: '), '', date_time) 342 | data_format = p.parseDT(date_time) 343 | itemquest['date_time'] = data_format[0].strftime(f) 344 | except NoSuchElementException: 345 | itemquest['date_time'] = '0000-00-00 00:00:00' 346 | pass 347 | 348 | try: 349 | itemquest['title'] = question.find_element_by_xpath( 350 | './/span[contains(@class, "inline_editor_value")]/h1') \ 351 | .text.encode('utf8', 'ignore') 352 | except NoSuchElementException: 353 | itemquest['title'] = 'null' 354 | pass 355 | 356 | try: 357 | content = question.find_element_by_css_selector( 358 | 'div[id*="full_text"]') 359 | 360 | # Inserting markdown to delimit the code 361 | html_string = content.get_attribute('innerHTML') 362 | html_string = re.sub( 363 | re.compile('.*?', re.DOTALL), '', 364 | html_string) 365 | html_string = re.sub(r'
<pre>    (.*?)</pre>', 366 | r'```\1```', html_string) 367 | html_string = re.sub( 368 | r'<code>(.*?)</code>', 369 | r'`\1`', html_string) 370 | html_string = html_string.replace('<br>', '') 371 | html_string = html_string.replace('<br/>', '') 372 | html_string = re.sub(r'\[code\](.*?)\[/code\]', r'```\1```', 373 | html_string) 374 | html_string = re.sub(r'<pre>(.*?)</pre>', 375 | r'```\1```', html_string) 376 | html_string = re.sub( 377 | r'<code>(.*?)</code>
', 378 | r'`\1`', html_string) 379 | 380 | if (h.handle(html_string) != '\n\n' or 381 | h.handle(html_string != '\n')): 382 | itemquest['text'] = h.handle(html_string) \ 383 | .encode('utf8', 'ignore') 384 | else: 385 | itemquest['text'] = 'null' 386 | except NoSuchElementException: 387 | itemquest['text'] = 'null' 388 | pass 389 | 390 | itemquest['tags'] = tag_string.encode('utf8') 391 | itemquest['answers'] = len(answers) 392 | itemquest['resolve'] = 'null' 393 | itemquest['upvotes'] = 0 394 | itemquest['url'] = response.url 395 | 396 | item_list.append(itemquest) 397 | 398 | # Creation of N-ITEM ANSWER 399 | if len(answers) > 0: 400 | i = 1 401 | for ans in answers: 402 | itemans = ProjectQuoraItem() 403 | itemans['uid'] = str(self.uid) + "." + str(i) 404 | itemans['type'] = "answer" 405 | 406 | try: 407 | itemans['author'] = ans.find_element_by_xpath( 408 | './/img[contains(@class, "profile_photo_img")]') \ 409 | .get_attribute('alt').encode('utf8', 'ignore') 410 | except NoSuchElementException: 411 | itemans['author'] = "Anonymous" 412 | pass 413 | 414 | itemans['title'] = 'null' 415 | 416 | try: 417 | if ans.find_element_by_xpath( 418 | './/a[contains(@class, "more_link")]') \ 419 | .is_displayed(): 420 | more = ans.find_element_by_xpath( 421 | './/a[contains(@class, "more_link")]') 422 | self.driver.execute_script( 423 | "arguments[0].scrollIntoView(true);", more) 424 | self.driver.execute_script( 425 | "window.scrollBy(0,-250);") 426 | 427 | webdriver.ActionChains(self.driver) \ 428 | .move_to_element(more) \ 429 | .click(more).perform() 430 | 431 | self.wait.until(ec.invisibility_of_element_located( 432 | (By.CLASS_NAME, 'loading'))) 433 | time.sleep(1) 434 | except NoSuchElementException: 435 | pass 436 | 437 | try: 438 | content = ans.find_element_by_class_name( 439 | 'inline_editor_value') 440 | 441 | # Inserting markdown to delimit the code 442 | html_string = content.get_attribute('innerHTML') 443 | html_string = re.sub(re.compile( 444 | '
.*?
', 445 | re.DOTALL), '', html_string) 446 | html_string = re.sub( 447 | re.compile('.*?', 448 | re.DOTALL), '', html_string) 449 | html_string = re.sub(re.compile( 450 | '
', 451 | re.DOTALL), '', html_string) 452 | html_string = re.sub( 453 | '', 454 | '', html_string) 455 | html_string = re.sub( 456 | r'
<pre>    (.*?)</pre>', 457 | r'```\1```', html_string) 458 | html_string = re.sub( 459 | r'<code>(.*?)</code>', 460 | r'`\1`', html_string) 461 | html_string = html_string.replace('<br>', '') 462 | html_string = html_string.replace('<br/>', '') 463 | html_string = re.sub(r'\[code\](.*?)\[/code\]', 464 | r'```\1```', html_string) 465 | html_string = re.sub(r'<pre>(.*?)</pre>', 466 | r'```\1```', html_string) 467 | html_string = re.sub( 468 | r'<code>(.*?)</code>
', 469 | r'`\1`', html_string) 470 | 471 | itemans['text'] = h.handle(html_string). \ 472 | encode('utf8', 'ignore') 473 | except NoSuchElementException: 474 | itemans['text'] = 'null' 475 | pass 476 | 477 | try: 478 | date_time = content.find_element_by_class_name( 479 | 'answer_permalink').text.encode('utf8') 480 | date_time = re.sub(re.compile('Written '), '', 481 | date_time) 482 | date_time = re.sub(re.compile('Updated '), '', 483 | date_time) 484 | data_format = p.parseDT(date_time) 485 | itemans['date_time'] = data_format[0].strftime(f) 486 | except NoSuchElementException: 487 | itemans['date_time'] = '0000-00-00 00:00:00' 488 | pass 489 | 490 | itemans['tags'] = 'null' 491 | views = ans.find_element_by_class_name( 492 | 'CredibilityFact').text.encode('utf8') 493 | 494 | try: 495 | if 'k' in views: 496 | match = re.search(r'(.*?)k Views', views) 497 | views = int(float(match.group(1)) * 1000) 498 | else: 499 | match = re.search(r'(.*?) Views', views) 500 | views = int(match.group(1)) 501 | except AttributeError: 502 | views = 0 503 | pass 504 | 505 | itemans['views'] = views 506 | itemans['answers'] = 0 507 | itemans['resolve'] = 'null' 508 | 509 | upvotes = ans.find_element_by_xpath( 510 | './/div[contains(@class,"action_bar_inner")]' + 511 | '/span/a/span[2]').text.encode('utf8') 512 | 513 | if len(upvotes) > 0: 514 | if 'k' in upvotes: 515 | upvotes = re.sub(re.compile('k'), '', upvotes) 516 | upvotes = int(float(upvotes) * 1000) 517 | itemans['upvotes'] = upvotes 518 | else: 519 | itemans['upvotes'] = int(upvotes) 520 | else: 521 | itemans['upvotes'] = 0 522 | 523 | itemans['url'] = ans.find_element_by_class_name( 524 | 'answer_permalink').get_attribute('href') \ 525 | .encode('utf8') 526 | 527 | i += 1 528 | item_list.append(itemans) 529 | 530 | # Release of the items instantiated 531 | for item in item_list: 532 | yield item 533 | print "\n" 534 | -------------------------------------------------------------------------------- /quora/Project_Quora/Project_Quora/spiders/topic/Computer-Programming.txt: -------------------------------------------------------------------------------- 1 | npm (package manager) 2 | GPU Computation 3 | Ruby on Rails (web framework) 4 | contentEditable 5 | Glasgow Haskell Compiler 6 | Object-Oriented Software Construction 7 | Practice of Computer Programming 8 | Specific Projects Using Clojure 9 | Browser Compatibility 10 | Mercurial (software) 11 | Smalltalk (programming language) 12 | Learning C++ 13 | CSS Rotate 14 | JavaScript Minification 15 | Iframes 16 | Facebook Bots 17 | PySparse 18 | Fluid Layout 19 | Psyco (Python compiler) 20 | Appectual IT Solutions 21 | OpenGL 22 | HipHop for PHP 23 | USA Computing Olympiad (USACO) 24 | Meteor (Javascript platform) 25 | JavaScript Libraries 26 | PHP Performance 27 | Learning Ruby 28 | Jackson JSON Processor 29 | DirectCompute 30 | Capybara (testing framework) 31 | Python 3 32 | Debugging 33 | ECMAScript 3 34 | ECMAScript 2 35 | ECMAScript 1 36 | ECMAScript 7 37 | ECMAScript 6 38 | ECMAScript 5 39 | Sencha Touch 40 | JIT 41 | Groovy (programming language) 42 | Loop (programming) 43 | HBase 44 | Akka 45 | Learning BASIC 46 | JSON-LD 47 | Sample Code 48 | DataMapper 49 | Java Mobile Apps 50 | Hydra 51 | Pure (Programming Language) 52 | URL Rewriting 53 | Spring Framework 54 | Scalaz 55 | Nitro (JavaScript engine) 56 | Recurse Center 57 | Web Application Frameworks 58 | Programming JavaScript Applications (2014 book) 59 | Codeforces 60 | Backbone.js 61 | Blockly 62 | JavaScript Application Development 63 | 
ECMAScript Proxies 64 | Python Programming 65 | GitHub Student Developer Pack 66 | Django 1.4 67 | EventMachine 68 | Learning Processing 69 | ECMAScript 70 | SpiderMonkey (JavaScript engine) 71 | Pyramid (web framework) 72 | NowJS 73 | Head First JavaScript Programming (2014 book) 74 | GNU grep 75 | AMQP 76 | Programming Bootcamps 77 | Quixey Challenge 78 | Web Scraping 79 | Berkeley DB 80 | Ruby Blocks 81 | HTML 82 | Cascalog 83 | Twilio 84 | Serialization 85 | CouchApps 86 | Threading in Python 87 | NOLOH 88 | Modula-3 89 | YUI (Yahoo! User Interface) Library 90 | Fabric (software) 91 | Microdata 92 | Competitive Programmers 93 | Haskell in Industry 94 | Node.js Modules 95 | jclouds 96 | HTML5 Canvas Element 97 | Titan (graph database) 98 | Brian Bi 99 | Brogrammers 100 | Apache 2.0 License 101 | Esotech 102 | Bit Manipulations 103 | DirectX 104 | Verilog 105 | Regular Expressions in Python 106 | Cucumber (BDD framework) 107 | double (data type) 108 | MetroTwit 109 | Hour of Code 2013 110 | YourKit 111 | Orange (Python library) 112 | Python (programming language) 113 | Scaloid (library) 114 | WHATWG 115 | Scratch (programming language) 116 | Web Development Comparison 117 | Twilio Apps 118 | Semantic MediaWiki 119 | TopCoder 120 | D3.js (JavaScript library) 121 | C (programming language) 122 | APL (programming language) 123 | Knockout (JavaScript framework) 124 | Scripting Languages 125 | WSGI Middleware 126 | Pastek 127 | Fast Inverse Square Root 128 | Yii 129 | Computer Programmers 130 | Training for Competitive Programming 131 | Music APIs 132 | MashupXFeed 133 | Dnode 134 | PyCascading 135 | CherryPy 136 | Parallel Patterns Library (Visual C++) 137 | DirectX 11 138 | YUIDoc 139 | Carakan (JavaScript engine) 140 | Language-Specific Cloud APIs 141 | Codecademy 142 | Programming Libraries 143 | JSP 144 | AngularJS 145 | Emacs Lisp 146 | Rexster (Tinkerpop) 147 | Prolog 148 | ECMAScript Implementations 149 | Erlang (programming language) 150 | Indonesia's Selection Process for IOI 151 | Google Scholar API 152 | Mashery 153 | Flatiron School 154 | Xoops 155 | ASP.NET 156 | Static Code Analysis 157 | JavaScript Engines 158 | API Management 159 | Tawesoft 160 | Tomorrow People (company) 161 | Learning COBOL 162 | Rhino (JavaScript) 163 | C++11 (programming language) 164 | ABAP 165 | Microformats 166 | Yukihiro Matsumoto 167 | WebLogic 168 | Learning to Program 169 | JSONP 170 | Neo4j 171 | Major Concepts in Programming Languages 172 | ScriptRock 173 | PyQt 174 | Objective-C (programming language) 175 | fmdb 176 | enStratus API 177 | HTML5 Mobile 178 | XHP 179 | Programming Advice 180 | Online Judges 181 | Processing.js 182 | Twitter Streaming API 183 | Boot Loaders 184 | SWIG (software) 185 | Regular Expressions (computing) 186 | Groovy Frameworks 187 | lxml 188 | Web Programming Languages 189 | Twitter OAuth 190 | Apache Qpid 191 | Suggestions for an Ideal Website 192 | Sexism and Turmoil at GitHub (March 2014) 193 | Github Corporate Affairs 194 | MailChimp (product) 195 | Software Transactional Memory 196 | PHP Developers 197 | Google+ API 198 | vCloud 199 | Learning Java 200 | Prototype (framework) 201 | Principles of Object-Oriented Programming in JavaScript (2014 book) 202 | ACM-ICPC 203 | StackBlaze 204 | Beautiful Soup 205 | Hack Reactor 206 | Regular Expressions in JavaScrpt 207 | Semantics (computer science) 208 | Investing in Github 209 | ECMAScript 4 210 | RequireJS 211 | WebSockets 212 | LuaJIT 213 | CSS3 214 | Python 2.6 215 | Python 2.7 216 | Git (revision control) 217 | Dirty 
Checking (programming) 218 | CSS Shadows 219 | Programming Interview Questions 220 | MadMimi 221 | JavaScript Application Design (2015 book) 222 | Dojo (JavaScript toolkit) 223 | Zope Object Database 224 | cascading.jruby 225 | MS Access VBA 226 | Rails 3.1 227 | Pearson APIs 228 | Jinja 229 | TurboGears 230 | ECMAScript 5.1 231 | Capistrano 232 | HTML5 Video 233 | Microsoft Application Programming Interface 234 | Using JavaScript with .NET 235 | Google Programming Contest 236 | How to Code X 237 | Netflix API 238 | Learning HTML 239 | Io (programming language) 240 | Test::Unit 241 | Ruby (programming language) 242 | Toronto SEO 243 | Lua (programming language) 244 | Drupal Commerce 245 | Java Specification Request 246 | Browser-based Games 247 | Web Application Architecture 248 | CodeEval 249 | Scikits 250 | Media Queries 251 | Object Oriented Data Technology 252 | Facebook and HTML5 253 | XSL FO 254 | Silverlight 255 | OpenCL 256 | Github Products and Services 257 | C-Based Programming Languages 258 | PhantomJS 259 | Ext JS 260 | Qubole 261 | Dylan (programming language) 262 | Functional Programming in Industry 263 | Competitive Programming 264 | WAMP 265 | Needlebase 266 | IronPython 267 | Autohotkey 268 | Topic Maps 269 | Programming Bootcamps in Boston 270 | Object (programming concept) 271 | Zotonic 272 | JavaScript (programming language) 273 | Zend Framework 274 | SimpleXML 275 | Kohana 276 | Flickr API 277 | Cloud9 IDE 278 | Webmachine 279 | Parsing (computer science) 280 | Learn You a Haskell (2011 book) 281 | jQuery 282 | Indonesia's National Olympiad in Informatics (OSN Informatika) 283 | Web Development Companies 284 | Velocity (JavaScript Animation Frame... 285 | ClojureScript 286 | Learning Swift 287 | eZ Teamroom 288 | App Academy 289 | Code Year 290 | Bottle (web framework) 291 | Clojure (programming language) 292 | Arrays (programming) 293 | Redev 294 | Play Framework 295 | SGML 296 | Software Libraries 297 | Ramaze 298 | WebSphere MQ 299 | Codecademy JavaScript Exercises 300 | Java Frameworks 301 | Method (computer programming) 302 | GitHub Issues 303 | Schizophrenia (programming concept) 304 | HTML5 WYSIWYG Editors 305 | Scraping Technology 306 | uWSGI 307 | Source Code 308 | ICEfaces 309 | Application Binary Interface 310 | Cassandra (database) 311 | ECMAScript Features 312 | .NET Framework 313 | LAMP (software bundle) 314 | Learning SQL 315 | Inheritance (Programming concept) 316 | Future of Web Development 317 | Mod_rewrite 318 | Twisted (software) 319 | Visual Basic 320 | Context.IO 321 | Elegant Code 322 | Gems (Ruby) 323 | eZ Find 324 | Cloud APIs 325 | Functional Programming 326 | Semantic HTML 327 | Alternatives to Twilio 328 | Nexmo API 329 | Delphi (Programming Language) Forums 330 | Hibernate (Java) 331 | Newbox Solutions 332 | Sammy 333 | WS-Factory 334 | Terrastore 335 | IDLs 336 | jQuery Mobile 337 | Siri API 338 | Testtopicforcodeblocks 339 | Pinax 340 | Scala Actors 341 | Learning C# 342 | ASP.NET MVC 343 | Preparing for International Olympiad in Informatics 344 | Quadrax (Tetris clone) 345 | eZ Community 346 | Tiny Frameworks 347 | Arduino and Processing 348 | Testing Frameworks 349 | Distributed Revision Control Systems 350 | Chakra (JavaScript engine) 351 | X10 (programming language) 352 | Ruby Koans 353 | Compojure 354 | Perl 5 355 | Responsive HTML5 Web Templates 356 | Markup Languages 357 | Mutexes 358 | Tumult, Inc. 
359 | F# (programming language) 360 | WebCreators.in 361 | Pointers (computer programming) 362 | SQLAlchemy 363 | Pylons (web framework) 364 | PHP (programming language) 365 | JRuby 366 | Commerce Kickstart 367 | Outlook VBA 368 | Mutual Exclusion (software) 369 | MusicBrainz 370 | Specific Countries' Selection Process for IOI 371 | Sequel (software) 372 | Paperclip Rails 373 | 140 Proof 374 | Go (programming language) 375 | Indexer (programming) 376 | HTML Email 377 | Sign in with Twitter 378 | Sinatra (software) 379 | github3.py 380 | Cake Software Foundation 381 | Google APIs 382 | Apache Thrift 383 | JavaScript Books 384 | Monads 385 | RubyMotion 386 | ActiveRecord 387 | Libcloud 388 | Open Graph 389 | CasperJS 390 | SFINAE (Substitution Failure is not an Error) 391 | Ruby vs. Groovy 392 | V8 (JavaScript engine) 393 | FP Complete 394 | Qt (framework) 395 | TestNG 396 | Subversion 397 | Browser Cookies 398 | D (programming language) 399 | Brogramming 400 | Web Architects 401 | Meta Tags 402 | Semantic Advertising 403 | Gremlin 404 | PeakStream 405 | Web Development Educational Resources 406 | Techtic Solutions 407 | CSS3 Animations 408 | Dasein Cloud 409 | JSLint 410 | Dynamic Code Analysis 411 | ECMAScript Classes 412 | Code Composer Studio (CCS) 413 | Dapper 414 | Gensim 415 | Learning Scala 416 | LayerVault 417 | Amiral Agence Web 418 | LLVM 419 | Visual Basic for Applications (VBA) 420 | GitHub Pages 421 | gitignore 422 | ELF 423 | Unladen Swallow 424 | EJB 425 | Building Social Networking Sites 426 | Syllabontes 427 | Erepublik 428 | Celery (distributed task queue) 429 | Zope 430 | RPython 431 | Real World Haskell (2008 book) 432 | Programming Frameworks 433 | C vs. C++ 434 | Web Consultants 435 | Compute Unified Device Architecture (CUDA) 436 | char (data type) 437 | GitHub Raises Venture Capital (July 2012) 438 | Avro (software) 439 | CSS Sprites 440 | Twitter API 441 | Chrome Frame 442 | R versus Python 443 | Crocodoc 444 | Type Theory 445 | Learning Delphi 446 | Claim Soluciones 447 | WebPageTest 448 | Google Earth API 449 | Java Libraries 450 | Lift (web framework) 451 | Racket (programming language) 452 | Coding Conventions 453 | Java Developers 454 | GCC (compiler) 455 | Mobile Recharge API 456 | Assembly Language 457 | Node.js Web Frameworks 458 | web.py 459 | JavaScript Frameworks 460 | Java Platform, Enterprise Edition 461 | Functional Programming in Scala (2014 book) 462 | Lisp (programming language) 463 | TypeScript 464 | Dbpedia 465 | Web Testing Framework 466 | string (data type) 467 | 2600hz 468 | ZeroMQ 469 | PHP Frameworks 470 | Programming Syntax 471 | Compiler Optimization 472 | FORTRAN (programming language) 473 | Grok 474 | Pinterest API 475 | BigDecimal 476 | Native Extensions for Microsoft Silverlight 477 | Datomic 478 | Moonstalk 479 | Object Inheritance 480 | Matplotlib 481 | Routes (Software) 482 | Opa 483 | QBasic 484 | Common Data Types in Computer Programming 485 | Typica 486 | Kivy 487 | Agda 488 | Netduino 489 | Groupcache 490 | MongoKit 491 | Google Maps API 492 | Learning Lisp 493 | China's Selection Process for IOI 494 | LOGO (programming language) 495 | Lithium Framework 496 | Major Concepts in Computer Programming 497 | Processing (programming language) 498 | India's Selection Process for IOI 499 | OCaml (programming language) 500 | RabbitMQ 501 | Mirah 502 | Dryad 503 | XHTML 504 | Web Development on Mac OS X 505 | Breezi 506 | Cramp 507 | Bluestar Applications 508 | Types of Computer Programming 509 | Uber API 510 | C++ (programming language) 
511 | Computer Programming 512 | Protocol Buffers 513 | Dart (programming language) 514 | SymPy 515 | Mecury (programming language) 516 | nginx 517 | Git Merge Tools 518 | Syntactic Sugar (programming) 519 | Chess Programming 520 | Stackless Python 521 | Sphinx (Python documentation generator) 522 | Python Versions 523 | Void Pointer 524 | APIs, How To 525 | Hyperlinks 526 | Java Specifications 527 | Ruby on Rails Professionals 528 | SproutCore 529 | jemalloc 530 | CodeChef 531 | Bulbs (programming library) 532 | Ruby on Rails Plugins 533 | HTML5 Document Viewer 534 | Revision Control Systems 535 | Apple Swift (programming language) 536 | RESTful APIs 537 | CoffeeScript 538 | Open APIs 539 | Attract Group 540 | Foursquare API 541 | Hour of Code 542 | Table Tags 543 | CouchDB 544 | Criticism of PHP 545 | Selenium (testing framework) 546 | WebFont Loader 547 | 3scale 548 | CloudStack API 549 | Prototypal Inheritance 550 | Web APIs 551 | Ruby 1.9 552 | Memoization 553 | GitEnterprise 554 | Online Programming Bootcamps 555 | Read-Eval-Print Loops 556 | DreamFace Interactive 557 | Alembic 558 | Programming Language Design 559 | Pandas (Python Library) 560 | Java Applets 561 | Zero-day Attacks 562 | Anti-Patterns 563 | Windows Presentation Foundation 564 | Rubber Duck Debugging 565 | Webix 566 | Programming Language Adoption 567 | SML/NJ 568 | Ruby on Rails 4 569 | eZ Publish 570 | Functional Programming Principles in Scala (Coursera course) 571 | D.Labs 572 | WOEID 573 | LaCroix Design Company 574 | MooTools 575 | WSGI 576 | This (programming concept) 577 | Intel Acquires Mashery (April 2013) 578 | Tag Management 579 | Pygame 580 | OpenGL ES 2.0 581 | Adodb 582 | Eiffel (programming language) 583 | jQuery Plugins 584 | GitHub 585 | Ember.js 586 | Node.io 587 | list (data type) 588 | humans.txt 589 | BASIC (programming language) 590 | Wand (ImageMagick binding) 591 | JSON 592 | Twilio Revenue 593 | Gennady Korotkevich (competitive programmer) 594 | Enums 595 | XML 596 | Starter League 597 | Java Virtual Machine (JVM) 598 | Learning Perl 599 | OpenGL on iOS 600 | Garbage Collection (programming) 601 | HotSpot (JVM) 602 | Linus Torvalds 603 | Typesafe (company) 604 | Web IM 605 | Qt Quick 606 | FDT 607 | Scalding 608 | Tornado (web framework) 609 | Flask (Python framework) 610 | Programming for Kids 611 | CSS Frameworks 612 | oXygen XML 613 | Programming in C++ 614 | DataNucleus 615 | SMS API 616 | Programming Competitions 617 | Freebase 618 | Gosu (programming language) 619 | Java (programming language) 620 | Aspect-Oriented Programming 621 | Pointers in Structures 622 | malloc 623 | Core Data 624 | Facebook Hacker Cup 625 | Steak (Ruby gem) 626 | Scheme (programming language) 627 | GeoCouch 628 | Darcs 629 | Concurrency (computer science) 630 | Hackbright Academy 631 | JATS (Journal Article Tag Suite) 632 | Object-Oriented Programming 633 | Andrew Tridgell 634 | Mobile UI Design 635 | Jython 636 | Title Tags 637 | Memory Management (computer programming) 638 | Google's Polymer 639 | Learning PHP 640 | Constraint Programming 641 | Facebook Graph API 642 | Struts (for web apps) 643 | Delphi (programming language) 644 | mod_wsgi 645 | The Public Knowledge Workshop (NGO, Israel) - 646 | Node Version Management 647 | GPGPU 648 | Web Developers 649 | Web Components 650 | Programming Language Comparisons 651 | CodeGuard 652 | HTML5 653 | Containment (programming) 654 | Perl 6 655 | Bloc 656 | Jasmine (framework) 657 | Blueprint (CSS Framework) 658 | HTML Tags 659 | WaveMaker 660 | Learning MATLAB 661 | 
Programming Languages 662 | MAPI 663 | Memory Management Units 664 | Debuggers 665 | Windows Communication Foundation 666 | wxPython 667 | NotifyMyAndroid 668 | Padrino 669 | Visual FoxPro 670 | Apache Tapestry 671 | Plone 672 | CodeUnion 673 | phpMyAdmin 674 | Devise (Rails authentication Framework) 675 | Google Hosted Libraries 676 | Metaprogramming 677 | Python Web Frameworks 678 | PHP Libraries 679 | Servlets 680 | Message Queuing 681 | Famo.us (JavaScript Framework) 682 | Learning Visual Basic 683 | Apple FaceTime API 684 | Programming Bootcamps in New York 685 | AspectJ 686 | Interweb Systems 687 | The Coder Factory 688 | Python GIL 689 | STL (C++) 690 | VHDL 691 | AppleScript 692 | GitHub Gists 693 | Excel VBA 694 | Facebook API 695 | Study of Computer Programming 696 | Visual Programming 697 | MLton 698 | Front-End Web Development 699 | Bootcamps.in 700 | Semantic Annotation 701 | Django (web framework) 702 | CommonJS 703 | Xapian 704 | PySide 705 | Link Rot 706 | libc 707 | Apprentice.io 708 | Grails 709 | Cascading 710 | Algorithms for Competitive Programming 711 | ECMAScript Operators 712 | Standard ML 713 | AJAX 714 | Learning JavaScript 715 | OrientDB 716 | JSON Web Token 717 | CakePHP 718 | DerbyJS 719 | Learning to Build Websites 720 | Nilecode 721 | DataSift (product) 722 | Unicorn (Ruby gem) 723 | Concatenative Programming Languages 724 | Visual Impact Systems 725 | Scripting (programming) 726 | Email API 727 | Wakanda Server 728 | Django 1.3 729 | CSS Lint 730 | NLP API 731 | Core Java 732 | KML (File Format) 733 | Rails Rumble 734 | JUnit 735 | The Software Guild 736 | Github-Fi 737 | Twitter Cards 738 | Specific Programming Languages 739 | Cascading Style Sheets 740 | Scala 741 | scikit-learn 742 | Koding 743 | Hardware Description Languages (HDL) 744 | PostScript 745 | The Echo Nest 746 | DevBatch - Mobile Apps Development Company 747 | Haxe 748 | Learning Haskell 749 | Path API 750 | Compilers 751 | Ceylon (programming language) 752 | Spin Locks 753 | ooc (programming language) 754 | PyPy 755 | Tumult Hype 756 | Twilio Connect 757 | MQTT 758 | Studio Kudos 759 | Scrapinghub 760 | Windows Identity Foundation 761 | ECMAScript Versions 762 | Sphere Online Judge (SPOJ) 763 | Class (programming) 764 | WITSML 765 | SEO Beaver | Beaver Marketing Inc 766 | Inter-Process Communication 767 | Indirection 768 | Error Messages 769 | Application Programming Interfaces (API) 770 | Plotly 771 | eZ Flow 772 | Gemstone Object Database 773 | MagLev (Ruby interpreter) 774 | Gunicorn 775 | LiveNode 776 | JDBC 777 | Audio Tags 778 | NumPy 779 | Ruby vs. 
Python 780 | reStructuredText 781 | Node.js 782 | Domain-Specific Languages 783 | Mashape 784 | goto statement (programming) 785 | Xoops Engine 786 | Prototype-based Programming 787 | Type Inference 788 | Hour of Code 2014 789 | AlchemyAPI 790 | Plivo SMS API 791 | Programming Bootcamps in the San Francisco Bay Area 792 | Mako 793 | Haskell (programming language) 794 | Flot (JavaScript library) 795 | Programming Loops 796 | Guice 797 | Java Interview Questions 798 | GitHub for Mac 799 | Rascal 800 | Bobik Scraping Service 801 | Raphael-js 802 | Express (Node.js web framework) 803 | Sass (stylesheet language) 804 | Mediawiki API 805 | IDL (programming language) 806 | International Olympiad in Informatics (IOI) 807 | Python Imaging Library 808 | theLIFT 809 | CodeIgniter 810 | Twilio API 811 | Human-powered APIs 812 | Silex 813 | SWFObject 814 | Systems Programming 815 | ScraperWiki 816 | ACM-ICPC 2016 817 | Learning Specific Programming Languages 818 | Web Development 819 | CodeBenders 820 | Specific Problems in I 821 | React (JS Library) 822 | Brokly API 823 | Semantic Wiki 824 | Multilingual Websites 825 | C# (programming language) 826 | Learning R 827 | Simplify, Advance (company) 828 | QuickBASIC 829 | OOCSS 830 | SailsJS 831 | Alley Interactive 832 | Programming Interviews 833 | Python Libraries 834 | Regular Expressions in Programming Languages 835 | GDB 836 | Coroutines 837 | Functional Programming Languages 838 | PHP Classes 839 | Learning FORTRAN 840 | Plone Products 841 | TensorFlow (software library) 842 | Learning Node.js 843 | Learning Assembly 844 | Cake Development Corporation 845 | int (data type) 846 | Software Bugs 847 | Dev Bootcamp 848 | Semantic Web 849 | Lightstreamer 850 | Learning Python 851 | pip 852 | Funding of Github 853 | Extreme Programming (XP) 854 | CartoDB 855 | Semaphores 856 | CPython 857 | Is There an API for X? 
858 | float (data type) 859 | Specific Problems in International Olympiad in Informatics 860 | HTML5 Boilerplate 861 | Common Lisp 862 | Decompilation 863 | Python Implementations 864 | Archetype Definition Language (ADL) 865 | Lcuma Labs (company) 866 | Streaming APIs 867 | Programming Paradigms 868 | Web Architecture 869 | MacRuby 870 | Laravel 871 | Server-Side JavaScript 872 | Active Admin 873 | XML Literals 874 | NDjango 875 | jQuery UI 876 | RSpec 877 | Java Native Interface 878 | SciPy 879 | Multiple Inheritance 880 | Perl (programming language) 881 | Enterprise Message Bus 882 | Java Message Service (JMS) 883 | Mobile Web Development 884 | Memory Leaks (computer programming) 885 | Symfony 886 | Apache Wicket 887 | Apache Hive 888 | Automatic Memoization 889 | Bazaar DVCS 890 | Twitter Firehose 891 | Worldmate 892 | Pyglet 893 | Imperative Programming 894 | Sockets 895 | -------------------------------------------------------------------------------- /quora/Project_Quora/Project_Quora/spiders/topic/topic.py: -------------------------------------------------------------------------------- 1 | import time 2 | import codecs 3 | import platform 4 | import sys 5 | from selenium.webdriver.common.by import By 6 | from selenium.webdriver.support.ui import WebDriverWait 7 | from selenium.webdriver.support import expected_conditions as ec 8 | from selenium import webdriver 9 | from selenium.common.exceptions import NoSuchElementException 10 | 11 | 12 | class Topic(object): 13 | # Arguments passed through the batch file topic.bat 14 | email, passw, url = sys.argv[1:] 15 | 16 | # Opening PhantomJS webdriver 17 | options = ['--proxy-type=none'] 18 | if "Windows" == platform.system(): 19 | driver = webdriver.PhantomJS('..\phantomjs.exe', service_args=options) 20 | else: 21 | driver = webdriver.PhantomJS(executable_path='../phantomjs', 22 | service_args=options) 23 | wait = WebDriverWait(driver, 60) 24 | 25 | # Access to Quora and Login 26 | driver.get("http://www.quora.com/") 27 | driver.refresh() 28 | time.sleep(2) 29 | 30 | print ('Login to Quora..') 31 | while True: 32 | # Entering your username and password 33 | form = driver.find_element_by_class_name('login') 34 | 35 | username = form.find_element_by_name('email') 36 | username.send_keys(email) 37 | time.sleep(2) 38 | password = form.find_element_by_name('password') 39 | password.send_keys(passw) 40 | 41 | time.sleep(2) 42 | form.find_element_by_xpath( 43 | ".//input[contains(@value, 'Login')]").click() 44 | time.sleep(2) 45 | 46 | try: 47 | if driver.find_element_by_css_selector( 48 | 'div[id*="_error"]').is_displayed(): 49 | driver.refresh() 50 | print ('Login Error.Retry') 51 | email = raw_input("Insert username: ") 52 | passw = raw_input("Insert password: ") 53 | except NoSuchElementException: 54 | break 55 | 56 | # Open Section Organize of a Topic 57 | while True: 58 | try: 59 | driver.get(url) 60 | if driver.find_element_by_xpath( 61 | '//div[contains(@class, "TopicNavigationChildTree' + 62 | ' section_top")]').is_displayed(): 63 | break 64 | except Exception: 65 | print ('Error, page not avaible or wrong url') 66 | url = raw_input("Re-Insert URL-ORGANIZE_TOPIC:") 67 | 68 | filename = url.replace('https://www.quora.com/topic/', '') 69 | filename = filename.replace('/organize', '') 70 | filename += ".txt" 71 | target = codecs.open(filename, 'w+', encoding='utf-8') 72 | target.truncate() 73 | 74 | top = driver.find_element_by_xpath( 75 | '//div[contains(@class, "TopicNavigationChildTree section_top")]') 76 | topics = 
top.find_elements_by_xpath( 77 | './/span[contains(@class, "TopicNameSpan TopicName")]') 78 | show_more_list = top.find_elements_by_xpath( 79 | '//div[contains(@class, "TopicTreeItemToggled SimpleToggle Toggle")]' + 80 | '//small/span[not(contains(@class,"hidden"))]' + 81 | '/a[contains(text(), "Show ")]') 82 | 83 | # Expansion of the hierarchy of topics with Selenium 84 | while True: 85 | 86 | if len(show_more_list) > 0: 87 | 88 | for elem in show_more_list: 89 | driver.execute_script("arguments[0].scrollIntoView(true);", 90 | elem) 91 | driver.execute_script("window.scrollBy(0,-250);") 92 | time.sleep(0.5) 93 | 94 | # Click on "Show more" button 95 | webdriver.ActionChains(driver).move_to_element(elem).click( 96 | elem).perform() 97 | wait.until(ec.invisibility_of_element_located( 98 | (By.CLASS_NAME, 'loading'))) 99 | 100 | while len(topics) == len(top.find_elements_by_xpath( 101 | './/span[contains(@class, "TopicNameSpan TopicName")]')): 102 | time.sleep(1) 103 | time.sleep(2) 104 | 105 | print "Topic found: " + str(len(driver.find_elements_by_xpath( 106 | '//div[contains(@class, "TopicNavigationChildTree ' + 107 | 'section_top")]//span[contains(@class, ' + 108 | '"TopicNameSpan TopicName")]'))) 109 | 110 | show_more_list = top.find_elements_by_xpath( 111 | '//div[contains(@class, "TopicTreeItemToggled ' 112 | 'SimpleToggle Toggle")]//small/' + 113 | 'span[not(contains(@class,"hidden"))]' + 114 | '/a[contains(text(), "Show ")]') 115 | 116 | print "Other " + str(len(show_more_list)) + " to expand" 117 | else: 118 | break 119 | 120 | topics = top.find_elements_by_xpath( 121 | './/span[contains(@class, "TopicNameSpan TopicName")]') 122 | topics_text = [] 123 | 124 | print ('Please Wait..') 125 | for topic in topics: 126 | topics_text.append(topic.text.encode('ascii', 'ignore')) 127 | 128 | print ('Number of different Topic: ' + str(len(set(topics_text)))) 129 | 130 | print ('Writing on file the list of Topic..') 131 | for topic in set(topics_text): 132 | target.write(topic + '\n') 133 | 134 | print ('Finish') 135 | 136 | target.close() 137 | driver.close() 138 | -------------------------------------------------------------------------------- /quora/Project_Quora/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = Project_Quora.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = Project_Quora 12 | -------------------------------------------------------------------------------- /quora/README.md: -------------------------------------------------------------------------------- 1 | # Quora Scraper 2 | A Python script for downloading the questions and answers available on Quora and storing them in a database. 3 | Specifically, it focuses on extracting the questions and answers of a Quora topic. 4 | 5 | # How does it work 6 | This project contains two different scripts: 7 | * `topic.py` 8 | The smaller part of the project, which allows the scraper to get the list of sub-topics of a particular topic. 9 | In this way the scraper stays within the topics related to the starting Quora topic. 10 | For example, see the Organize section of the [Computer Programming](https://www.quora.com/topic/Computer-Programming/organize) topic and its topic hierarchy. 
11 | * `quora.py` 12 | More substantial than the previous script. It parses questions and answers while always remaining within the related topics. 13 | It is based on Scrapy, which issues the requests to parse question threads, and on the Selenium WebDriver framework for web automation. 14 | By combining these two frameworks it is possible to obtain a large number of questions and answers, useful for studying and analyzing the contents of Quora. 15 | 16 | # Installation 17 | 1. Download the content of this directory 18 | 2. Install all the requirements with: `pip install -r requirements.txt` 19 | 3. Download [PhantomJS](http://phantomjs.org/) (for Windows or OSX) and unzip 20 | 4. Move `phantomjs.exe`(Windows) or `phantomjs`(OSX) into the `spiders` directory 21 | 22 | # Getting Started 23 | 1. Start the first launcher, `topic.bat`, which takes as a parameter the organize-URL of the topic. 24 | This launcher lets you obtain, in a .txt file, a list of sub-topics of a given topic. 25 | 2. Start the second launcher, `quora.bat`, which activates the scraping and produces a database and a JSON file with all the items. 26 | This launcher takes as a parameter the name of the database in which to save the extracted items. 27 | 28 | Both scripts (topic.py and quora.py) require you to be logged in to work. Therefore, you will be asked for the username and password of a Quora account when you execute either launcher. 29 | 30 | # Notes 31 | The `topic` directory of this project already contains a list of topics related to [Computer Programming](https://www.quora.com/topic/Computer-Programming) in a .txt file. 32 | So you may directly execute `quora.bat` to obtain a database of questions and answers related to the Computer Programming topic on Quora. 33 | As time passes, however, this list of sub-topics may be updated by Quora, so it would be useful to re-run `topic.bat` in the future. 34 | -------------------------------------------------------------------------------- /quora/quora.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | CLS 3 | 4 | IF "%~1"=="-h" GOTO Help 5 | 6 | :begin 7 | ECHO 1.Help 8 | ECHO 2.Insert Parameters 9 | ECHO. 10 | 11 | CHOICE /C 12 /M "Enter your choice:" 12 | 13 | :: Note - list ERRORLEVELS in decreasing order 14 | IF ERRORLEVEL 2 GOTO Param 15 | IF ERRORLEVEL 1 GOTO Help 16 | 17 | :Help 18 | ECHO List of Parameters: 19 | ECHO first:'EMAIL' related to Quora account 20 | ECHO second:'PASSWORD' related to Quora account 21 | ECHO third:'DB' choose a name for your database of items 22 | ECHO.
23 | GOTO begin 24 | 25 | :Param 26 | SET /P EMAIL=Enter EMAIL: 27 | SET /P PASSW=Enter PASSWORD: 28 | SET /P DB=Enter database: 29 | 30 | 31 | cd Project_Quora 32 | cd Project_Quora 33 | cd spiders 34 | scrapy crawl quora -a database=%DB% -a email=%EMAIL% -a password=%PASSW% 35 | pause 36 | 37 | -------------------------------------------------------------------------------- /quora/requirements.txt: -------------------------------------------------------------------------------- 1 | cffi==1.2.1 2 | characteristic==14.3.0 3 | cryptography==1.0.1 4 | cssselect==0.9.1 5 | enum34==1.0.4 6 | html2text==2015.6.21 7 | idna==2.0 8 | ipaddress==1.0.14 9 | parsedatetime==1.5 10 | pyasn1==0.1.8 11 | pyasn1-modules==0.0.7 12 | pycparser==2.14 13 | PyDbLite==3.0.2 14 | pyOpenSSL==0.15.1 15 | pytz==2015.6 16 | pywin32==219 17 | queuelib==1.4.2 18 | Scrapy==1.0.3 19 | selenium==2.47.3 20 | service-identity==14.0.0 21 | six==1.9.0 22 | Twisted==15.4.0 23 | virtualenv==13.1.2 24 | w3lib==1.12.0 25 | wheel==0.26.0 26 | zope.interface==4.1.2 27 | -------------------------------------------------------------------------------- /quora/topic.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | CLS 3 | 4 | IF "%~1"=="-h" GOTO Help 5 | 6 | :begin 7 | ECHO 1.Help 8 | ECHO 2.Insert Parameters 9 | ECHO. 10 | 11 | CHOICE /C 12 /M "Enter your choice:" 12 | 13 | 14 | :: Note - list ERRORLEVELS in decreasing order 15 | IF ERRORLEVEL 2 GOTO Param 16 | IF ERRORLEVEL 1 GOTO Help 17 | 18 | :Help 19 | ECHO List of Parameters: 20 | ECHO first:'EMAIL' related to Quora account 21 | ECHO second:'PASSWORD' related to Quora account 22 | ECHO third:'URL' url of Organize-Topic to obtain the list of topics (Example:https://www.quora.com/topic/Computer-Programming/organize) 23 | ECHO. 24 | GOTO begin 25 | 26 | :Param 27 | SET /P EMAIL=Enter EMAIL: 28 | SET /P PASSW=Enter PASSWORD: 29 | SET /P URL=Enter URL-ORGANIZE_TOPIC (Example:https://www.quora.com/topic/Computer-Programming/organize): 30 | 31 | cd Project_Quora 32 | cd Project_Quora 33 | cd spiders 34 | cd topic 35 | python topic.py %EMAIL% %PASSW% %URL% 36 | pause -------------------------------------------------------------------------------- /scn/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # SAP Community Network scraper 4 | -------- 5 | An implementation of a scraper that extracts items from each permissible discussion of SCN platform by scanning each page of ["ABAP Development"](http://scn.sap.com/community/abap/content?filterID=contentstatus[published]~objecttype~objecttype[thread]) category. 6 | 7 | Because of the problems caused to uploads of several contents, the software is subject to errors caused by loading page. 8 | Therefore it was thought to implement a mechanism for saving the state of execution, to retrieve it again from where it stopped. 
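In practice, the resume mechanism boils down to persisting the index of the last page scraped and reading it back at start-up. The following is only a minimal sketch of that idea, mirroring what `main.py` and `dataStoring.py` (described below) actually do:

```python
# Minimal sketch of the resume mechanism: the page index is written to a plain
# text file after every page, so a later run can restart from where it stopped.
import os.path

INDEX_FILE = 'scnscraper/index.txt'

def read_index():
    # First run: no index file yet, so start from page 2 as main.py does.
    if not os.path.exists(INDEX_FILE):
        return 2
    with open(INDEX_FILE) as f:
        return int(f.readline())

def write_index(i):
    with open(INDEX_FILE, 'w') as f:
        f.write(str(i))
```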
9 | 10 | ### Version 11 | 2.0 12 | 13 | ### How does it work 14 | There is one main script that contains the core of the scraper: 15 | - `scraper.py` 16 | 17 | and two support scripts: 18 | - `main.py` 19 | - `dataStoring.py` 20 | 21 | ##### `scraper.py` 22 | It takes the `STARTURL` as input and, with [Selenium](http://www.seleniumhq.org/) support, runs three phases: 23 | * verify that the content of the page (the threads) has been loaded, refreshing the page until it is; 24 | * count the links that need to be considered, excluding links of discussions that are not 25 | marked as 'answered' or 'not answered' and links of discussions that may raise problems; 26 | * for each discussion in the page, extract all the questions and answers and store them in a data structure; 27 | 28 | ##### `main.py` 29 | The program starts from this script, which reads the `PAGE INDEX` from a file to start the scraping process; 30 | on the first execution the program starts from page 2 and, for each page, updates the index file with the current `PAGE INDEX`; 31 | on subsequent executions it loads the `PAGE INDEX` from the index file and resumes from the last page. 32 | 33 | After loading the current state of execution, it builds the `STARTURL` from the `PAGE INDEX` and passes it to the scraper. 34 | After calling the scraper, it saves the extracted threads into a ".json" file and into a "pdl" ([PyDbLite](http://www.pydblite.net/en/)) file, and repeats the process. 35 | 36 | ##### `dataStoring.py` 37 | It provides mechanisms to store the extracted data into the ".json" and "pdl" ([PyDbLite](http://www.pydblite.net/en/)) files without overwriting the existing content, 38 | and to read and update the "index.txt" file containing the `PAGE INDEX`. 39 | 40 | ### Installation 41 | 1. Download the content of this directory 42 | 2. Install all the requirements with: `pip install -r requirements.txt` 43 | 3. Download [PhantomJS](http://phantomjs.org/) (for Windows or OSX) and unzip 44 | 4. Move `phantomjs.exe`(Windows) or `phantomjs`(OSX) into the `scnscraper` directory 45 | 46 | ### Getting Started 47 | To start the software, execute the `Run.bat` file in the main directory. It provides 2 execution alternatives: 48 | 49 | * New Execution, to start a new scraping process or to restart the execution from scratch. Be careful not to choose this command right after a data extraction, 50 | because it deletes the output files that contain the extracted threads. 51 | * Resume Execution, which resumes the execution from where it left off in the last run. 52 | 53 | ### Endnotes 54 | SAP Community Network poses many problems from the point of view of the scraping process. 55 | 56 | - It is a very slow platform in loading discussions from the server db, which causes a continuous loop of refreshes until the content loads; 57 | - It is a very heavy site that contains a lot of scripts and content that slow down the loading of web pages, 58 | causing selectors to miss content and raising exceptions at runtime. 59 | 60 | For this reason, the program often ends its run with errors; this is why the saving and loading process was implemented, 61 | to restart the execution from the last step. For an optimal execution we suggest a good Internet connection speed. 62 | 63 | *... 
HAPPY SCRAPING!* 64 | -------------------------------------------------------------------------------- /scn/RUN.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | CLS 3 | 4 | IF "%~1"=="-h" GOTO Help 5 | 6 | :begin 7 | ECHO. 8 | ECHO. 9 | ECHO ---- SCN Scraper ---- 10 | ECHO. 11 | ECHO 1. NEW EXECUTION 12 | ECHO 2. RESUME EXECUTION 13 | ECHO 3. HELP 14 | ECHO. 15 | CHOICE /C 123 /M "Enter your choice: " 16 | 17 | :: Note - list ERRORLEVELS in decreasing order 18 | IF ERRORLEVEL 3 GOTO Help 19 | IF ERRORLEVEL 2 GOTO Resume 20 | IF ERRORLEVEL 1 GOTO New 21 | 22 | :Help 23 | ECHO. 24 | ECHO -- HELP -- 25 | ECHO. 26 | ECHO - If you would begin a new scraping process, press [1] 27 | ECHO. 28 | ECHO - If you want to delete the saved data of a previous execution beginning a new one, press [1] 29 | ECHO. 30 | ECHO - If you want to load a previous execution from the last page scraped, press [2]. 31 | ECHO. 32 | pause 33 | GOTO begin 34 | 35 | :Resume 36 | c:\python27\python.exe "%~dp0scnscraper\main.py" %* 37 | 38 | :New 39 | if exist "%~dp0scnscraper\abap.pydb" ( 40 | del "%~dp0scnscraper\abap.pydb" 41 | del "%~dp0scnscraper\abap.json" 42 | del "%~dp0scnscraper\index.txt" ) 43 | :: Edit index file with start URL PAGE 44 | c:\python27\python.exe "%~dp0scnscraper\main.py" %* 45 | PAUSE 46 | 47 | 48 | -------------------------------------------------------------------------------- /scn/discretizer/RUN.bat: -------------------------------------------------------------------------------- 1 | 2 | c:\python27\python.exe "%~dp0scn_discretizer.py" all 3 | pause 4 | -------------------------------------------------------------------------------- /scn/discretizer/discretization.py: -------------------------------------------------------------------------------- 1 | """ 2 | Compatible with Python 2 and Python 3 3 | """ 4 | 5 | import csv 6 | import logging 7 | import os 8 | import re 9 | from math import log 10 | 11 | from dateutil.parser import parse as parse_date 12 | from nltk import FreqDist 13 | from nltk import PorterStemmer 14 | from nltk import tokenize 15 | from nltk import word_tokenize 16 | from pydblite.pydblite import Base 17 | 18 | 19 | class Discretizer: 20 | logging.basicConfig(level=logging.DEBUG) 21 | logger = logging.getLogger(__name__) 22 | linesep = '\n' 23 | 24 | def __init__(self, db_name, db_files): 25 | self.db_name = db_name 26 | self.db_files = db_files 27 | self.db = dict() 28 | 29 | def log(self, msg, level=logging.DEBUG): 30 | self.logger.log(level, msg) 31 | 32 | def load_db(self, check=True, fix=False, save_to_file=False): 33 | self.log('Opening {0} database(s)'.format(len(self.db_files)), logging.INFO) 34 | for db_name, db_file in self.db_files.items(): 35 | _db = Base(db_file, save_to_file=save_to_file) 36 | _db.open() 37 | self.log('Database {0} opened, records #: {1}'.format(db_name, len(_db)), logging.DEBUG) 38 | self.db.update({db_name: _db}) 39 | _db.create_index('uid') 40 | _db.create_index('type') 41 | self.log("Db {0}: printing simple strawman prediction accuracy for answers with max upvotes as best answer:".format(db_name), logging.INFO) 42 | self._strawman(_db) 43 | if check is True: 44 | self.check_db(fix) 45 | 46 | """ 47 | * fix answers_count with actual # of answers exported 48 | * if an answer has tag != N/A, the tags must be applied to the question in the same thread 49 | * if a question is marked as resolved True, then one of the answers in the thread must have been marked as solution; 50 | and viceversa; 
51 | * check if Q or A text is '' 52 | * turn question uid from int to unicode string 53 | """ 54 | 55 | def check_db(self, fix=False): 56 | self.log('Checking consistency for databases.', logging.INFO) 57 | for name, _db in self.db.items(): 58 | for question in _db._type['Question']: 59 | expected_answers_count = int(question['answers']) 60 | actual_answers_count = 0 61 | for i in range(1, expected_answers_count + 1): 62 | try: 63 | _db._uid[question['uid'][:-1] + str(i)][0] 64 | actual_answers_count += 1 65 | except IndexError: 66 | break 67 | if actual_answers_count < expected_answers_count: 68 | self.log('Fixing answers count mismatch in thread id {0}, expected {1}, found {2}'. 69 | format(question['uid'], expected_answers_count, actual_answers_count)) 70 | _db.update(question, answers=actual_answers_count) 71 | 72 | for record in (_db('text') == ''): 73 | self.log('Warning on record {0} from db {1}: empty text!'.format(record['uid'], name), 74 | logging.WARNING) 75 | 76 | for record in (_db('type') == 'Answer') & (_db('tags') != 'null'): 77 | self.log('Warning on record {0} from db {1}: tags in answer!'.format(record['uid'], name), 78 | logging.WARNING) 79 | question_uid = record['uid'].split('.')[0] 80 | question = _db._uid[question_uid][0] 81 | question_tags = question['tags'] + '.' + record['tags'] 82 | _db.update(question, tags=question_tags) 83 | 84 | if fix is True: 85 | _db.commit() 86 | 87 | def load_threads(self): 88 | self.log('Loading threads from {0} db(s)'.format(len(self.db_files)), logging.INFO) 89 | overall_threads = list() 90 | for name, _db in self.db.items(): 91 | db_threads = list() 92 | questions = _db._type['Question'] # use db index 93 | self.log('Loaded {0} questions (threads) from db {1}, attaching answers...'.format(len(questions), name), 94 | logging.DEBUG) 95 | for question in questions: 96 | answers = self._get_answers(question['uid'], int(question['answers']), _db) 97 | db_threads.append({'question': question, 'question_uid': question['uid'], 98 | 'date_time': question['date_time'], 'answers_count': question['answers'], 99 | 'resolved': question['resolve'], 'tags': question['tags'], 'answers': answers}) 100 | 101 | overall_threads.extend(db_threads) 102 | self.log('Overall threads loaded: {0} from {1} database(s)'.format(len(overall_threads), len(self.db_files))) 103 | return overall_threads 104 | 105 | def _get_answers(self, question_id, answers_count, _db): 106 | self.log('Getting {0} answers for thread id {1}'.format(answers_count, question_id), logging.DEBUG) 107 | answers = list() 108 | if answers_count > 0: 109 | for i in range(1, answers_count + 1): 110 | answer_id = str(question_id)[:-1] + str(i) 111 | for answer in (_db._uid[answer_id]): # use index 112 | answers.append(answer) 113 | if answers_count != len(answers): 114 | self.log('Warning in thread id {0}: loaded {1} answers, expected {2}. 
Please, run a check db with ' 115 | 'fix=True'.format(question_id, len(answers), answers_count), 116 | logging.WARNING) 117 | return answers 118 | 119 | @staticmethod 120 | def _strawman(_db): 121 | # assumes index on uid already exists 122 | # db.create_index('uid') 123 | questions_with_answers = (_db("type") == 'Question') & (_db("answers") > 0) 124 | a = 0 125 | b = 0 126 | c = 0 127 | d = 0 128 | 129 | total_answer_count = 0 130 | for q in questions_with_answers: 131 | thread_answers = list() 132 | answers_count = int(q['answers']) 133 | total_answer_count += answers_count 134 | if answers_count > 0: 135 | for i in range(1, answers_count + 1): 136 | answer_id = q['uid'][:-1] + str(i) 137 | for answer in (_db._uid[answer_id]): # use index 138 | print(answer_id) 139 | thread_answers.append(answer) 140 | # compute upvotes 141 | max_upvote = 0 142 | for answer in thread_answers: 143 | if (answer['upvotes'] == '---'): 144 | count = 0 145 | else: 146 | count = int(answer['upvotes']) 147 | if count > max_upvote: 148 | max_upvote = count 149 | 150 | output = list() 151 | prediction = None 152 | for answer in thread_answers: 153 | if (answer['upvotes'] == '---'): 154 | count = 0 155 | else: 156 | count = int(answer['upvotes']) 157 | if count == max_upvote: 158 | prediction = 'solution' 159 | else: 160 | prediction = '' 161 | 162 | output.append((answer['uid'], answer['resolve'], prediction)) 163 | if prediction == 'solution' and answer['resolve'] == 'solution': 164 | a += 1 165 | if prediction == '' and answer['resolve'] == '---': 166 | b += 1 167 | if prediction == 'solution' and answer['resolve'] == '---': 168 | c += 1 169 | if prediction == '' and answer['resolve'] == 'solution': 170 | d += 1 171 | 172 | print(_db.name) 173 | print("a = {0} | b = {1}\nc = {2} | d = {3}".format(a, b, c, d)) 174 | print("Total answers %s" % total_answer_count) 175 | print("Accuracy {0}".format((float(a + b) / float(total_answer_count)))) 176 | 177 | def compute_features(self, threads, stemmed_vocabulary, distrib_matrix): 178 | self.log('Computing features. Please, wait. This will take some serious time...', logging.INFO) 179 | for thread in threads: 180 | self.log('Computing features for thread id {0}'.format(thread['question_uid']), logging.INFO) 181 | try: 182 | base_date = parse_date(thread['date_time']) 183 | except ValueError: 184 | base_date = parse_date('1970-01-01') 185 | except AttributeError: 186 | base_date = thread['date_time'] 187 | answers = thread['answers'] 188 | try: 189 | tag_list = thread['tags'].split('.') 190 | except AttributeError: 191 | tag_list = thread['tags'] # there is no '.' 
used as tag separator 192 | if '' in tag_list: 193 | tag_list.remove('') 194 | for answer in answers: 195 | # compute thread tags 196 | answer_tags = answer['tags'].split() 197 | if 'null' in answer_tags: 198 | answer_tags.remove('null') 199 | tag_list.extend(answer_tags) 200 | thread['tags'] = sorted(set(tag_list)) 201 | 202 | # compute len in chars and words 203 | alen = len(answer['text']) 204 | answer['len'] = alen 205 | wordcount = Discretizer._count_words(answer['text']) 206 | answer['wordcount'] = wordcount 207 | if wordcount == 0: 208 | answer['avg_chars_per_word'] = 0 209 | else: 210 | answer['avg_chars_per_word'] = "{0:.2f}".format(alen / float(wordcount)) # float with 2 decimals 211 | try: 212 | sentences = tokenize.sent_tokenize(answer['text'].decode('utf-8', 'replace').encode('ascii', 'replace'), 213 | language='english') 214 | except (AttributeError, TypeError) as e: 215 | sentences = tokenize.sent_tokenize(str(answer['text']), language='english') 216 | sentence_count = len(sentences) 217 | answer['sentences'] = sentence_count 218 | if sentence_count == 0: 219 | words_per_sentence = 0 220 | else: 221 | words_per_sentence = "{0:.2f}".format(wordcount / float(sentence_count)) 222 | answer['avg_words_per_sentence'] = words_per_sentence 223 | longest_sentence = 0 224 | for s in sentences: 225 | l = Discretizer._count_words(s) 226 | if l > longest_sentence: 227 | longest_sentence = l 228 | answer['longest_sentence'] = longest_sentence 229 | try: 230 | creation_date = parse_date(answer['date_time']) 231 | except AttributeError: 232 | creation_date = answer['date_time'] 233 | except Exception: 234 | print('\nInvalid date_time') 235 | time_difference = abs((creation_date - base_date).total_seconds()) 236 | answer['time_difference'] = time_difference 237 | 238 | #answer['upvotes'] = thread['upvotes'] 239 | 240 | # check for urls and code snippets 241 | match = re.search(r'http(s)?://', str(answer['text']), re.MULTILINE) 242 | if match: 243 | answer['has_links'] = True 244 | else: 245 | answer['has_links'] = False 246 | 247 | answer['has_code_snippet'] = self._has_codesnippet(str(answer['text'])) 248 | try: 249 | LL = Discretizer._log_likelihood(answer['text'].decode('utf-8', 'replace').encode('ascii', 'replace'), 250 | stemmed_vocabulary, distrib_matrix) 251 | except (AttributeError, TypeError) as e: 252 | LL = Discretizer._log_likelihood(str(answer['text']), stemmed_vocabulary, distrib_matrix) 253 | answer['loglikelihood'] = LL 254 | answer['loglikelihood_descending'] = LL 255 | answer['loglikelihood_ascending'] = LL 256 | try: 257 | aspw = Discretizer._ASPW(answer['text'].decode('utf-8', 'replace').encode('ascii', 'replace')) 258 | except (AttributeError, TypeError) as e: 259 | aspw = Discretizer._ASPW(str(answer['text'])) 260 | fk = Discretizer._FK(answer['avg_words_per_sentence'], aspw) 261 | answer['F-K'] = fk 262 | answer['F-K_descending'] = fk 263 | answer['F-K_ascending'] = fk 264 | 265 | # compute ranks 266 | #answers = Discretizer._sort_rank(answers, 'upvotes', reverse=True) 267 | answers = Discretizer._sort_rank(answers, 'sentences', reverse=True) 268 | answers = Discretizer._sort_rank(answers, 'len', reverse=True) 269 | answers = Discretizer._sort_rank(answers, 'views', reverse=True) 270 | answers = Discretizer._sort_rank(answers, 'wordcount', reverse=True) 271 | answers = Discretizer._sort_rank(answers, 'avg_chars_per_word', reverse=True) 272 | answers = Discretizer._sort_rank(answers, 'avg_words_per_sentence', reverse=True) 273 | answers = Discretizer._sort_rank(answers, 
'longest_sentence', reverse=True) 274 | answers = Discretizer._sort_rank(answers, 'time_difference', reverse=False) 275 | answers = Discretizer._sort_rank(answers, 'loglikelihood_descending', reverse=True) 276 | answers = Discretizer._sort_rank(answers, 'loglikelihood_ascending', reverse=False) 277 | answers = Discretizer._sort_rank(answers, 'F-K_descending', reverse=True) 278 | answers = Discretizer._sort_rank(answers, 'F-K_ascending', reverse=False) 279 | thread['answers'] = answers 280 | 281 | self.log('Done computing features for {0} threads'.format(len(threads)), logging.INFO) 282 | return threads 283 | 284 | @staticmethod 285 | def _ASPW(text): 286 | aspw = 0 287 | for word in text.split(): 288 | s = Discretizer._count_syllables(word) 289 | aspw += s 290 | return aspw 291 | 292 | @staticmethod 293 | def _count_syllables(word): 294 | vowels = ['a', 'e', 'i', 'o', 'u', 'y'] 295 | currentWord = list(word) 296 | numVowels = 0 297 | lastWasVowel = False 298 | for wc in currentWord: 299 | foundVowel = False 300 | for v in vowels: 301 | # don't count diphthongs 302 | if (v == wc) and lastWasVowel is True: 303 | foundVowel = True 304 | lastWasVowel = True 305 | break 306 | elif (v == wc) and lastWasVowel is False: 307 | numVowels += 1 308 | foundVowel = True 309 | lastWasVowel = True 310 | break 311 | 312 | # If full cycle and no vowel found, set lastWasVowel to false; 313 | if not foundVowel: 314 | lastWasVowel = False 315 | 316 | # Remove es, it's _usually? silent 317 | if (len(word) > 2) and (word[len(word)-2:] == "es"): 318 | numVowels -= 1 319 | # remove silent e 320 | elif (len(word) > 1) and (word[len(word)-1:] == "e"): 321 | numVowels -= 1 322 | return numVowels 323 | 324 | @staticmethod 325 | def _FK(awps, asps): 326 | fk = (0.39 * float(awps)) + (11.8 * float(asps)) - 15.59 327 | return fk 328 | 329 | @staticmethod 330 | def _log_likelihood(answer_text, stemmed_vocabulary, distrib_matrix): 331 | LL = 0 332 | if answer_text is not '': 333 | tokens = word_tokenize(str(answer_text), language='english') 334 | porter_stemmer = PorterStemmer() 335 | unique_wordcount = len(stemmed_vocabulary) 336 | """ 337 | per ogni w unica print_function words 338 | Cw = conta w in answer_text 339 | PwM = self.distrib_matrix[stemmer(w)] 340 | unique_wordcount = len(tokenize(answer_text) 341 | """ 342 | for w in tokens: 343 | _w = w.strip().lower() 344 | Cw = 0 345 | for _ in answer_text.split(): 346 | if _w == _.strip().lower(): 347 | Cw += 1 348 | 349 | try: 350 | w_stem = porter_stemmer.stem(_w.decode('utf-8', 'replace').encode('ascii', 'replace')) 351 | except AttributeError: 352 | w_stem = porter_stemmer.stem(_w) 353 | try: 354 | PwM = distrib_matrix[w_stem] 355 | except KeyError: # key error means frequency is equal to cutoff point 1 356 | PwM = 1 357 | LL += (Cw * log(float(PwM))) 358 | 359 | try: 360 | LL = "{0:.2f}".format(LL / float(unique_wordcount)) 361 | except ZeroDivisionError: 362 | LL = 0 363 | 364 | return LL 365 | 366 | @staticmethod 367 | def _count_words(text): 368 | wordcount = 0 369 | for word in text.split(): 370 | wordcount += 1 371 | return wordcount 372 | 373 | @staticmethod 374 | def _sort_rank(answers, key, reverse=True): 375 | try: 376 | new_list = sorted(answers, key=lambda x: float(x[key]), reverse=reverse) 377 | ranks = dict() 378 | for i in range(0, len(answers)): 379 | ranks[new_list[i]['uid']] = i + 1 380 | 381 | # fix rank ties 382 | for i in range(0, len(answers)-1): 383 | if new_list[i][key] == new_list[i+1][key]: 384 | ranks[new_list[i+1]['uid']] = 
ranks[new_list[i]['uid']] 385 | 386 | for k, v in ranks.items(): 387 | for a in answers: 388 | if a['uid'] == k: 389 | a['{0}_rank'.format(key)] = v 390 | except ValueError as e: 391 | logging.log(level=logging.ERROR, msg="Error computing rank for feature %s" % key) 392 | pass 393 | 394 | return answers 395 | 396 | def _has_codesnippet(self, text): 397 | code = False 398 | if re.search(r'({|}| package |\.jar| class | namespace |exception |<<| end | def |<\?php| soap | cutoff} 503 | return reduced 504 | 505 | def save_csv(self, threads): 506 | fout = '{0}_features.csv'.format(self.db_name) 507 | self.log('Saving features into {0}'.format(fout), logging.INFO) 508 | csvf = open(fout, 'wt') 509 | fields = ('resolved', 'question_uid', 'answers_count', 'answer_uid', 510 | 'date_time', 'time_difference', 'time_difference_rank', 'solution', 'len', 'len_rank', 'wordcount', 511 | 'wordcount_rank', 'avg_chars_per_word', 'avg_chars_per_word_rank', 'sentences', 'sentences_rank', 512 | 'avg_words_per_sentence', 'avg_words_per_sentence_rank', 'longest_sentence', 'longest_sentence_rank', 513 | 'views', 'views_rank', 'loglikelihood', 'loglikelihood_ascending_rank', 514 | 'loglikelihood_descending_rank', 'F-K', 'F-K_ascending_rank', 'F-K_descending_rank', 'upvotes', 515 | 'upvotes_rank', 'has_links', 'has_code_snippet', 'has_tags') 516 | writer = csv.DictWriter(csvf, dialect=csv.excel, fieldnames=fields, delimiter=',', lineterminator=self.linesep) 517 | writer.writeheader() 518 | # empty_line = dict.fromkeys(fields) 519 | for t in threads: 520 | row = dict() 521 | row.fromkeys(fields) 522 | answers = t['answers'] 523 | # question with no answers are excluded 524 | i = 0 525 | for a in answers: 526 | i += 1 527 | if i == 1: 528 | row['resolved'] = t['resolved'] 529 | row['question_uid'] = t['question_uid'] 530 | if len(t['tags']) > 0: 531 | row['has_tags'] = True 532 | else: 533 | row['has_tags'] = False 534 | else: 535 | row['resolved'] = '' 536 | row['question_uid'] = '' 537 | row['answers_count'] = t['answers_count'] 538 | row['answer_uid'] = a['uid'] 539 | row['time_difference'] = a['time_difference'] 540 | row['time_difference_rank'] = a['time_difference_rank'] 541 | if a['resolve'] == 'solution': 542 | row['solution'] = True 543 | else: 544 | row['solution'] = False 545 | row['len'] = a['len'] 546 | row['len_rank'] = a['len_rank'] 547 | row['wordcount'] = a['wordcount'] 548 | row['wordcount_rank'] = a['wordcount_rank'] 549 | row['avg_chars_per_word'] = a['avg_chars_per_word'] 550 | row['avg_chars_per_word_rank'] = a['avg_chars_per_word_rank'] 551 | row['sentences'] = a['sentences'] 552 | row['sentences_rank'] = a['sentences_rank'] 553 | row['avg_words_per_sentence'] = a['avg_words_per_sentence'] 554 | row['avg_words_per_sentence_rank'] = a['avg_words_per_sentence_rank'] 555 | row['longest_sentence'] = a['longest_sentence'] 556 | row['longest_sentence_rank'] = a['longest_sentence_rank'] 557 | row['views'] = a['views'] 558 | try: 559 | row['views_rank'] = a['views_rank'] 560 | except KeyError: 561 | pass 562 | row['loglikelihood'] = a['loglikelihood'] 563 | row['loglikelihood_descending_rank'] = a['loglikelihood_descending_rank'] 564 | row['loglikelihood_ascending_rank'] = a['loglikelihood_ascending_rank'] 565 | row['F-K'] = a['F-K'] 566 | row['F-K_descending_rank'] = a['F-K_descending_rank'] 567 | row['F-K_ascending_rank'] = a['F-K_ascending_rank'] 568 | row['upvotes'] = a['upvotes'] 569 | #row['upvotes_rank'] = a['upvotes_rank'] 570 | row['has_links'] = a['has_links'] 571 | row['has_code_snippet'] = 
a['has_code_snippet'] 572 | row['date_time'] = a['date_time'] 573 | writer.writerow(row) 574 | #writer.writerow(empty_line) 575 | csvf.close() 576 | -------------------------------------------------------------------------------- /scn/discretizer/scn_discretizer.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | from discretization import Discretizer 5 | 6 | 7 | def main(): 8 | program_name = os.path.basename(sys.argv[0]) 9 | db_files = {'abap': 'abap.pydb'} 10 | try: 11 | db_names = sys.argv[1] 12 | except IndexError: 13 | raise Exception('No db name. Please, re-run as "{0} dbname.pydb"'.format(program_name)) 14 | 15 | if db_names == 'all': 16 | discretizer = Discretizer(db_names, db_files) 17 | else: 18 | try: 19 | discretizer = Discretizer(db_names, {db_names: db_files.get(db_names)}) 20 | except KeyError: 21 | raise Exception('Invalid db name {0}. Please, check the name and re-run.'.format(db_names)) 22 | 23 | discretizer.load_db(check=False, fix=False, save_to_file=False) 24 | 25 | corpus = discretizer.build_corpus() 26 | stems = discretizer.build_stems(corpus) 27 | stemmed_vocabulary = discretizer.build_vocabulary(stems) 28 | distib_matrix = discretizer.build_distribution_matrix(stems) 29 | 30 | # grouping 31 | threads = discretizer.load_threads() 32 | # discretization and sorting 33 | threads = discretizer.compute_features(threads, stemmed_vocabulary, distib_matrix) 34 | discretizer.save_csv(threads) 35 | 36 | 37 | if __name__ == "__main__": 38 | sys.exit(main()) 39 | -------------------------------------------------------------------------------- /scn/requirements.txt: -------------------------------------------------------------------------------- 1 | BeautifulSoup==3.2.1 2 | cffi==1.2.1 3 | characteristic==14.3.0 4 | cryptography==1.0.2 5 | cssselect==0.9.1 6 | enum34==1.0.4 7 | html2text==2015.6.21 8 | idna==2.0 9 | ipaddress==1.0.14 10 | lxml==3.4.4 11 | pyasn1==0.1.9 12 | pyasn1-modules==0.0.8 13 | pycparser==2.14 14 | PyDbLite==3.0.2 15 | pyOpenSSL==0.15.1 16 | queuelib==1.4.2 17 | Scrapy==1.0.3 18 | selenium==2.48.0 19 | service-identity==14.0.0 20 | six==1.10.0 21 | Twisted==15.4.0 22 | w3lib==1.12.0 23 | wheel==0.24.0 24 | zope.interface==4.1.3 25 | -------------------------------------------------------------------------------- /scn/scnscraper/dataStoring.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Salvatore Cassano' 2 | 3 | from pydblite.pydblite import Base 4 | from items import SapItem 5 | import re 6 | import os.path 7 | 8 | class DataStoring(): 9 | 10 | #Inizialize an instantiated object by opening json file and the database 11 | def __init__(self): 12 | self.out_file = open("scnscraper/abap.json", "a") 13 | self.out_file.close() 14 | self.db = Base("scnscraper/abap.pydb") 15 | if self.db.exists(): 16 | self.db.open() 17 | else: 18 | self.db.create('url', 'uid', 'type', 'author', 'title', 'date_time', 'tags', 19 | 'views', 'answers', 'resolve', 'upvotes', 'text') 20 | 21 | #for each thread scraped, insert it into db 22 | def insert_items_into_db(self, threads): 23 | for thread in threads: 24 | item = SapItem() # New Item instance 25 | item = thread 26 | try: 27 | # Insert into db 28 | self.db.insert(url = str(item["url"]), uid = str(item["uid"]), type= str(item["type"] ), 29 | author=str(item["author"]), title = str(item["title"]), 30 | date_time = str(item["date_time"] ),tags = str(item["tags"] ), 31 | views = str(item["views"] ), answers 
= str(item["answers"] ), 32 | resolve = str(item["resolve"] ), upvotes = str(item["upvotes"] ), 33 | text = str(item["text"])) 34 | except UnicodeEncodeError: 35 | print("Unicode Encode Exception!") 36 | #save changes on disk 37 | self.db.commit() 38 | 39 | # for each thread scraped, initialize the string to insert into json file 40 | def threads_to_str(self, threads): 41 | out_string = "[ " 42 | if threads.__len__() == 0: 43 | return "" 44 | for thread in threads: 45 | item = SapItem() 46 | item = thread 47 | try: 48 | out_string += "{ url: '" + str(item["url"] ) + "', " + "uid: '" + str(item["uid"] ) + "', "\ 49 | "type: '" + str(item["type"] ) + "', "\ 50 | "author: '"+ str(item["author"]) + "', " \ 51 | "title: '"+ str(item["title"]) + "', "\ 52 | "date_time: '"+ str(item["date_time"] ) + "', " \ 53 | "tags: '"+ str(item["tags"] ) + "', " \ 54 | "views: '"+ str(item["views"] ) + "', "\ 55 | "answers: '"+ str(item["answers"] ) + "', " \ 56 | "resolve: '"+ str(item["resolve"] ) + "', " \ 57 | "upvotes: '"+ str(item["upvotes"] ) + "', "\ 58 | "text: '" + str(item["text"]) + "' }\n" 59 | except UnicodeEncodeError: 60 | print("Unicode Encode Exception!") 61 | 62 | out_string += " ]\n\n" 63 | return out_string 64 | 65 | 66 | #for each thread scraped, insert it into json file 67 | def insert_items_into_file(self, threads): 68 | try: 69 | self.out_file = open("scnscraper/abap.json", "a") # open in append mode 70 | #convert into string and insert into file 71 | self.out_file.write(self.threads_to_str(threads)) 72 | self.out_file.close() 73 | except: 74 | print('Exception in writing file') 75 | self.out_file.close() 76 | 77 | 78 | # read the web page index 79 | def read_index_from_file(self): 80 | if os.path.exists('scnscraper/index.txt'): 81 | with open('scnscraper/index.txt') as f: 82 | index = int(f.readline()) 83 | f.close() 84 | else: 85 | f = open('scnscraper/index.txt', 'w') 86 | index = 2 87 | f.write(str(index)) 88 | f.close() 89 | return index 90 | 91 | # Write the web page index 92 | def write_index_into_file(self, i): 93 | f = open('scnscraper/index.txt', 'w') 94 | f.write(str(i)) 95 | f.close() 96 | 97 | 98 | # Convert the content of json file into a new db 99 | def from_json_to_db(self): 100 | thread = '' 101 | db = Base("scnscraper/abap.pydb", save_to_file= True) 102 | # create new base with field names 103 | db.create('url', 'uid', 'type', 'author', 104 | 'title', 'date_time', 'tags', 'views', 105 | 'answers', 'resolve', 'upvotes', 'text', mode='override') 106 | i=0 107 | with open('scnsraper/threads.json', 'r') as file: 108 | for line in file: 109 | if(line.endswith(" }\n")): 110 | thread += line 111 | tokens = re.search(r"url:\s'(.*?)',\suid:\s'(.*?)',\stype:\s'(.*?)',\sauthor:\s'(.*?)',\stitle:\s'(.*?)',\sdate_time:\s'(.*?)',\stags:\s'(.*?)',\sviews:\s'(.*?)',\sanswers:\s'(.*?)',\sresolve:\s'(.*?)',\supvotes:\s'(.*?)', text:\s'((.|\n)*)'\s}", str(thread)) 112 | if tokens is not None: 113 | db.insert(url = tokens.group(1), uid = tokens.group(2), type= tokens.group(3), 114 | author=tokens.group(4), title = tokens.group(5), date_time = tokens.group(6), 115 | tags = tokens.group(7), views = tokens.group(8), answers = tokens.group(9), 116 | resolve = tokens.group(10), upvotes = tokens.group(11), text = tokens.group(12)) 117 | db.commit() 118 | print ('\n--------------------------------------------\n') 119 | thread = '' 120 | if(line.startswith(" ]")): 121 | print("new page") 122 | thread = '' 123 | if(line.endswith('\n') and (not line.startswith(" ]\n\n")) and (not line.endswith(" 
}\n"))): 124 | thread += line 125 | 126 | 127 | def state_extraction(): 128 | db = Base("scnscraper/abap.pydb") 129 | if db.exists(): 130 | db.open() 131 | record = db(type = "Question") 132 | print("# discussion scraped: " + str(record.__len__())) 133 | print("Answered: " + str(db(resolve = "Answered.").__len__())) 134 | print("Answered with solution: "+ str(db(resolve = "solution").__len__())) 135 | print("Not Answered: " + str(db(resolve = "Not Answered.").__len__())) 136 | print("Assumed Answered: " + str(db(resolve = "Assumed Answered.").__len__())) 137 | 138 | state_extraction = staticmethod(state_extraction) 139 | 140 | if __name__ == '__main__': 141 | DataStoring.state_extraction() 142 | -------------------------------------------------------------------------------- /scn/scnscraper/items.py: -------------------------------------------------------------------------------- 1 | 2 | from scrapy.item import Item, Field 3 | 4 | class SapItem(Item): 5 | uid = Field() # user id, unique and identifier for each post 6 | type = Field() # question, answer 7 | author = Field() 8 | title = Field() 9 | text = Field() 10 | date_time = Field() 11 | tags = Field() 12 | views = Field() 13 | answers = Field() # #answers 14 | resolve = Field() 15 | upvotes = Field() # likes 16 | url = Field() 17 | 18 | def __str__(self): 19 | return "Item(" + str(self['type']) + ") #" + str(self['uid']) 20 | -------------------------------------------------------------------------------- /scn/scnscraper/main.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Salvatore Cassano' 2 | 3 | from scraper import Scraper 4 | from dataStoring import DataStoring 5 | 6 | class MainApp(): 7 | 8 | 9 | if __name__ == '__main__': 10 | startUrl = "http://scn.sap.com/community/abap/content?filterID=contentstatus[published]~objecttype~objecttype[thread]&start=" 11 | storing = DataStoring() 12 | #read the input param 13 | i = storing.read_index_from_file() 14 | completeUrl = "" 15 | print("\n\n-------- SCRAPER STARTED ---\n") 16 | while (i<5000): 17 | #string concatenation to get the complete URL 18 | completeUrl = startUrl + str(20*i) 19 | #threads scraped from URL 20 | threads = [] 21 | print("------ SCRAPING NEW WEB PAGE (PAGE " + str(i) +") ---\n") 22 | SCNScraper = Scraper(completeUrl) 23 | #get threads 24 | threads = SCNScraper.scraping() 25 | #save content into json file 26 | storing.insert_items_into_file(threads) 27 | #save content into db 28 | storing.insert_items_into_db(threads) 29 | i = i+1 30 | #update index file 31 | storing.write_index_into_file(i) 32 | 33 | -------------------------------------------------------------------------------- /scn/scnscraper/scraper.py: -------------------------------------------------------------------------------- 1 | 2 | __author__ = 'Salvatore Cassano' 3 | 4 | import re 5 | from selenium import webdriver 6 | from selenium.webdriver.support.ui import WebDriverWait 7 | from selenium.webdriver.support import expected_conditions as EC 8 | from selenium.common.exceptions import TimeoutException 9 | from selenium.webdriver.common.by import By 10 | import time 11 | from items import SapItem 12 | 13 | class Scraper(): 14 | 15 | #Inizialize an instantiated object setting Firefox as browser and setting the url 16 | def __init__(self, url): 17 | #self.driver = webdriver.Firefox() 18 | try: 19 | self.driver = webdriver.PhantomJS('scnscraper/phantomjs.exe') 20 | except: 21 | print('Please insert Phantomjs into directory and try again. 
PRESS ENTER TO CONTINUE...\n') 22 | raw_input() 23 | self.driver.get(url) 24 | 25 | 26 | def scraping(self): 27 | driver = self.driver 28 | delay = 100 # number of seconds 29 | linkOccurrences = 0 # number of link to scrape in the page 30 | start_url = str(driver.current_url) 31 | page_state = self.driver.execute_script('return document.readyState;') #wait until page is ready 32 | print("Loading page content...") 33 | while True: #repeat until content is loaded from the server db 34 | try: 35 | #find and click on previous button 36 | web_page = driver.find_element_by_class_name('j-pagination-prev') 37 | web_page.click() 38 | #wait until the loading is ultimated 39 | time.sleep(WebDriverWait(driver, delay).until_not( 40 | EC.presence_of_element_located((By.CLASS_NAME, 'j-loading-container')))) 41 | except TimeoutException: 42 | print "Loading took too much time!" 43 | #takes the number of link that need to be considered, it excludes link of discussions that are not marked as 44 | #answered or not answered and link of discussions that have ANONYMOUS user. 45 | linkOccurrences = driver.find_elements_by_xpath("//tr[td[@class='j-td-icon' and " 46 | ".//img[@class = 'jive-icon-discussion-question jive-icon-med']] " 47 | "or td[@class='j-td-icon' and " 48 | ".//img[@class = 'jive-icon-discussion-correct jive-icon-med']]]" 49 | "[td[@class='j-td-author']/a]//td[@class = 'j-td-title']//a").__len__() 50 | if (linkOccurrences!=0): 51 | break 52 | print("Content loaded with success!\n") 53 | index = 0 #link occurrences iterator 54 | print("--- Scraping threads from web page's link ---\n") 55 | items = [] #items scraped, initializing output 56 | while index < linkOccurrences: 57 | #check if the url have an error, then stop the program 58 | if 'http://scn.sap.com/community/abap/content?start=' in str(driver.current_url): 59 | print("--- ERROR IN PAGE LOADING ---") 60 | return 61 | #takes the reference of link that need to be scape, it excludes link of discussions that are not marked as 62 | #answered or not answered and link of discussions that have ANONYMOUS user. 63 | link = driver.find_elements_by_xpath("//tr[td[@class='j-td-icon' and " 64 | ".//img[@class = 'jive-icon-discussion-question jive-icon-med']] " 65 | "or td[@class='j-td-icon' and " 66 | ".//img[@class = 'jive-icon-discussion-correct jive-icon-med']]]" 67 | "[td[@class='j-td-author']/a]//td[@class = 'j-td-title']//a")[index] 68 | web_page = link.click() #click the link selected 69 | #wait until page is loaded 70 | WebDriverWait(driver, delay).until(EC.presence_of_element_located((By.CLASS_NAME, 'js-original-header'))) 71 | resolve = [] # says if element is answered or not answered 72 | url = str(driver.current_url) 73 | try: 74 | #select the element [answered, not answered, assumed answered] 75 | element = driver.find_element_by_xpath("//header[@class='js-original-header']//p/strong").text.encode('utf8') 76 | except: 77 | time.sleep(4) 78 | #sleep until element is completely loaded 79 | try: 80 | #repeat the selection 81 | element = driver.find_element_by_xpath("//header[@class='js-original-header']//p/strong").text.encode('utf8') 82 | except: 83 | print('Element not Found') 84 | element = "Not Answered." 
85 | resolve.append(element) 86 | if(str(element).__eq__("Answered.")): 87 | #take the date of solution 88 | date = str(driver.find_element_by_xpath("//span[@class='font-color-meta j-line2']").text.encode('utf8')) 89 | solution_date = str(re.sub('by.*?on ', "", date)) 90 | try: 91 | #take the solution user 92 | solution_user = str(driver.find_element_by_xpath( 93 | "//span[@class='font-color-meta j-line2']/a").text.encode('utf8')) 94 | except: 95 | solution_user = 'ANONYMOUS' 96 | else: 97 | solution_date = "---" 98 | solution_user = "---" 99 | resolve.append(solution_date) 100 | resolve.append(solution_user) 101 | #select the number of post in a thread 102 | postOccurrences = driver.find_elements_by_xpath("//a[@class='jiveTT-hover-user jive-username-link']").__len__() 103 | i = 0 # number of occurrences iterator 104 | while i < postOccurrences: 105 | item = SapItem() # new Item instance 106 | try: 107 | # select the author in i position 108 | item["author"] = driver.find_elements_by_xpath("//a[@class='jiveTT-hover-user jive-username-link']")\ 109 | .pop(i).text.encode('utf8') 110 | except: 111 | item["author"] = 'ANONYMOUS' 112 | # select the url in i position 113 | item["url"] = url 114 | # generate the uid in i position 115 | item["uid"] = (str(url.replace("http://scn.sap.com/thread/", ""))) + "." + str(i+1) 116 | # select the title 117 | title = driver.find_element_by_xpath("//header[@class='js-original-header']//h1//a").text.encode('utf8') 118 | if(i==0): 119 | item["type"] = "Question" 120 | item["title"] = title 121 | else: 122 | item["type"] = "Answer" 123 | item["title"] = "re: " + title 124 | # select the text in i position 125 | if (str(element).__eq__("Answered.")) and (i>0): 126 | item["text"] = driver.find_elements_by_class_name("jive-rendered-content").pop(i+1).text.encode('utf8') 127 | else: 128 | item["text"] = driver.find_elements_by_class_name("jive-rendered-content").pop(i).text.encode('utf8') 129 | if (i==0): 130 | try: 131 | # select the date_time for question 132 | item["date_time"] = driver.find_elements_by_xpath("//span[@class='j-post-author']" 133 | ).pop(0).text.encode('utf8').split('\n', 1)[-1] 134 | except IndexError: 135 | #select and obtain the date_time from selector 136 | item["date_time"] = "" 137 | stringXpath = driver.find_elements_by_class_name('j-post-author ') 138 | date_extracted = stringXpath[i].text.encode('utf8') 139 | #regular expression to get from string selected the date_time 140 | list_of_re = re.findall('(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) (.*?) (AM|PM) ', 141 | str(date_extracted)) 142 | item["date_time"] = list_of_re.pop().__str__().replace("('", "").replace("', '", " ").replace("')", "") 143 | else: 144 | #select and obtain the date_time from selector 145 | item["date_time"] = "" 146 | stringXpath = driver.find_elements_by_class_name('j-post-author ') 147 | date_extracted = stringXpath[i].text.encode('utf8') 148 | try: 149 | #regular expression to get from string selected the date_time 150 | list_of_re = re.findall('(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) (.*?) 
(AM|PM) ', 151 | str(date_extracted)) 152 | item["date_time"] = list_of_re.pop().__str__().replace("('", "").replace("', '", " ").replace("')", "") 153 | except UnicodeEncodeError: 154 | item["date_time"] = date_extracted 155 | except IndexError: 156 | print("Index Exception") 157 | item["date_time"] = driver.find_elements_by_xpath("//span[@class='j-post-author']" 158 | ).pop(1).text.encode('utf8').split('\n', 1)[-1] 159 | if (i==0): 160 | # select the tags, if exists, for a question 161 | tags = driver.find_elements_by_class_name("jive-thread-post-details-tags") 162 | if len(tags) != 0: 163 | list_of_tags = [] 164 | for tags in tags: 165 | list_of_tags.append(tags.text.encode('utf8')) 166 | item["tags"] = list_of_tags 167 | else: 168 | item["tags"] = "null" 169 | else: 170 | item["tags"] = "null" 171 | if (i==0): 172 | # select the views for a question 173 | item["views"] = driver.find_elements_by_xpath("//span[@class='jive-content-footer-item']" 174 | ).pop(i).text.encode('utf8').replace(" Views", "") 175 | # select the answers for a question 176 | item["answers"] = postOccurrences-1 177 | # this attribute isn't available for answers, then it's set with a null value 178 | item["upvotes"] = "---" 179 | item["resolve"] = resolve[0] 180 | else: 181 | # this attribute isn't available for answers, then it's set with a null value 182 | item["views"] = 0 183 | # this attribute isn't available for answers, then it's set with a null value 184 | item["answers"] = "---" 185 | # select the upvotes for an answer 186 | item["upvotes"] = driver.find_element_by_class_name(" jive-acclaim-likedlink").text.encode('utf8') 187 | # check the resolve value 188 | if(str(resolve[0]).__eq__("Not Answered.")): 189 | # when discussion is Not Answered the solution not exists 190 | item["resolve"] = "---" 191 | else: 192 | # when the solution is Answered, check if the post i is solution by comparing 193 | # the author and the date_time with the author and the date of solution 194 | try: 195 | if (str(item["author"]).__eq__(resolve[2])) and (str(item["date_time"]).__eq__(resolve[1])): 196 | item["resolve"] = "solution" 197 | else: 198 | item["resolve"] = "---" 199 | except UnicodeEncodeError: 200 | item["resolve"] = "---" 201 | # append the thread scraped 202 | items.append(item) 203 | print("--- " + str(item) + " scraped ---") 204 | # go to the next link 205 | i=i+1 206 | # come back to the previous page (link's page) 207 | web_page = driver.back() 208 | # wait until the page element required is loaded 209 | WebDriverWait(driver, delay).until(EC.presence_of_element_located((By.CLASS_NAME, 'j-pagination-prev'))) 210 | while True: #repeat until content is loaded from the server db 211 | try: 212 | # find and click on previous button 213 | web_page = driver.find_element_by_class_name('j-pagination-prev') 214 | web_page.click() 215 | # wait until the loading is ultimated 216 | time.sleep(WebDriverWait(driver, delay).until_not(EC.presence_of_element_located 217 | ((By.CLASS_NAME, 'j-loading-container')))) 218 | except TimeoutException: 219 | print "Loading took too much time!" 
220 | condition = driver.find_elements_by_xpath("//tr[td[@class='j-td-icon' and " 221 | ".//img[@class = 'jive-icon-discussion-question jive-icon-med']]" 222 | " or td[@class='j-td-icon' and " 223 | ".//img[@class = 'jive-icon-discussion-correct jive-icon-med']]]" 224 | "[td[@class='j-td-author']/a]//td[@class = 'j-td-title']//a").__len__() 225 | # check is the loading is terminated with success, then go next 226 | if (condition!=0): 227 | break 228 | #repeat until content is loaded from the server db 229 | while True: 230 | try: 231 | # find and click on previous button 232 | web_page = driver.find_element_by_class_name('j-pagination-next') 233 | web_page.click() 234 | # wait until the loading is ultimated 235 | time.sleep(WebDriverWait(driver, delay).until_not(EC.presence_of_element_located 236 | ((By.CLASS_NAME, 'j-loading-container')))) 237 | except TimeoutException: 238 | print "Loading took too much time!" 239 | condition = driver.find_elements_by_xpath("//tr[td[@class='j-td-icon' and " 240 | ".//img[@class = 'jive-icon-discussion-question jive-icon-med']] " 241 | "or td[@class='j-td-icon' and " 242 | ".//img[@class = 'jive-icon-discussion-correct jive-icon-med']]]" 243 | "[td[@class='j-td-author']/a]//td[@class = 'j-td-title']//a").__len__() 244 | # check is the loading is terminated with success, then go next 245 | if (condition!=0): 246 | break 247 | # increment the links page iterator 248 | index = index + 1 249 | print("\n--- Threads scraped with success! ---") 250 | print("\n--- Going to another page... ---\n") 251 | #close the web page 252 | driver.close() 253 | return(items) 254 | -------------------------------------------------------------------------------- /yahoo-answers/README.md: -------------------------------------------------------------------------------- 1 |

Yahoo! Answer scraper

2 | -------- 3 |

This work provides web-scraping scripts developed in Python 2.7. They aim to extract questions and answers from the "Programming & Design" category of the Yahoo! Answers website.

4 | 5 |

There are two main scripts:

6 | * yahoourlextractor 7 | * yahooscraper 8 | 9 |
yahoourlextractor
10 | Provides the crawling mechanics needed to collect as many URLs of Programming & Design question threads as possible. 11 | The script uses Selenium WebDriver to handle the "Infinite Scroll" on the P&D homepage and Scrapy to scrape URLs from the other elements available in question-thread pages. 12 | All the URLs are stored in a PyDbLite database, together with the question insertion date when present. 13 | 14 |
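The infinite-scroll handling is, in essence, a scroll-and-wait loop that stops when the page height no longer grows. This is only an illustrative sketch, not the actual spider code; the category URL is a placeholder:

```python
# Illustrative sketch of the "Infinite Scroll" handling with Selenium + PhantomJS.
import time
from selenium import webdriver

driver = webdriver.PhantomJS('phantomjs')          # path to the PhantomJS binary
driver.get('https://answers.yahoo.com/dir/index')  # placeholder for the P&D category URL

last_height = driver.execute_script('return document.body.scrollHeight;')
while True:
    # Scroll to the bottom and give the page time to append more question threads.
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
    time.sleep(2)
    new_height = driver.execute_script('return document.body.scrollHeight;')
    if new_height == last_height:  # nothing new was loaded: the scroll is exhausted
        break
    last_height = new_height

# Collect the candidate question-thread URLs exposed so far.
links = [a.get_attribute('href') for a in driver.find_elements_by_tag_name('a')]
driver.quit()
```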
yahooscraper
15 | This script uses the database produced by yahoourlextractor to start the scraping process of the questions and answers. 16 | For every URL read from the database, it sends multiple Scrapy requests. Every question and answer becomes a Scrapy Item with a precise structure and is processed by a Scrapy pipeline, which stores the items in a new database called QuestionExtracted.pdl. 17 | 18 |
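Conceptually, the spider turns each stored URL into a Scrapy request. Below is a minimal sketch of that step only, not the real YahooScraper spider; the record field name `url` is an assumption made for illustration:

```python
# Minimal sketch: read the URLs collected by yahoourlextractor from the
# PyDbLite base and turn each one into a Scrapy request.
import scrapy
from pydblite.pydblite import Base

class ThreadSketchSpider(scrapy.Spider):
    name = 'thread_sketch'

    def start_requests(self):
        db = Base('URL_Database.pdl')
        db.open()
        for record in db:  # one record per question-thread URL ('url' field assumed)
            yield scrapy.Request(record['url'], callback=self.parse)

    def parse(self, response):
        # The real spider builds one item per question and per answer here and
        # lets the pipeline store them into the output .pdl database.
        pass
```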

Installation

19 | -------- 20 | 21 | 1. Download the content of this directory 22 | 2. Install all the requirements with: `pip install -r requirements.txt` 23 | 3. Download [PhantomJS](http://phantomjs.org/) (for Windows or OSX) and unzip 24 | 4. Move the PhantomJS binary into `yahoourlextractor/YahooUrlSearcher/spiders` 25 | 26 |

Start with the scripts

27 | --- 28 | 29 | 1. Start the first shell script `/yahoo-answer/yahoourlextractor.sh` to obtain the URL database, called `URL_Database.pdl` 30 | 2. Move `URL_Database.pdl`, or another database obtained with the yahoourlextractor script, into /yahoo-answer/yahooscraper/spiders 31 | 3. Start the second shell script `/yahoo-answer/yahooscraper.sh`; this script needs one argument, the name of the URL database. 32 | 33 | In `yahoourlextractor/YahooUrlSearcher/spiders` you obtain the database containing the questions and answers scraped from Yahoo! Answers. By default the name of this DB is `QuestionThreadExtracted.pdl`. The script also provides a .txt log about the amount of scraped data and a JSON file with the items stored in the DB. 34 | 35 |
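After a run, the output database can be sanity-checked directly with PyDbLite. A small sketch follows; the lowercase 'question'/'answer' type values match what the discretizer below expects:

```python
# Quick inspection of the scraping output; run it from the directory that
# contains the .pdl file produced by yahooscraper.
from pydblite.pydblite import Base

db = Base('QuestionThreadExtracted.pdl')
db.open()
print('records scraped: %d' % len(db))
print('questions: %d' % len(db(type='question')))
print('answers:   %d' % len(db(type='answer')))
```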

Notes

36 | --- 37 | In the `yahoourlextractor/YahooUrlSearcher/spiders` dir are present an example URL database called `example_database.pdl`. So it's possibile run a test from command line using `cd /yahoo-answer/yahooscraper.sh` and `./yahooscraper.sh example_database.pdl` command. 38 | 39 | 40 | -------------------------------------------------------------------------------- /yahoo-answers/discretizer/discretizer.py: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | Compatible with Python 2 and Python 3 4 | """ 5 | 6 | import csv 7 | import logging 8 | import os 9 | import re 10 | from math import log 11 | 12 | from dateutil.parser import parse as parse_date 13 | from nltk import FreqDist 14 | from nltk import PorterStemmer 15 | from nltk import tokenize 16 | from nltk import word_tokenize 17 | from pydblite.pydblite import Base 18 | 19 | 20 | class Discretizer: 21 | logging.basicConfig(level=logging.DEBUG) 22 | logger = logging.getLogger(__name__) 23 | linesep = '\n' 24 | 25 | def __init__(self, db_name, db_files): 26 | self.db_name = db_name 27 | self.db_files = db_files 28 | self.db = dict() 29 | 30 | def log(self, msg, level=logging.DEBUG): 31 | self.logger.log(level, msg) 32 | 33 | def load_db(self, check=True, fix=False, save_to_file=False): 34 | self.log('Opening {0} database(s)'.format(len(self.db_files)), logging.INFO) 35 | for db_name, db_file in self.db_files.items(): 36 | _db = Base(db_file, save_to_file=save_to_file) 37 | _db.open() 38 | self.log('Database {0} opened, records #: {1}'.format(db_name, len(_db)), logging.DEBUG) 39 | self.db.update({db_name: _db}) 40 | _db.create_index('uid') 41 | _db.create_index('type') 42 | if check is True: 43 | self.check_db(fix) 44 | 45 | """ 46 | * fix answers_count with actual # of answers exported 47 | * if an answer has tag != N/A, the tags must be applied to the question in the same thread 48 | * if a question is marked as resolved True, then one of the answers in the thread must have been marked as solution; 49 | and viceversa; 50 | * check if Q or A text is '' 51 | * turn question uid from int to unicode string 52 | """ 53 | 54 | def check_db(self, fix=False): 55 | self.log('Checking consistency for databases.', logging.INFO) 56 | for name, _db in self.db.items(): 57 | for question in _db._type['question']: 58 | expected_answers_count = int(question['answers']) 59 | actual_answers_count = 0 60 | for i in range(1, expected_answers_count + 1): 61 | try: 62 | _db._uid[str(question['uid']) + '.' + str(i)][0] 63 | actual_answers_count += 1 64 | except IndexError: 65 | break 66 | if actual_answers_count < expected_answers_count: 67 | self.log('Fixing answers count mismatch in thread id {0}, expected {1}, found {2}'. 68 | format(question['uid'], expected_answers_count, actual_answers_count)) 69 | _db.update(question, answers=actual_answers_count) 70 | 71 | for record in (_db('text') == ''): 72 | self.log('Warning on record {0} from db {1}: empty text!'.format(record['uid'], name), 73 | logging.WARNING) 74 | 75 | for record in (_db('type') == 'answer') & (_db('tags') != 'N/A'): 76 | self.log('Warning on record {0} from db {1}: tags in answer!'.format(record['uid'], name), 77 | logging.WARNING) 78 | question_uid = record['uid'].split('.')[0] 79 | question = _db._uid[question_uid][0] 80 | question_tags = question['tags'] + '.' 
+ record['tags'] 81 | _db.update(question, tags=question_tags) 82 | 83 | if fix is True: 84 | _db.commit() 85 | 86 | def load_threads(self): 87 | self.log('Loading threads from {0} db(s)'.format(len(self.db_files)), logging.INFO) 88 | overall_threads = list() 89 | for name, _db in self.db.items(): 90 | db_threads = list() 91 | questions = _db._type['question'] # use db index 92 | self.log('Loaded {0} questions (threads) from db {1}, attaching answers...'.format(len(questions), name), 93 | logging.DEBUG) 94 | for question in questions: 95 | answers = self._get_answers(question['uid'], int(question['answers']), _db) 96 | db_threads.append({'question': question, 'question_uid': question['uid'], 97 | 'date_time': question['date_time'], 'answers_count': question['answers'], 98 | 'resolved': question['resolve'], 'tags': question['tags'], 'answers': answers}) 99 | 100 | overall_threads.extend(db_threads) 101 | self.log('Overall threads loaded: {0} from {1} database(s)'.format(len(overall_threads), len(self.db_files))) 102 | return overall_threads 103 | 104 | def _get_answers(self, question_id, answers_count, _db): 105 | self.log('Getting {0} answers for thread id {1}'.format(answers_count, question_id), logging.DEBUG) 106 | answers = list() 107 | if answers_count > 0: 108 | for i in range(1, answers_count + 1): 109 | answer_id = '{0}.{1}'.format(question_id, i) 110 | for answer in (_db._uid[answer_id]): # use index 111 | answers.append(answer) 112 | if answers_count != len(answers): 113 | self.log('Warning in thread id {0}: loaded {1} answers, expected {2}. Please, run a check db with ' 114 | 'fix=True'.format(question_id, len(answers), answers_count), 115 | logging.WARNING) 116 | return answers 117 | 118 | def compute_features(self, threads, stemmed_vocabulary, distrib_matrix): 119 | self.log('Computing features. Please, wait. 
This will take some serious time...', logging.INFO) 120 | for thread in threads: 121 | self.log('Computing features for thread id {0}'.format(thread['question_uid']), logging.INFO) 122 | try: 123 | base_date = parse_date(thread['date_time']) 124 | except ValueError: 125 | base_date = parse_date('1970-01-01') 126 | except AttributeError: 127 | base_date = thread['date_time'] 128 | answers = thread['answers'] 129 | tag_list = thread['tags'].split('.') 130 | if '' in tag_list: 131 | tag_list.remove('') 132 | for answer in answers: 133 | # compute thread tags 134 | answer_tags = answer['tags'].split() 135 | if 'N/A' in answer_tags: 136 | answer_tags.remove('N/A') 137 | tag_list.extend(answer_tags) 138 | thread['tags'] = sorted(set(tag_list)) 139 | 140 | # compute len in chars and words 141 | alen = len(answer['text']) 142 | answer['len'] = alen 143 | wordcount = Discretizer._count_words(answer['text']) 144 | answer['wordcount'] = wordcount 145 | if wordcount == 0: 146 | answer['avg_chars_per_word'] = 0 147 | else: 148 | answer['avg_chars_per_word'] = "{0:.2f}".format(alen / float(wordcount)) # float with 2 decimals 149 | try: 150 | sentences = tokenize.sent_tokenize(answer['text'].decode('utf-8', 'replace').encode('ascii', 'replace'), 151 | language='english') 152 | except (AttributeError, TypeError) as e: 153 | sentences = tokenize.sent_tokenize(str(answer['text']), language='english') 154 | sentence_count = len(sentences) 155 | answer['sentences'] = sentence_count 156 | if sentence_count == 0: 157 | words_per_sentence = 0 158 | else: 159 | words_per_sentence = "{0:.2f}".format(wordcount / float(sentence_count)) 160 | answer['avg_words_per_sentence'] = words_per_sentence 161 | longest_sentence = 0 162 | for s in sentences: 163 | l = Discretizer._count_words(s) 164 | if l > longest_sentence: 165 | longest_sentence = l 166 | answer['longest_sentence'] = longest_sentence 167 | try: 168 | creation_date = parse_date(answer['date_time']) 169 | except AttributeError: 170 | creation_date = answer['date_time'] 171 | time_difference = abs((creation_date - base_date).total_seconds()) 172 | answer['time_difference'] = time_difference 173 | 174 | # TODO upvotes score 175 | 176 | # check for urls and code snippets 177 | match = re.search(r'http(s)?://', str(answer['text']), re.MULTILINE) 178 | if match: 179 | answer['has_links'] = True 180 | else: 181 | answer['has_links'] = False 182 | 183 | answer['has_code_snippet'] = self._has_codesnippet(str(answer['text'])) 184 | try: 185 | LL = Discretizer._log_likelihood(answer['text'].decode('utf-8', 'replace').encode('ascii', 'replace'), 186 | stemmed_vocabulary, distrib_matrix) 187 | except (AttributeError, TypeError) as e: 188 | LL = Discretizer._log_likelihood(str(answer['text']), stemmed_vocabulary, distrib_matrix) 189 | answer['loglikelihood'] = LL 190 | answer['loglikelihood_descending'] = LL 191 | answer['loglikelihood_ascending'] = LL 192 | try: 193 | aspw = Discretizer._ASPW(answer['text'].decode('utf-8', 'replace').encode('ascii', 'replace')) 194 | except (AttributeError, TypeError) as e: 195 | aspw = Discretizer._ASPW(str(answer['text'])) 196 | fk = Discretizer._FK(answer['avg_words_per_sentence'], aspw) 197 | answer['F-K'] = fk 198 | answer['F-K_descending'] = fk 199 | answer['F-K_ascending'] = fk 200 | 201 | # compute ranks 202 | answers = Discretizer._sort_rank(answers, 'upvotes', reverse=True) 203 | answers = Discretizer._sort_rank(answers, 'sentences', reverse=True) 204 | answers = Discretizer._sort_rank(answers, 'len', reverse=True) 205 | answers 
= Discretizer._sort_rank(answers, 'views', reverse=True) 206 | answers = Discretizer._sort_rank(answers, 'wordcount', reverse=True) 207 | answers = Discretizer._sort_rank(answers, 'avg_chars_per_word', reverse=True) 208 | answers = Discretizer._sort_rank(answers, 'avg_words_per_sentence', reverse=True) 209 | answers = Discretizer._sort_rank(answers, 'longest_sentence', reverse=True) 210 | answers = Discretizer._sort_rank(answers, 'time_difference', reverse=False) 211 | answers = Discretizer._sort_rank(answers, 'loglikelihood_descending', reverse=True) 212 | answers = Discretizer._sort_rank(answers, 'loglikelihood_ascending', reverse=False) 213 | answers = Discretizer._sort_rank(answers, 'F-K_descending', reverse=True) 214 | answers = Discretizer._sort_rank(answers, 'F-K_ascending', reverse=False) 215 | thread['answers'] = answers 216 | 217 | self.log('Done computing features for {0} threads'.format(len(threads)), logging.INFO) 218 | return threads 219 | 220 | @staticmethod 221 | def _ASPW(text): 222 | aspw = 0 223 | for word in text.split(): 224 | s = Discretizer._count_syllables(word) 225 | aspw += s 226 | return aspw 227 | 228 | @staticmethod 229 | def _count_syllables(word): 230 | vowels = ['a', 'e', 'i', 'o', 'u', 'y'] 231 | currentWord = list(word) 232 | numVowels = 0 233 | lastWasVowel = False 234 | for wc in currentWord: 235 | foundVowel = False 236 | for v in vowels: 237 | # don't count diphthongs 238 | if (v == wc) and lastWasVowel is True: 239 | foundVowel = True 240 | lastWasVowel = True 241 | break 242 | elif (v == wc) and lastWasVowel is False: 243 | numVowels += 1 244 | foundVowel = True 245 | lastWasVowel = True 246 | break 247 | 248 | # If full cycle and no vowel found, set lastWasVowel to false; 249 | if not foundVowel: 250 | lastWasVowel = False 251 | 252 | # Remove es, it's _usually? 
silent 253 | if (len(word) > 2) and (word[len(word)-2:] == "es"): 254 | numVowels -= 1 255 | # remove silent e 256 | elif (len(word) > 1) and (word[len(word)-1:] == "e"): 257 | numVowels -= 1 258 | return numVowels 259 | 260 | @staticmethod 261 | def _FK(awps, asps): 262 | fk = (0.39 * float(awps)) + (11.8 * float(asps)) - 15.59 263 | return fk 264 | 265 | @staticmethod 266 | def _log_likelihood(answer_text, stemmed_vocabulary, distrib_matrix): 267 | LL = 0 268 | if answer_text is not '': 269 | tokens = word_tokenize(str(answer_text), language='english') 270 | porter_stemmer = PorterStemmer() 271 | unique_wordcount = len(stemmed_vocabulary) 272 | """ 273 | per ogni w unica print_function words 274 | Cw = conta w in answer_text 275 | PwM = self.distrib_matrix[stemmer(w)] 276 | unique_wordcount = len(tokenize(answer_text) 277 | """ 278 | for w in tokens: 279 | _w = w.strip().lower() 280 | Cw = 0 281 | for _ in answer_text.split(): 282 | if _w == _.strip().lower(): 283 | Cw += 1 284 | 285 | try: 286 | w_stem = porter_stemmer.stem(_w.decode('utf-8', 'replace').encode('ascii', 'replace')) 287 | except AttributeError: 288 | w_stem = porter_stemmer.stem(_w) 289 | try: 290 | PwM = distrib_matrix[w_stem] 291 | except KeyError: # key error means frequency is equal to cutoff point 1 292 | PwM = 1 293 | LL += (Cw * log(float(PwM))) 294 | 295 | try: 296 | LL = "{0:.2f}".format(LL / float(unique_wordcount)) 297 | except ZeroDivisionError: 298 | LL = 0 299 | 300 | return LL 301 | 302 | @staticmethod 303 | def _count_words(text): 304 | wordcount = 0 305 | for word in text.split(): 306 | wordcount += 1 307 | return wordcount 308 | 309 | @staticmethod 310 | def _sort_rank(answers, key, reverse=True): 311 | new_list = sorted(answers, key=lambda x: float(x[key]), reverse=reverse) 312 | ranks = dict() 313 | for i in range(0, len(answers)): 314 | ranks[new_list[i]['uid']] = i + 1 315 | 316 | # fix rank ties 317 | for i in range(0, len(answers)-1): 318 | if new_list[i][key] == new_list[i+1][key]: 319 | ranks[new_list[i+1]['uid']] = ranks[new_list[i]['uid']] 320 | 321 | for k, v in ranks.items(): 322 | for a in answers: 323 | if a['uid'] == k: 324 | a['{0}_rank'.format(key)] = v 325 | return answers 326 | 327 | def _has_codesnippet(self, text): 328 | code = False 329 | if re.search(r'({|}| package |\.jar| class | namespace |exception |<<| end | def |<\?php| soap | cutoff} 434 | return reduced 435 | 436 | def save_csv(self, threads): 437 | fout = '{0}_features.csv'.format(self.db_name) 438 | self.log('Saving features into {0}'.format(fout), logging.INFO) 439 | csvf = open(fout, 'wt') 440 | fields = ('resolved', 'question_uid', 'answers_count', 'answer_uid', 441 | 'date_time', 'time_difference', 'time_difference_rank', 'solution', 'len', 'len_rank', 'wordcount', 442 | 'wordcount_rank', 'avg_chars_per_word', 'avg_chars_per_word_rank', 'sentences', 'sentences_rank', 443 | 'avg_words_per_sentence', 'avg_words_per_sentence_rank', 'longest_sentence', 'longest_sentence_rank', 444 | 'views', 'views_rank', 'loglikelihood', 'loglikelihood_ascending_rank', 445 | 'loglikelihood_descending_rank', 'F-K', 'F-K_ascending_rank', 'F-K_descending_rank', 'upvotes', 446 | 'upvotes_rank', 'has_links', 'has_code_snippet', 'has_tags') 447 | writer = csv.DictWriter(csvf, dialect=csv.excel, fieldnames=fields, delimiter=';', lineterminator=self.linesep) 448 | writer.writeheader() 449 | # empty_line = dict.fromkeys(fields) 450 | for t in threads: 451 | row = dict() 452 | row.fromkeys(fields) 453 | answers = t['answers'] 454 | # question with 
no answers are excluded 455 | i = 0 456 | for a in answers: 457 | i += 1 458 | if i == 1: 459 | row['resolved'] = t['resolved'] 460 | row['question_uid'] = t['question_uid'] 461 | if len(t['tags']) > 0: 462 | row['has_tags'] = True 463 | else: 464 | row['has_tags'] = False 465 | else: 466 | row['resolved'] = '' 467 | row['question_uid'] = '' 468 | row['answers_count'] = t['answers_count'] 469 | row['answer_uid'] = a['uid'] 470 | row['time_difference'] = a['time_difference'] 471 | row['time_difference_rank'] = a['time_difference_rank'] 472 | if a['resolve'] == 'solution': 473 | row['solution'] = True 474 | else: 475 | row['solution'] = False 476 | row['len'] = a['len'] 477 | row['len_rank'] = a['len_rank'] 478 | row['wordcount'] = a['wordcount'] 479 | row['wordcount_rank'] = a['wordcount_rank'] 480 | row['avg_chars_per_word'] = a['avg_chars_per_word'] 481 | row['avg_chars_per_word_rank'] = a['avg_chars_per_word_rank'] 482 | row['sentences'] = a['sentences'] 483 | row['sentences_rank'] = a['sentences_rank'] 484 | row['avg_words_per_sentence'] = a['avg_words_per_sentence'] 485 | row['avg_words_per_sentence_rank'] = a['avg_words_per_sentence_rank'] 486 | row['longest_sentence'] = a['longest_sentence'] 487 | row['longest_sentence_rank'] = a['longest_sentence_rank'] 488 | row['views'] = a['views'] 489 | row['views_rank'] = a['views_rank'] 490 | row['loglikelihood'] = a['loglikelihood'] 491 | row['loglikelihood_descending_rank'] = a['loglikelihood_descending_rank'] 492 | row['loglikelihood_ascending_rank'] = a['loglikelihood_ascending_rank'] 493 | row['F-K'] = a['F-K'] 494 | row['F-K_descending_rank'] = a['F-K_descending_rank'] 495 | row['F-K_ascending_rank'] = a['F-K_ascending_rank'] 496 | row['upvotes'] = a['upvotes'] 497 | row['upvotes_rank'] = a['upvotes_rank'] 498 | row['has_links'] = a['has_links'] 499 | row['has_code_snippet'] = a['has_code_snippet'] 500 | row['date_time'] = a['date_time'] 501 | writer.writerow(row) 502 | #writer.writerow(empty_line) 503 | csvf.close() -------------------------------------------------------------------------------- /yahoo-answers/discretizer/main.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | from discretizer import Discretizer 5 | 6 | def main(): 7 | program_name = os.path.basename(sys.argv[0]) 8 | #Database name 9 | db_files = {'yahoo': 'no_date_database.pdl'} 10 | try: 11 | db_names = sys.argv[1] 12 | except IndexError: 13 | raise Exception('No db name. Please, re-run as {0} dbname.pdl'.format(program_name)) 14 | 15 | if db_names == 'all': 16 | discretizer = Discretizer(db_names, db_files) 17 | else: 18 | try: 19 | discretizer = Discretizer(db_names, {db_names: db_files.get(db_names)}) 20 | except KeyError: 21 | raise Exception('Invalid db name {0}. 
Please, check the name and re-run.'.format(db_names)) 22 | 23 | discretizer.load_db(check=False, fix=False, save_to_file=False) 24 | 25 | corpus = discretizer.build_corpus() 26 | stems = discretizer.build_stems(corpus) 27 | stemmed_vocabulary = discretizer.build_vocabulary(stems) 28 | distib_matrix = discretizer.build_distribution_matrix(stems) 29 | 30 | # grouping 31 | threads = discretizer.load_threads() 32 | # discretization and sorting 33 | threads = discretizer.compute_features(threads, stemmed_vocabulary, distib_matrix) 34 | discretizer.save_csv(threads) 35 | 36 | 37 | if __name__ == "__main__": 38 | sys.exit(main()) 39 | """db = Base('dotnet-v1.pydb', save_to_file=False) 40 | db.open() 41 | #recs = [r for r in db if r('type') == 'question' and r('answers') > 0] 42 | rec = (db("type") == 'question') & (db("answers") > 0) 43 | print len(rec)""" 44 | 45 | -------------------------------------------------------------------------------- /yahoo-answers/requirements.txt: -------------------------------------------------------------------------------- 1 | appnope==0.1.0 2 | backports.ssl-match-hostname==3.4.0.2 3 | certifi==2015.9.6.2 4 | cffi==1.2.1 5 | characteristic==14.3.0 6 | ChatterBot==0.2.5 7 | cryptography==1.0.1 8 | cssselect==0.9.1 9 | decorator==4.0.2 10 | enum34==1.0.4 11 | funcsigs==0.4 12 | functools32==3.2.3.post2 13 | fuzzywuzzy==0.6.2 14 | gnureadline==6.3.3 15 | html2text==2015.6.21 16 | idna==2.0 17 | ipaddress==1.0.14 18 | ipykernel==4.0.3 19 | ipython==4.0.0 20 | ipython-genutils==0.1.0 21 | ipywidgets==4.0.3 22 | Jinja2==2.8 23 | jsondatabase==0.0.6 24 | jsonschema==2.5.1 25 | jupyter==1.0.0 26 | jupyter-client==4.0.0 27 | jupyter-console==4.0.2 28 | jupyter-core==4.0.6 29 | lxml==3.5.0b1 30 | MarkupSafe==0.23 31 | matplotlib==1.4.3 32 | mechanize==0.2.5 33 | mistune==0.7.1 34 | mock==1.3.0 35 | nbconvert==4.0.0 36 | nbformat==4.0.0 37 | nltk==3.1 38 | nose==1.3.7 39 | notebook==4.0.5 40 | numpy==1.10.1 41 | oauthlib==1.0.3 42 | parsedatetime==1.5 43 | path.py==8.1.1 44 | pbr==1.8.1 45 | pexpect==3.3 46 | pickleshare==0.5 47 | ptyprocess==0.5 48 | pyasn1==0.1.8 49 | pyasn1-modules==0.0.7 50 | pycparser==2.14 51 | PyDbLite==3.0.2 52 | Pygments==2.0.2 53 | pyOpenSSL==0.15.1 54 | pyparsing==2.0.3 55 | python-dateutil==2.4.2 56 | pytz==2015.6 57 | pyzmq==14.7.0 58 | qtconsole==4.0.1 59 | queuelib==1.4.2 60 | requests==2.7.0 61 | requests-oauthlib==0.5.0 62 | Scrapy==1.0.3 63 | selenium==2.47.3 64 | service-identity==14.0.0 65 | simplegeneric==0.8.1 66 | six==1.9.0 67 | stripogram==1.5 68 | terminado==0.5 69 | tornado==4.2.1 70 | traitlets==4.0.0 71 | Twisted==15.4.0 72 | virtualenv==13.1.2 73 | w3lib==1.12.0 74 | wheel==0.24.0 75 | zope.interface==4.1.2 76 | -------------------------------------------------------------------------------- /yahoo-answers/yahooscraper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # sh yahooscraper.sh 3 | 4 | if [ -z "$1" ] 5 | then 6 | echo "ERROR you must enter one arg related to the Yahoo URL DB use -h for Help" 7 | else 8 | if [ "$1" = "-h" ] 9 | then 10 | echo "This script need the name of the database containing question URLs" 11 | echo "- sh yahooscraper.sh " 12 | else 13 | echo "Reading from $1 database " 14 | cd yahooscraper/yahooscraper/yahooscraper/spiders 15 | scrapy crawl yahoo -o question-and-answer-report.json -a database_name=$1 16 | fi 17 | fi -------------------------------------------------------------------------------- 
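Note: after `yahooscraper.sh` completes, the scraped items are persisted by the pipeline into a PyDbLite file (`QuestionThreadExtracted.pdl`, see `pipelines.py` below), which is the same kind of `.pdl` database the discretizer reads. A minimal sketch of how such a file can be inspected from Python is shown here; the script name and the fields printed are illustrative, and only PyDbLite calls already used elsewhere in this repository are assumed:

```python
# inspect_pdl.py -- minimal sketch for peeking into a scraped .pdl database (illustrative)
from pydblite import Base

db = Base('QuestionThreadExtracted.pdl')  # file written by DBPipeline
db.open()                                 # load the existing records
db.create_index('type')                   # enables the db._type[...] lookup used by the discretizer

questions = db._type['question']          # all question records, via the index
answers = (db('type') == 'answer')        # filter syntax, as in Discretizer.check_db()
print('{0} questions, {1} answers'.format(len(questions), len(answers)))

for q in questions[:3]:                   # show a few threads
    print('{0}: {1} answers, resolved={2}'.format(q['uid'], q['answers'], q['resolve']))
```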
/yahoo-answers/yahooscraper/yahooscraper/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = yahooscraper.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = yahooscraper 12 | -------------------------------------------------------------------------------- /yahoo-answers/yahooscraper/yahooscraper/yahooscraper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collab-uniba/qa-scrapers/b26ece3f210d3dcdfd7f2045193e3258cae5b4b4/yahoo-answers/yahooscraper/yahooscraper/yahooscraper/__init__.py -------------------------------------------------------------------------------- /yahoo-answers/yahooscraper/yahooscraper/yahooscraper/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class YahooItem(scrapy.Item): 12 | uid = scrapy.Field() 13 | type = scrapy.Field() 14 | author = scrapy.Field() 15 | title = scrapy.Field() 16 | text = scrapy.Field() 17 | date_time = scrapy.Field() 18 | tags = scrapy.Field() 19 | views = scrapy.Field() 20 | answers = scrapy.Field() 21 | resolve = scrapy.Field() 22 | upvotes = scrapy.Field() 23 | url = scrapy.Field() 24 | -------------------------------------------------------------------------------- /yahoo-answers/yahooscraper/yahooscraper/yahooscraper/pipelines.py: -------------------------------------------------------------------------------- 1 | from pydblite import Base 2 | from scrapy.xlib.pydispatch import dispatcher 3 | from scrapy import signals 4 | import codecs 5 | import datetime 6 | 7 | class DBPipeline(object): 8 | def __init__(self): 9 | 10 | #Creating log file 11 | filename = "session_log.txt" 12 | self.log_target = codecs.open(filename, 'a+', encoding='utf-8') 13 | self.log_target.truncate() 14 | self.log_target.write("***New session started at: "+ str(datetime.datetime.strftime(datetime.datetime.now(), ' %Y-%m-%d %H:%M:%S ')) + " ***" +"\n") 15 | 16 | #Creating database for items 17 | self.db = Base('QuestionThreadExtracted.pdl') 18 | self.db.create('uid', 'type', 'author', 'title', 'text', 'date_time', 19 | 'tags', 'views', 'answers', 'resolve', 'upvotes', 'url', mode="open") 20 | 21 | #Some data for the log file 22 | self.number_of_questions = 0 23 | self.number_of_answers = 0 24 | self.last_id=0 25 | dispatcher.connect(self.spider_closed, signals.spider_closed) 26 | 27 | 28 | def process_item(self, item, spider): 29 | 30 | self.db.insert(uid=item['uid'], 31 | type=item['type'], 32 | author=item['author'], 33 | title=item['title'], 34 | text=item['text'], 35 | date_time=item['date_time'], 36 | tags=item['tags'], 37 | views=item['views'], 38 | answers=item['answers'], 39 | resolve=item['resolve'], 40 | upvotes=item['upvotes'], 41 | url=item['url'] 42 | ) 43 | #Count questions and answers 44 | if "question" in item['type']: 45 | self.number_of_questions+=1 46 | if self.last_id