├── .gitignore ├── LICENSE ├── README.md ├── quora ├── Project_Quora │ ├── Project_Quora │ │ ├── __init__.py │ │ ├── items.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── quora.py │ │ │ └── topic │ │ │ ├── Computer-Programming.txt │ │ │ └── topic.py │ └── scrapy.cfg ├── README.md ├── quora.bat ├── requirements.txt └── topic.bat ├── scn ├── README.md ├── RUN.bat ├── discretizer │ ├── RUN.bat │ ├── discretization.py │ └── scn_discretizer.py ├── requirements.txt └── scnscraper │ ├── dataStoring.py │ ├── items.py │ ├── main.py │ └── scraper.py └── yahoo-answers ├── README.md ├── discretizer ├── discretizer.py └── main.py ├── requirements.txt ├── yahooscraper.sh ├── yahooscraper └── yahooscraper │ ├── scrapy.cfg │ └── yahooscraper │ ├── __init__.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ ├── YahooScraper.py │ ├── __init__.py │ └── example_database.pdl ├── yahoourlextractor.sh └── yahoourlextractor ├── YahooUrlSearcher ├── __init__.py ├── items.py ├── pipelines.py ├── settings.py └── spiders │ ├── __init__.py │ └── yahoourlspider.py └── scrapy.cfg /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | .idea/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | .idea/ 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *,cover 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ 60 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Collaborative Development Group 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 |
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # qa-scrapers 2 | 3 | A collection of Python scripts that leverage Selenium and/or Scrapy to scrape content from Question Answering sites similar to, but other than, Stack Overflow. 4 | 5 | So far, three scrapers are available: 6 | * [Yahoo! Answers](yahoo-answers/README.md) 7 | * [Quora](quora/README.md) 8 | * [SAP Community Network](scn/README.md) 9 | 10 | Please refer to the README.md file within each subfolder for more details. 11 | 12 | ## Fair use policy 13 | Please cite the following paper if you use these scripts for your own research purposes. 14 | 15 | > F. Calefato, F. Lanubile, N. Novielli. “[Moving to Stack Overflow: Best-Answer Prediction in Legacy Developer Forums](http://collab.di.uniba.it/fabio/wp-content/uploads/sites/5/2014/05/a13-calefato.pdf).” In *Proc. 10th Int'l Symposium on Empirical Softw. Eng. and Measurement (ESEM'16)*, Ciudad Real, Spain, Sept. 8-9, 2016, DOI:[10.1145/2961111.2962585](http://doi.acm.org/10.1145/2961111.2962585). 16 | 17 | ```latex 18 | @inproceedings{calefato_2016_esem, 19 | author = {Calefato, Fabio and Lanubile, Filippo and Novielli, Nicole}, 20 | title = {Moving to Stack Overflow: Best-Answer Prediction in Legacy Developer Forums}, 21 | booktitle = {Proc. of the 10th ACM/IEEE Int'l Symposium on Empirical Software Engineering and Measurement}, 22 | series = {ESEM '16}, 23 | year = {2016}, 24 | isbn = {978-1-4503-4427-2}, 25 | location = {Ciudad Real, Spain}, 26 | pages = {13:1--13:10}, 27 | articleno = {13}, 28 | numpages = {10}, 29 | url = {http://doi.acm.org/10.1145/2961111.2962585}, 30 | doi = {10.1145/2961111.2962585}, 31 | acmid = {2962585}, 32 | publisher = {ACM}, 33 | address = {New York, NY, USA}, 34 | keywords = {Best-answer prediction, Developer forums, Q\&A sites, Stack Overflow}, 35 | } 36 | ``` 37 | 38 |
-------------------------------------------------------------------------------- /quora/Project_Quora/Project_Quora/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collab-uniba/qa-scrapers/b26ece3f210d3dcdfd7f2045193e3258cae5b4b4/quora/Project_Quora/Project_Quora/__init__.py
-------------------------------------------------------------------------------- /quora/Project_Quora/Project_Quora/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class ProjectQuoraItem(scrapy.Item): 12 | uid = scrapy.Field() # Id of a question (e.g., 1) or of an answer (e.g., 1.1) 13 | type = scrapy.Field() # question, answer 14 | author = scrapy.Field() # author of a question or an answer 15 | title = scrapy.Field() # title of a question, null for an answer 16 | text = scrapy.Field() # text of a question or an answer 17 | date_time = scrapy.Field() # when a question or an answer was written 18 | tags = scrapy.Field() # topics associated with the question, null for an answer 19 |
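# Illustrative example of the uid scheme above (hypothetical values): a thread
# with two answers is expected to yield items such as
#   {'uid': '1',   'type': 'question', 'answers': 2, 'title': '...'}
#   {'uid': '1.1', 'type': 'answer',   'answers': 0, 'title': 'null'}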
views = scrapy.Field() # views of a questions or an answer 20 | answers = scrapy.Field() # number of answers for a question, 0 for answers 21 | resolve = scrapy.Field() # always null 22 | upvotes = scrapy.Field() # likes for a question (null) or an answers 23 | url = scrapy.Field() # url of a question or an answer 24 | -------------------------------------------------------------------------------- /quora/Project_Quora/Project_Quora/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | from pydblite import Base 9 | import os 10 | import json 11 | from scrapy.xlib.pydispatch import dispatcher 12 | from scrapy import signals 13 | 14 | 15 | class DBPipeline(object): 16 | # Pipeline to write an Item in the database 17 | def open_spider(self, spider): 18 | # Creation of DB 19 | self.db = Base(spider.database) 20 | self.db.create('uid', 'type', 'author', 'title', 'text', 'date_time', 21 | 'tags', 'views', 'answers', 'resolve', 'upvotes', 'url', 22 | mode="override") 23 | dispatcher.connect(self.spider_closed, signals.spider_closed) 24 | 25 | def process_item(self, item, spider): 26 | # Writing of the item 27 | self.db.insert(uid=item['uid'], 28 | type=item['type'], 29 | author=item['author'], 30 | title=item['title'], 31 | text=item['text'], 32 | date_time=item['date_time'], 33 | tags=item['tags'], 34 | views=item['views'], 35 | answers=item['answers'], 36 | resolve=item['resolve'], 37 | upvotes=item['upvotes'], 38 | url=item['url'] 39 | ) 40 | 41 | self.db.commit() 42 | return item 43 | 44 | def spider_closed(self, spider): 45 | # Number of items saved, shown at the end 46 | i = 0 47 | j = 0 48 | for r in self.db: 49 | 50 | if r["type"] == "question": 51 | i += 1 52 | else: 53 | j += 1 54 | 55 | print ('Number of questions and answers found:') 56 | print (str(i) + ' questions \n') 57 | print (str(j) + ' answers \n') 58 | 59 | 60 | class JsonWriterPipeline(object): 61 | # Pipeline to write an Item in Json File 62 | def __init__(self): 63 | if os.path.exists('items.json'): 64 | os.remove('items.json') 65 | 66 | self.file = open('items.json', 'wb') 67 | dispatcher.connect(self.spider_closed, signals.spider_closed) 68 | 69 | def process_item(self, item, spider): 70 | line = json.dumps(dict(item)) + "\n" 71 | self.file.write(line) 72 | return item 73 | 74 | def spider_closed(self, spider): 75 | self.file.close() 76 | -------------------------------------------------------------------------------- /quora/Project_Quora/Project_Quora/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for Project_Quora project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'Project_Quora' 13 | 14 | SPIDER_MODULES = ['Project_Quora.spiders'] 15 | NEWSPIDER_MODULE = 'Project_Quora.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'Project_Quora (+http://www.yourdomain.com)' 20 | 21 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 22 | CONCURRENT_REQUESTS=1 23 | 24 | # Configure a delay for requests for the same website (default: 0) 25 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 26 | # See also autothrottle settings and docs 27 | DOWNLOAD_DELAY=0.5 28 | # The download delay setting will honor only one of: 29 | #CONCURRENT_REQUESTS_PER_DOMAIN=16 30 | #CONCURRENT_REQUESTS_PER_IP=16 31 | 32 | # Disable cookies (enabled by default) 33 | COOKIES_ENABLED=True 34 | 35 | # Disable Telnet Console (enabled by default) 36 | #TELNETCONSOLE_ENABLED=False 37 | 38 | # Override the default request headers: 39 | #DEFAULT_REQUEST_HEADERS = { 40 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 41 | # 'Accept-Language': 'en', 42 | #} 43 | 44 | # Enable or disable spider middlewares 45 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 46 | #SPIDER_MIDDLEWARES = { 47 | # 'Project_Quora.middlewares.MyCustomSpiderMiddleware': 543, 48 | #} 49 | 50 | # Enable or disable downloader middlewares 51 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 52 | #DOWNLOADER_MIDDLEWARES = { 53 | # 'Project_Quora.middlewares.MyCustomDownloaderMiddleware': 543, 54 | #} 55 | 56 | # Enable or disable extensions 57 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 58 | #EXTENSIONS = { 59 | # 'scrapy.telnet.TelnetConsole': None, 60 | #} 61 | 62 | # Configure item pipelines 63 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 64 | ITEM_PIPELINES = { 65 | 'Project_Quora.pipelines.DBPipeline': 300, 66 | 'Project_Quora.pipelines.JsonWriterPipeline': 800, 67 | } 68 | 69 | # Enable and configure the AutoThrottle extension (disabled by default) 70 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 71 | # NOTE: AutoThrottle will honour the standard settings for concurrency and delay 72 | #AUTOTHROTTLE_ENABLED=True 73 | # The initial download delay 74 | #AUTOTHROTTLE_START_DELAY=5 75 | # The maximum download delay to be set in case of high latencies 76 | #AUTOTHROTTLE_MAX_DELAY=60 77 | # Enable showing throttling stats for every response received: 78 | #AUTOTHROTTLE_DEBUG=False 79 | 80 | # Enable and configure HTTP caching (disabled by default) 81 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 82 | #HTTPCACHE_ENABLED=True 83 | #HTTPCACHE_EXPIRATION_SECS=0 84 | #HTTPCACHE_DIR='httpcache' 85 | #HTTPCACHE_IGNORE_HTTP_CODES=[] 86 | #HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage' 87 | 88 | -------------------------------------------------------------------------------- /quora/Project_Quora/Project_Quora/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of 
your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 |
-------------------------------------------------------------------------------- /quora/Project_Quora/Project_Quora/spiders/quora.py: -------------------------------------------------------------------------------- 1 | from random import randint 2 | import time 3 | import platform 4 | import scrapy 5 | import glob 6 | import html2text 7 | import parsedatetime as pdt 8 | from selenium.webdriver import DesiredCapabilities 9 | from selenium.webdriver.common.by import By 10 | from selenium.webdriver.support.ui import WebDriverWait 11 | from selenium.webdriver.support import expected_conditions as ec 12 | from selenium import webdriver 13 | from selenium.common.exceptions import NoSuchElementException 14 | from selenium.common.exceptions import TimeoutException 15 | import codecs 16 | from ..items import ProjectQuoraItem 17 | import re 18 | from scrapy import signals 19 | from scrapy.xlib.pydispatch import dispatcher 20 | 21 | 22 | class QuoraSpider(scrapy.Spider): 23 | name = "quora" # Name of Spider 24 | allowed_domains = ["quora.com"] 25 | uid = 0 # Id of question-thread 26 | list_topic = [] 27 | database = '' 28 | 29 | # Creation of the list of topics (glob pattern uses the platform's path separator) 30 | if "Windows" == platform.system(): 31 | list_of_files = glob.glob('Topic\*.txt') 32 | else: 33 | list_of_files = glob.glob('Topic/*.txt') 34 | 35 | for filename in list_of_files: 36 | lines = open(filename, "r").readlines() 37 | for line in lines: 38 | list_topic.append("<" + line.rstrip('\n') + ">") 39 | 40 | list_topic = set(list_topic) 41 |
42 | def __init__(self, *args, **kwargs): 43 | super(QuoraSpider, self).__init__(*args, **kwargs) 44 | # Arguments passed through the batch file quora.bat 45 | self.database = kwargs.get('database') + '.pdl' 46 | email = kwargs.get('email') 47 | passw = kwargs.get('password') 48 | 49 | # When the spider quits, the function spider_closed() will be called 50 | dispatcher.connect(self.spider_closed, signals.spider_closed) 51 | 52 | # Opening PhantomJS webdriver with certain settings 53 | options = ['--proxy-type=none', '--load-images=false'] 54 | if platform.system() == "Windows": 55 | self.driver = webdriver.PhantomJS(service_args=options) 56 | else: 57 | self.driver = webdriver.PhantomJS(executable_path='./phantomjs', 58 | service_args=options) 59 | self.driver.set_window_size(1920, 1080) 60 | self.wait = WebDriverWait(self.driver, 60) 61 | 62 | # Access to Quora and Login 63 | self.driver.get("http://www.quora.com/") 64 | self.driver.refresh() 65 | time.sleep(2) 66 | 67 | print ('Login to Quora..') 68 | while True: 69 | # Entering your username and password 70 | form = self.driver.find_element_by_class_name('login') 71 | 72 | username = form.find_element_by_name('email') 73 | username.send_keys(email) 74 | time.sleep(2) 75 | password = form.find_element_by_name('password') 76 | password.send_keys(passw) 77 | 78 | time.sleep(2) 79 | form.find_element_by_xpath( 80 | ".//input[contains(@value, 'Login')]").click() 81 | time.sleep(2) 82 | 83 | try: 84 | if self.driver.find_element_by_css_selector( 85 | 'div[id*="_error"]').is_displayed(): 86 | self.driver.refresh() 87 | print ('Login error. Retry.') 88 | email = raw_input("Insert username: ") 89 | passw = raw_input("Insert password: ") 90 | except NoSuchElementException: 91 | break 92 | 93 | def start_requests(self): 94 | # Request for parsing the '/all_questions' section of a topic 95 | 96 | for filename in self.list_of_files: 97 |
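# Example (illustrative): a topic file such as 'Topic/Computer-Programming.txt'
# is reduced by the replace() calls below to the bare name 'Computer-Programming',
# so the spider requests 'https://www.quora.com/topic/Computer-Programming/all_questions'.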
filename = filename.replace('\\', '') 98 | filename = filename.replace('/', '') 99 | filename = filename.replace('Topic', '') 100 | filename = filename.replace('.txt', '') 101 | yield scrapy.Request('https://www.quora.com/topic/' + 102 | filename + '/all_questions', self.parse) 103 | 104 | def spider_closed(self, spider): 105 | self.driver.close() 106 | 107 | def parse(self, response): 108 | # Opening the '/all-questions' section of a topic 109 | self.driver.get(response.url) 110 | 111 | old_position = self.driver.execute_script( 112 | "return document.body.scrollHeight") 113 | 114 | # Scroll-down with with Selenium 115 | while True: 116 | self.driver.execute_script( 117 | "window.scrollTo(0, document.body.scrollHeight);") 118 | 119 | # Visibility of feedback at the bottom of the page after the scroll 120 | # Wait until is visible 121 | if self.driver.find_element_by_xpath( 122 | '//div[contains(@class,"pager_next")]').is_displayed(): 123 | try: 124 | self.wait.until(ec.invisibility_of_element_located( 125 | (By.CLASS_NAME, "pager_next"))) 126 | except TimeoutException: 127 | self.driver.refresh() 128 | 129 | time.sleep(1) 130 | new_pos = self.driver.execute_script( 131 | "return document.body.scrollHeight") 132 | 133 | # Check the size of the page 134 | # If the dimensions are the same, stop the scroll-down 135 | if new_pos == old_position: 136 | sleep = 0 137 | self.driver.execute_script( 138 | "$('html,body').animate({scrollTop: 0}, 2000);") 139 | time.sleep(randint(4, 9)) 140 | 141 | while self.driver.execute_script( 142 | "return document.body.scrollHeight") == old_position \ 143 | and sleep != 100: 144 | self.driver.execute_script( 145 | "window.scrollTo(0, document.body.scrollHeight);") 146 | time.sleep(1) 147 | sleep += 1 148 | 149 | if sleep == 100: 150 | break 151 | 152 | old_position = self.driver.execute_script( 153 | "return document.body.scrollHeight") 154 | post_elems = self.driver.find_elements_by_class_name( 155 | "pagedlist_item") 156 | print ('Question found: ' + str(len(post_elems))) 157 | 158 | # Extraction of urls questions with selectors 159 | post_elems = self.driver.find_elements_by_class_name("pagedlist_item") 160 | url_list = [] 161 | for post in post_elems: 162 | url_list.append(post.find_element_by_xpath( 163 | './/a[contains(@class,"question_link")]') 164 | .get_attribute('href')) 165 | url_list = set(url_list) 166 | 167 | # Request for parsing the question-thread 168 | for url in url_list: 169 | url_scrapy = response.urljoin(url) 170 | yield scrapy.Request(url_scrapy, callback=self.parse_question) 171 | 172 | def parse_question(self, response): 173 | # Creation of the list of tags of the question 174 | tag_string = "" 175 | tags = response.xpath('//div[contains(@class,' + 176 | '"QuestionTopicHorizontalList TopicList")]' + 177 | '//span[contains(@class,' + 178 | ' "TopicNameSpan TopicName")]/text()').extract() 179 | for tag in tags: 180 | tag_string = tag_string + "<" + tag.encode('utf8') + "> " 181 | 182 | found = False 183 | for topic in self.list_topic: 184 | if topic in tag_string: 185 | found = True 186 | break 187 | ''' 188 | The question will be scanned if it has at least one topic in list_topic 189 | ''' 190 | if found: 191 | # Related questions 192 | url_related = response.xpath('//li[contains(@class,' + 193 | '"related_question")]' + 194 | '//a[contains(@class, ' + 195 | '"question_link")]/@href').extract() 196 | # Request for parsing the related question-threads 197 | for url in url_related: 198 | url_scrapy = response.urljoin(url) 199 | yield 
scrapy.Request(url_scrapy, callback=self.parse_question) 200 | 201 | # Page loading of question-thread 202 | self.driver.get(response.url) 203 | right_content = self.driver. \ 204 | find_element_by_xpath('//div[contains(@class,' + 205 | '"HighlightsSection SimpleToggle ' + 206 | 'Toggle")]') 207 | # Show the content of a Rigth Side bar 208 | try: 209 | if right_content.find_element_by_xpath( 210 | './/span/a[contains(@class,"expand_link")]') \ 211 | .is_displayed(): 212 | 213 | more_btn = right_content.find_element_by_xpath( 214 | './/span/a[contains(@class,"expand_link")]') 215 | 216 | while True: 217 | try: 218 | self.wait.until(ec.element_to_be_clickable( 219 | (By.XPATH, 220 | '//span/a[contains(@class,"expand_link")]'))) 221 | break 222 | except TimeoutException: 223 | self.driver.refresh() 224 | 225 | webdriver.ActionChains(self.driver).move_to_element( 226 | more_btn).click(more_btn).perform() 227 | 228 | self.wait.until(ec.invisibility_of_element_located( 229 | (By.XPATH, more_btn))) 230 | time.sleep(1) 231 | 232 | right_content = self.driver.find_element_by_xpath( 233 | '//div[contains(@class,' + 234 | '"QuestionPageRightLoggedInSidebar")]') 235 | right_content = right_content.find_element_by_css_selector( 236 | 'div[id*="_expanded"]') 237 | 238 | except NoSuchElementException: 239 | right_content = self.driver.find_element_by_xpath( 240 | '//div[contains(@class,' + 241 | '"QuestionPageRightLoggedInSidebar")]') 242 | right_content = right_content.find_element_by_css_selector( 243 | 'div[id*="__truncated"]') 244 | 245 | # Set the properties of Html2text 246 | item_list = [] 247 | h = html2text.HTML2Text() 248 | h.emphasis = True 249 | h.bypass_tables = False 250 | h.ignore_emphasis = False 251 | h.body_width = 0 252 | h.single_line_break = True 253 | h.bypass_tables = False 254 | h.ignore_images = False 255 | h.images_with_size = True 256 | h.inline_links = True 257 | h.protect_links = True 258 | 259 | # Set the properties Parsedatetime 260 | c = pdt.Constants() 261 | c.YearParseStyle = 0 262 | c.DOWParseStyle = 0 263 | c.CurrentDOWParseStyle = True 264 | p = pdt.Calendar(c) 265 | f = '%Y-%m-%d %H:%M:%S' 266 | 267 | self.uid += 1 268 | try: 269 | answers = self.driver.find_elements_by_xpath( 270 | '//div[contains(@class, "Answer AnswerBase")]') 271 | except NoSuchElementException: 272 | answers = [] 273 | 274 | if len(answers) > 0: 275 | old_position = self.driver.execute_script( 276 | "return document.body.scrollHeight") 277 | 278 | # Scroll the page of question-thread 279 | while True: 280 | self.driver.execute_script( 281 | "window.scrollTo(0, document.body.scrollHeight);") 282 | if self.driver.find_element_by_xpath( 283 | '//div[contains(@class,"pager_next")]') \ 284 | .is_displayed(): 285 | try: 286 | self.wait.until(ec.invisibility_of_element_located( 287 | (By.CLASS_NAME, "pager_next"))) 288 | except TimeoutException: 289 | self.driver.refresh() 290 | 291 | time.sleep(1) 292 | new_pos = self.driver.execute_script( 293 | "return document.body.scrollHeight") 294 | if new_pos == old_position: 295 | break 296 | old_position = self.driver.execute_script( 297 | "return document.body.scrollHeight") 298 | 299 | grid = self.driver.find_element_by_class_name('AnswerListDiv') 300 | answers = grid.find_elements_by_xpath( 301 | './/div[contains(@class, "Answer AnswerBase")]') 302 | try: 303 | self.wait.until(ec.invisibility_of_element_located( 304 | (By.CLASS_NAME, "toggled_spinner"))) 305 | except TimeoutException: 306 | pass 307 | time.sleep(0.5) 308 | 309 | # Creation of ITEM QUESTION 
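# Note (illustrative): self.uid was incremented earlier in parse_question, so this
# question item receives an integer id such as '7'; following the uid convention
# documented in items.py, its answers reuse it with a dotted suffix, e.g. '7.1', '7.2'.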
310 | itemquest = ProjectQuoraItem() 311 | question = self.driver.find_element_by_class_name('QuestionArea') 312 | 313 | itemquest['uid'] = str(self.uid) 314 | itemquest['type'] = "question" 315 | try: 316 | author = right_content.find_element_by_xpath( 317 | './/div[contains(@class, "FollowerFacepile clearfix")]' + 318 | '//img[contains(@class, "profile_photo_img")]') 319 | itemquest['author'] = author.get_attribute('alt').encode( 320 | 'utf8', 'ignore') 321 | except NoSuchElementException: 322 | itemquest['author'] = "Anonymous" 323 | pass 324 | 325 | try: 326 | for elem in right_content.find_elements_by_xpath( 327 | './/div[contains(@class, "HighlightRow")]'): 328 | if " View" in elem.text.encode('utf8'): 329 | view = elem.text.encode('utf8') 330 | view = re.match(r'(.*) View.*', view) 331 | itemquest['views'] = int( 332 | view.group(1).replace(',', '')) 333 | except NoSuchElementException: 334 | itemquest['views'] = 0 335 | pass 336 | 337 | try: 338 | date_time = right_content.find_element_by_xpath( 339 | './/div[contains(@class, "HighlightRow AskedRow")]') \ 340 | .text.encode('utf8') 341 | date_time = re.sub(re.compile('Last asked: '), '', date_time) 342 | data_format = p.parseDT(date_time) 343 | itemquest['date_time'] = data_format[0].strftime(f) 344 | except NoSuchElementException: 345 | itemquest['date_time'] = '0000-00-00 00:00:00' 346 | pass 347 | 348 | try: 349 | itemquest['title'] = question.find_element_by_xpath( 350 | './/span[contains(@class, "inline_editor_value")]/h1') \ 351 | .text.encode('utf8', 'ignore') 352 | except NoSuchElementException: 353 | itemquest['title'] = 'null' 354 | pass 355 | 356 | try: 357 | content = question.find_element_by_css_selector( 358 | 'div[id*="full_text"]') 359 | 360 | # Inserting markdown to delimit the code 361 | html_string = content.get_attribute('innerHTML') 362 | html_string = re.sub( 363 | re.compile('
<pre.*?>'), '[code]', 364 | html_string) 365 | html_string = re.sub( 366 | re.compile('</pre>'), '[/code]', 367 | html_string) 368 | html_string = re.sub(r'<code.*?>(.*?)</code>', 369 | r'`\1`', html_string) 370 | html_string = html_string.replace('<code>', '') 371 | html_string = html_string.replace('</code>', '') 372 | html_string = re.sub(r'\[code\](.*?)\[/code\]', r'```\1```', 373 | html_string) 374 | html_string = re.sub(r'