├── .gitignore ├── LICENSE ├── README.md ├── quora ├── Project_Quora │ ├── Project_Quora │ │ ├── __init__.py │ │ ├── items.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── quora.py │ │ │ └── topic │ │ │ ├── Computer-Programming.txt │ │ │ └── topic.py │ └── scrapy.cfg ├── README.md ├── quora.bat ├── requirements.txt └── topic.bat ├── scn ├── README.md ├── RUN.bat ├── discretizer │ ├── RUN.bat │ ├── discretization.py │ └── scn_discretizer.py ├── requirements.txt └── scnscraper │ ├── dataStoring.py │ ├── items.py │ ├── main.py │ └── scraper.py └── yahoo-answers ├── README.md ├── discretizer ├── discretizer.py └── main.py ├── requirements.txt ├── yahooscraper.sh ├── yahooscraper └── yahooscraper │ ├── scrapy.cfg │ └── yahooscraper │ ├── __init__.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ ├── YahooScraper.py │ ├── __init__.py │ └── example_database.pdl ├── yahoourlextractor.sh └── yahoourlextractor ├── YahooUrlSearcher ├── __init__.py ├── items.py ├── pipelines.py ├── settings.py └── spiders │ ├── __init__.py │ └── yahoourlspider.py └── scrapy.cfg /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | .idea/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | .idea/ 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *,cover 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ 60 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Collaborative Development Group 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # qa-scrapers 2 | 3 | A collection of Python scripts that leverage Selenium and/or Scrapy to scrape content from Question Answering sites, such as, and other than, Stack Overflow. 4 | 5 | So far, three scrapers are available: 6 | * [Yahoo! Answers](yahoo-answers/README.md) 7 | * [Quora](quora/README.md) 8 | * [SAP Community Network](scn/README.md) 9 | 10 | Please refer to the README.md files within each subfolder for more details. 11 | 12 | ## Fair use policy 13 | Please cite the following paper if you decide to use these scripts for your own research purposes. 14 | 15 | > F. Calefato, F. Lanubile, N. Novielli. “[Moving to Stack Overflow: Best-Answer Prediction in Legacy Developer Forums](http://collab.di.uniba.it/fabio/wp-content/uploads/sites/5/2014/05/a13-calefato.pdf).” In *Proc. 10th Int'l Symposium on Empirical Softw. Eng. and Measurement (ESEM'16)*, Ciudad Real, Spain, Sept. 8-9, 2016, DOI:[10.1145/2961111.2962585](http://doi.acm.org/10.1145/2961111.2962585). 16 | 17 | ```latex 18 | @inproceedings{calefato_2016_esem, 19 | author = {Calefato, Fabio and Lanubile, Filippo and Novielli, Nicole}, 20 | title = {Moving to Stack Overflow: Best-Answer Prediction in Legacy Developer Forums}, 21 | booktitle = {Proc. of the 10th ACM/IEEE Int'l Symposium on Empirical Software Engineering and Measurement}, 22 | series = {ESEM '16}, 23 | year = {2016}, 24 | isbn = {978-1-4503-4427-2}, 25 | location = {Ciudad Real, Spain}, 26 | pages = {13:1--13:10}, 27 | articleno = {13}, 28 | numpages = {10}, 29 | url = {http://doi.acm.org/10.1145/2961111.2962585}, 30 | doi = {10.1145/2961111.2962585}, 31 | acmid = {2962585}, 32 | publisher = {ACM}, 33 | address = {New York, NY, USA}, 34 | keywords = {Best-answer prediction, Developer forums, Q\&A sites, Stack Overflow}, 35 | } 36 | ``` 37 | 38 | -------------------------------------------------------------------------------- /quora/Project_Quora/Project_Quora/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collab-uniba/qa-scrapers/b26ece3f210d3dcdfd7f2045193e3258cae5b4b4/quora/Project_Quora/Project_Quora/__init__.py -------------------------------------------------------------------------------- /quora/Project_Quora/Project_Quora/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class ProjectQuoraItem(scrapy.Item): 12 | uid = scrapy.Field() # Id of a question (e.g., 1), Id of an answer (e.g., 1.1) 13 | type = scrapy.Field() # question, answer 14 | author = scrapy.Field() # author of a question or an answer 15 | title = scrapy.Field() # title of a question, null for an answer 16 | text = scrapy.Field() # text of a question or an answer 17 | date_time = scrapy.Field() # when a question or an answer was written 18 | tags = scrapy.Field() # topics associated with the question, null for an answer 19 |
views = scrapy.Field() # views of a questions or an answer 20 | answers = scrapy.Field() # number of answers for a question, 0 for answers 21 | resolve = scrapy.Field() # always null 22 | upvotes = scrapy.Field() # likes for a question (null) or an answers 23 | url = scrapy.Field() # url of a question or an answer 24 | -------------------------------------------------------------------------------- /quora/Project_Quora/Project_Quora/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | from pydblite import Base 9 | import os 10 | import json 11 | from scrapy.xlib.pydispatch import dispatcher 12 | from scrapy import signals 13 | 14 | 15 | class DBPipeline(object): 16 | # Pipeline to write an Item in the database 17 | def open_spider(self, spider): 18 | # Creation of DB 19 | self.db = Base(spider.database) 20 | self.db.create('uid', 'type', 'author', 'title', 'text', 'date_time', 21 | 'tags', 'views', 'answers', 'resolve', 'upvotes', 'url', 22 | mode="override") 23 | dispatcher.connect(self.spider_closed, signals.spider_closed) 24 | 25 | def process_item(self, item, spider): 26 | # Writing of the item 27 | self.db.insert(uid=item['uid'], 28 | type=item['type'], 29 | author=item['author'], 30 | title=item['title'], 31 | text=item['text'], 32 | date_time=item['date_time'], 33 | tags=item['tags'], 34 | views=item['views'], 35 | answers=item['answers'], 36 | resolve=item['resolve'], 37 | upvotes=item['upvotes'], 38 | url=item['url'] 39 | ) 40 | 41 | self.db.commit() 42 | return item 43 | 44 | def spider_closed(self, spider): 45 | # Number of items saved, shown at the end 46 | i = 0 47 | j = 0 48 | for r in self.db: 49 | 50 | if r["type"] == "question": 51 | i += 1 52 | else: 53 | j += 1 54 | 55 | print ('Number of questions and answers found:') 56 | print (str(i) + ' questions \n') 57 | print (str(j) + ' answers \n') 58 | 59 | 60 | class JsonWriterPipeline(object): 61 | # Pipeline to write an Item in Json File 62 | def __init__(self): 63 | if os.path.exists('items.json'): 64 | os.remove('items.json') 65 | 66 | self.file = open('items.json', 'wb') 67 | dispatcher.connect(self.spider_closed, signals.spider_closed) 68 | 69 | def process_item(self, item, spider): 70 | line = json.dumps(dict(item)) + "\n" 71 | self.file.write(line) 72 | return item 73 | 74 | def spider_closed(self, spider): 75 | self.file.close() 76 | -------------------------------------------------------------------------------- /quora/Project_Quora/Project_Quora/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for Project_Quora project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'Project_Quora' 13 | 14 | SPIDER_MODULES = ['Project_Quora.spiders'] 15 | NEWSPIDER_MODULE = 'Project_Quora.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'Project_Quora (+http://www.yourdomain.com)' 20 | 21 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 22 | CONCURRENT_REQUESTS=1 23 | 24 | # Configure a delay for requests for the same website (default: 0) 25 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 26 | # See also autothrottle settings and docs 27 | DOWNLOAD_DELAY=0.5 28 | # The download delay setting will honor only one of: 29 | #CONCURRENT_REQUESTS_PER_DOMAIN=16 30 | #CONCURRENT_REQUESTS_PER_IP=16 31 | 32 | # Disable cookies (enabled by default) 33 | COOKIES_ENABLED=True 34 | 35 | # Disable Telnet Console (enabled by default) 36 | #TELNETCONSOLE_ENABLED=False 37 | 38 | # Override the default request headers: 39 | #DEFAULT_REQUEST_HEADERS = { 40 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 41 | # 'Accept-Language': 'en', 42 | #} 43 | 44 | # Enable or disable spider middlewares 45 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 46 | #SPIDER_MIDDLEWARES = { 47 | # 'Project_Quora.middlewares.MyCustomSpiderMiddleware': 543, 48 | #} 49 | 50 | # Enable or disable downloader middlewares 51 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 52 | #DOWNLOADER_MIDDLEWARES = { 53 | # 'Project_Quora.middlewares.MyCustomDownloaderMiddleware': 543, 54 | #} 55 | 56 | # Enable or disable extensions 57 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 58 | #EXTENSIONS = { 59 | # 'scrapy.telnet.TelnetConsole': None, 60 | #} 61 | 62 | # Configure item pipelines 63 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 64 | ITEM_PIPELINES = { 65 | 'Project_Quora.pipelines.DBPipeline': 300, 66 | 'Project_Quora.pipelines.JsonWriterPipeline': 800, 67 | } 68 | 69 | # Enable and configure the AutoThrottle extension (disabled by default) 70 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 71 | # NOTE: AutoThrottle will honour the standard settings for concurrency and delay 72 | #AUTOTHROTTLE_ENABLED=True 73 | # The initial download delay 74 | #AUTOTHROTTLE_START_DELAY=5 75 | # The maximum download delay to be set in case of high latencies 76 | #AUTOTHROTTLE_MAX_DELAY=60 77 | # Enable showing throttling stats for every response received: 78 | #AUTOTHROTTLE_DEBUG=False 79 | 80 | # Enable and configure HTTP caching (disabled by default) 81 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 82 | #HTTPCACHE_ENABLED=True 83 | #HTTPCACHE_EXPIRATION_SECS=0 84 | #HTTPCACHE_DIR='httpcache' 85 | #HTTPCACHE_IGNORE_HTTP_CODES=[] 86 | #HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage' 87 | 88 | -------------------------------------------------------------------------------- /quora/Project_Quora/Project_Quora/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of 
your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /quora/Project_Quora/Project_Quora/spiders/quora.py: -------------------------------------------------------------------------------- 1 | from random import randint 2 | import time 3 | import platform 4 | import scrapy 5 | import glob 6 | import html2text 7 | import parsedatetime as pdt 8 | from selenium.webdriver import DesiredCapabilities 9 | from selenium.webdriver.common.by import By 10 | from selenium.webdriver.support.ui import WebDriverWait 11 | from selenium.webdriver.support import expected_conditions as ec 12 | from selenium import webdriver 13 | from selenium.common.exceptions import NoSuchElementException 14 | from selenium.common.exceptions import TimeoutException 15 | import codecs 16 | from ..items import ProjectQuoraItem 17 | import re 18 | from scrapy import signals 19 | from scrapy.xlib.pydispatch import dispatcher 20 | 21 | 22 | class QuoraSpider(scrapy.Spider): 23 | name = "quora" # Name of Spider 24 | allowed_domains = ["quora.com"] 25 | uid = 0 # Id of question-thread 26 | list_topic = [] 27 | database = '' 28 | 29 | # Creation of the list of topics 30 | if "Windows" == platform.system(): 31 | list_of_files = glob.glob('Topic/*.txt') 32 | else: 33 | list_of_files = glob.glob('Topic\*.txt') 34 | 35 | for filename in list_of_files: 36 | lines = open(filename, "r").readlines() 37 | for line in lines: 38 | list_topic.append("<" + line.rstrip('\n') + ">") 39 | 40 | list_topic = set(list_topic) 41 | 42 | def __init__(self, *args, **kwargs): 43 | super(QuoraSpider, self).__init__(*args, **kwargs) 44 | # Arguments passed through the batch file quora.bat 45 | self.database = kwargs.get('database') + '.pdl' 46 | email = kwargs.get('email') 47 | passw = kwargs.get('password') 48 | 49 | # When Spider quits will call the function spider_closed() 50 | dispatcher.connect(self.spider_closed, signals.spider_closed) 51 | 52 | # Opening PhantomJS webdriver with certain settings 53 | options = ['--proxy-type=none', '--load-images=false'] 54 | if platform.system() == "Windows": 55 | self.driver = webdriver.PhantomJS(service_args=options) 56 | else: 57 | self.driver = webdriver.PhantomJS(executable_path='./phantomjs', 58 | service_args=options) 59 | self.driver.set_window_size(1920, 1080) 60 | self.wait = WebDriverWait(self.driver, 60) 61 | 62 | # Access to Quora and Login 63 | self.driver.get("http://www.quora.com/") 64 | self.driver.refresh() 65 | time.sleep(2) 66 | 67 | print ('Login to Quora..') 68 | while True: 69 | # Entering your username and password 70 | form = self.driver.find_element_by_class_name('login') 71 | 72 | username = form.find_element_by_name('email') 73 | username.send_keys(email) 74 | time.sleep(2) 75 | password = form.find_element_by_name('password') 76 | password.send_keys(passw) 77 | 78 | time.sleep(2) 79 | form.find_element_by_xpath( 80 | ".//input[contains(@value, 'Login')]").click() 81 | time.sleep(2) 82 | 83 | try: 84 | if self.driver.find_element_by_css_selector( 85 | 'div[id*="_error"]').is_displayed(): 86 | self.driver.refresh() 87 | print ('Login Error.Retry') 88 | email = raw_input("Insert username: ") 89 | passw = raw_input("Insert password: ") 90 | except NoSuchElementException: 91 | break 92 | 93 | def start_requests(self): 94 | # Request for parsing the '/all-questions' section of a topic 95 | 96 | for filename in self.list_of_files: 97 | 
filename = filename.replace('\\', '') 98 | filename = filename.replace('/', '') 99 | filename = filename.replace('Topic', '') 100 | filename = filename.replace('.txt', '') 101 | yield scrapy.Request('https://www.quora.com/topic/' + 102 | filename + '/all_questions', self.parse) 103 | 104 | def spider_closed(self, spider): 105 | self.driver.close() 106 | 107 | def parse(self, response): 108 | # Opening the '/all-questions' section of a topic 109 | self.driver.get(response.url) 110 | 111 | old_position = self.driver.execute_script( 112 | "return document.body.scrollHeight") 113 | 114 | # Scroll-down with with Selenium 115 | while True: 116 | self.driver.execute_script( 117 | "window.scrollTo(0, document.body.scrollHeight);") 118 | 119 | # Visibility of feedback at the bottom of the page after the scroll 120 | # Wait until is visible 121 | if self.driver.find_element_by_xpath( 122 | '//div[contains(@class,"pager_next")]').is_displayed(): 123 | try: 124 | self.wait.until(ec.invisibility_of_element_located( 125 | (By.CLASS_NAME, "pager_next"))) 126 | except TimeoutException: 127 | self.driver.refresh() 128 | 129 | time.sleep(1) 130 | new_pos = self.driver.execute_script( 131 | "return document.body.scrollHeight") 132 | 133 | # Check the size of the page 134 | # If the dimensions are the same, stop the scroll-down 135 | if new_pos == old_position: 136 | sleep = 0 137 | self.driver.execute_script( 138 | "$('html,body').animate({scrollTop: 0}, 2000);") 139 | time.sleep(randint(4, 9)) 140 | 141 | while self.driver.execute_script( 142 | "return document.body.scrollHeight") == old_position \ 143 | and sleep != 100: 144 | self.driver.execute_script( 145 | "window.scrollTo(0, document.body.scrollHeight);") 146 | time.sleep(1) 147 | sleep += 1 148 | 149 | if sleep == 100: 150 | break 151 | 152 | old_position = self.driver.execute_script( 153 | "return document.body.scrollHeight") 154 | post_elems = self.driver.find_elements_by_class_name( 155 | "pagedlist_item") 156 | print ('Question found: ' + str(len(post_elems))) 157 | 158 | # Extraction of urls questions with selectors 159 | post_elems = self.driver.find_elements_by_class_name("pagedlist_item") 160 | url_list = [] 161 | for post in post_elems: 162 | url_list.append(post.find_element_by_xpath( 163 | './/a[contains(@class,"question_link")]') 164 | .get_attribute('href')) 165 | url_list = set(url_list) 166 | 167 | # Request for parsing the question-thread 168 | for url in url_list: 169 | url_scrapy = response.urljoin(url) 170 | yield scrapy.Request(url_scrapy, callback=self.parse_question) 171 | 172 | def parse_question(self, response): 173 | # Creation of the list of tags of the question 174 | tag_string = "" 175 | tags = response.xpath('//div[contains(@class,' + 176 | '"QuestionTopicHorizontalList TopicList")]' + 177 | '//span[contains(@class,' + 178 | ' "TopicNameSpan TopicName")]/text()').extract() 179 | for tag in tags: 180 | tag_string = tag_string + "<" + tag.encode('utf8') + "> " 181 | 182 | found = False 183 | for topic in self.list_topic: 184 | if topic in tag_string: 185 | found = True 186 | break 187 | ''' 188 | The question will be scanned if it has at least one topic in list_topic 189 | ''' 190 | if found: 191 | # Related questions 192 | url_related = response.xpath('//li[contains(@class,' + 193 | '"related_question")]' + 194 | '//a[contains(@class, ' + 195 | '"question_link")]/@href').extract() 196 | # Request for parsing the related question-threads 197 | for url in url_related: 198 | url_scrapy = response.urljoin(url) 199 | yield 
scrapy.Request(url_scrapy, callback=self.parse_question) 200 | 201 | # Page loading of question-thread 202 | self.driver.get(response.url) 203 | right_content = self.driver. \ 204 | find_element_by_xpath('//div[contains(@class,' + 205 | '"HighlightsSection SimpleToggle ' + 206 | 'Toggle")]') 207 | # Show the content of a Rigth Side bar 208 | try: 209 | if right_content.find_element_by_xpath( 210 | './/span/a[contains(@class,"expand_link")]') \ 211 | .is_displayed(): 212 | 213 | more_btn = right_content.find_element_by_xpath( 214 | './/span/a[contains(@class,"expand_link")]') 215 | 216 | while True: 217 | try: 218 | self.wait.until(ec.element_to_be_clickable( 219 | (By.XPATH, 220 | '//span/a[contains(@class,"expand_link")]'))) 221 | break 222 | except TimeoutException: 223 | self.driver.refresh() 224 | 225 | webdriver.ActionChains(self.driver).move_to_element( 226 | more_btn).click(more_btn).perform() 227 | 228 | self.wait.until(ec.invisibility_of_element_located( 229 | (By.XPATH, more_btn))) 230 | time.sleep(1) 231 | 232 | right_content = self.driver.find_element_by_xpath( 233 | '//div[contains(@class,' + 234 | '"QuestionPageRightLoggedInSidebar")]') 235 | right_content = right_content.find_element_by_css_selector( 236 | 'div[id*="_expanded"]') 237 | 238 | except NoSuchElementException: 239 | right_content = self.driver.find_element_by_xpath( 240 | '//div[contains(@class,' + 241 | '"QuestionPageRightLoggedInSidebar")]') 242 | right_content = right_content.find_element_by_css_selector( 243 | 'div[id*="__truncated"]') 244 | 245 | # Set the properties of Html2text 246 | item_list = [] 247 | h = html2text.HTML2Text() 248 | h.emphasis = True 249 | h.bypass_tables = False 250 | h.ignore_emphasis = False 251 | h.body_width = 0 252 | h.single_line_break = True 253 | h.bypass_tables = False 254 | h.ignore_images = False 255 | h.images_with_size = True 256 | h.inline_links = True 257 | h.protect_links = True 258 | 259 | # Set the properties Parsedatetime 260 | c = pdt.Constants() 261 | c.YearParseStyle = 0 262 | c.DOWParseStyle = 0 263 | c.CurrentDOWParseStyle = True 264 | p = pdt.Calendar(c) 265 | f = '%Y-%m-%d %H:%M:%S' 266 | 267 | self.uid += 1 268 | try: 269 | answers = self.driver.find_elements_by_xpath( 270 | '//div[contains(@class, "Answer AnswerBase")]') 271 | except NoSuchElementException: 272 | answers = [] 273 | 274 | if len(answers) > 0: 275 | old_position = self.driver.execute_script( 276 | "return document.body.scrollHeight") 277 | 278 | # Scroll the page of question-thread 279 | while True: 280 | self.driver.execute_script( 281 | "window.scrollTo(0, document.body.scrollHeight);") 282 | if self.driver.find_element_by_xpath( 283 | '//div[contains(@class,"pager_next")]') \ 284 | .is_displayed(): 285 | try: 286 | self.wait.until(ec.invisibility_of_element_located( 287 | (By.CLASS_NAME, "pager_next"))) 288 | except TimeoutException: 289 | self.driver.refresh() 290 | 291 | time.sleep(1) 292 | new_pos = self.driver.execute_script( 293 | "return document.body.scrollHeight") 294 | if new_pos == old_position: 295 | break 296 | old_position = self.driver.execute_script( 297 | "return document.body.scrollHeight") 298 | 299 | grid = self.driver.find_element_by_class_name('AnswerListDiv') 300 | answers = grid.find_elements_by_xpath( 301 | './/div[contains(@class, "Answer AnswerBase")]') 302 | try: 303 | self.wait.until(ec.invisibility_of_element_located( 304 | (By.CLASS_NAME, "toggled_spinner"))) 305 | except TimeoutException: 306 | pass 307 | time.sleep(0.5) 308 | 309 | # Creation of ITEM QUESTION 
310 | itemquest = ProjectQuoraItem() 311 | question = self.driver.find_element_by_class_name('QuestionArea') 312 | 313 | itemquest['uid'] = str(self.uid) 314 | itemquest['type'] = "question" 315 | try: 316 | author = right_content.find_element_by_xpath( 317 | './/div[contains(@class, "FollowerFacepile clearfix")]' + 318 | '//img[contains(@class, "profile_photo_img")]') 319 | itemquest['author'] = author.get_attribute('alt').encode( 320 | 'utf8', 'ignore') 321 | except NoSuchElementException: 322 | itemquest['author'] = "Anonymous" 323 | pass 324 | 325 | try: 326 | for elem in right_content.find_elements_by_xpath( 327 | './/div[contains(@class, "HighlightRow")]'): 328 | if " View" in elem.text.encode('utf8'): 329 | view = elem.text.encode('utf8') 330 | view = re.match(r'(.*) View.*', view) 331 | itemquest['views'] = int( 332 | view.group(1).replace(',', '')) 333 | except NoSuchElementException: 334 | itemquest['views'] = 0 335 | pass 336 | 337 | try: 338 | date_time = right_content.find_element_by_xpath( 339 | './/div[contains(@class, "HighlightRow AskedRow")]') \ 340 | .text.encode('utf8') 341 | date_time = re.sub(re.compile('Last asked: '), '', date_time) 342 | data_format = p.parseDT(date_time) 343 | itemquest['date_time'] = data_format[0].strftime(f) 344 | except NoSuchElementException: 345 | itemquest['date_time'] = '0000-00-00 00:00:00' 346 | pass 347 | 348 | try: 349 | itemquest['title'] = question.find_element_by_xpath( 350 | './/span[contains(@class, "inline_editor_value")]/h1') \ 351 | .text.encode('utf8', 'ignore') 352 | except NoSuchElementException: 353 | itemquest['title'] = 'null' 354 | pass 355 | 356 | try: 357 | content = question.find_element_by_css_selector( 358 | 'div[id*="full_text"]') 359 | 360 | # Inserting markdown to delimit the code 361 | html_string = content.get_attribute('innerHTML') 362 | html_string = re.sub( 363 | re.compile('.*?', re.DOTALL), '', 364 | html_string) 365 | html_string = re.sub(r'
<pre>    (.*?)</pre>', 366 | r'```\1```', html_string) 367 | html_string = re.sub( 368 | r'<code>(.*?)</code>', 369 | r'`\1`', html_string) 370 | html_string = html_string.replace('<br>', '') 371 | html_string = html_string.replace('<br/>', '') 372 | html_string = re.sub(r'\[code\](.*?)\[/code\]', r'```\1```', 373 | html_string) 374 | html_string = re.sub(r'<pre>(.*?)</pre>', 375 | r'```\1```', html_string) 376 | html_string = re.sub( 377 | r'<code>(.*?)</code>
', 378 | r'`\1`', html_string) 379 | 380 | if (h.handle(html_string) != '\n\n' or 381 | h.handle(html_string != '\n')): 382 | itemquest['text'] = h.handle(html_string) \ 383 | .encode('utf8', 'ignore') 384 | else: 385 | itemquest['text'] = 'null' 386 | except NoSuchElementException: 387 | itemquest['text'] = 'null' 388 | pass 389 | 390 | itemquest['tags'] = tag_string.encode('utf8') 391 | itemquest['answers'] = len(answers) 392 | itemquest['resolve'] = 'null' 393 | itemquest['upvotes'] = 0 394 | itemquest['url'] = response.url 395 | 396 | item_list.append(itemquest) 397 | 398 | # Creation of N-ITEM ANSWER 399 | if len(answers) > 0: 400 | i = 1 401 | for ans in answers: 402 | itemans = ProjectQuoraItem() 403 | itemans['uid'] = str(self.uid) + "." + str(i) 404 | itemans['type'] = "answer" 405 | 406 | try: 407 | itemans['author'] = ans.find_element_by_xpath( 408 | './/img[contains(@class, "profile_photo_img")]') \ 409 | .get_attribute('alt').encode('utf8', 'ignore') 410 | except NoSuchElementException: 411 | itemans['author'] = "Anonymous" 412 | pass 413 | 414 | itemans['title'] = 'null' 415 | 416 | try: 417 | if ans.find_element_by_xpath( 418 | './/a[contains(@class, "more_link")]') \ 419 | .is_displayed(): 420 | more = ans.find_element_by_xpath( 421 | './/a[contains(@class, "more_link")]') 422 | self.driver.execute_script( 423 | "arguments[0].scrollIntoView(true);", more) 424 | self.driver.execute_script( 425 | "window.scrollBy(0,-250);") 426 | 427 | webdriver.ActionChains(self.driver) \ 428 | .move_to_element(more) \ 429 | .click(more).perform() 430 | 431 | self.wait.until(ec.invisibility_of_element_located( 432 | (By.CLASS_NAME, 'loading'))) 433 | time.sleep(1) 434 | except NoSuchElementException: 435 | pass 436 | 437 | try: 438 | content = ans.find_element_by_class_name( 439 | 'inline_editor_value') 440 | 441 | # Inserting markdown to delimit the code 442 | html_string = content.get_attribute('innerHTML') 443 | html_string = re.sub(re.compile( 444 | '
.*?
', 445 | re.DOTALL), '', html_string) 446 | html_string = re.sub( 447 | re.compile('.*?', 448 | re.DOTALL), '', html_string) 449 | html_string = re.sub(re.compile( 450 | '
', 451 | re.DOTALL), '', html_string) 452 | html_string = re.sub( 453 | '', 454 | '', html_string) 455 | html_string = re.sub( 456 | r'
<pre>    (.*?)</pre>', 457 | r'```\1```', html_string) 458 | html_string = re.sub( 459 | r'<code>(.*?)</code>', 460 | r'`\1`', html_string) 461 | html_string = html_string.replace('<br>', '') 462 | html_string = html_string.replace('<br/>', '') 463 | html_string = re.sub(r'\[code\](.*?)\[/code\]', 464 | r'```\1```', html_string) 465 | html_string = re.sub(r'<pre>(.*?)</pre>', 466 | r'```\1```', html_string) 467 | html_string = re.sub( 468 | r'<code>(.*?)</code>
', 469 | r'`\1`', html_string) 470 | 471 | itemans['text'] = h.handle(html_string). \ 472 | encode('utf8', 'ignore') 473 | except NoSuchElementException: 474 | itemans['text'] = 'null' 475 | pass 476 | 477 | try: 478 | date_time = content.find_element_by_class_name( 479 | 'answer_permalink').text.encode('utf8') 480 | date_time = re.sub(re.compile('Written '), '', 481 | date_time) 482 | date_time = re.sub(re.compile('Updated '), '', 483 | date_time) 484 | data_format = p.parseDT(date_time) 485 | itemans['date_time'] = data_format[0].strftime(f) 486 | except NoSuchElementException: 487 | itemans['date_time'] = '0000-00-00 00:00:00' 488 | pass 489 | 490 | itemans['tags'] = 'null' 491 | views = ans.find_element_by_class_name( 492 | 'CredibilityFact').text.encode('utf8') 493 | 494 | try: 495 | if 'k' in views: 496 | match = re.search(r'(.*?)k Views', views) 497 | views = int(float(match.group(1)) * 1000) 498 | else: 499 | match = re.search(r'(.*?) Views', views) 500 | views = int(match.group(1)) 501 | except AttributeError: 502 | views = 0 503 | pass 504 | 505 | itemans['views'] = views 506 | itemans['answers'] = 0 507 | itemans['resolve'] = 'null' 508 | 509 | upvotes = ans.find_element_by_xpath( 510 | './/div[contains(@class,"action_bar_inner")]' + 511 | '/span/a/span[2]').text.encode('utf8') 512 | 513 | if len(upvotes) > 0: 514 | if 'k' in upvotes: 515 | upvotes = re.sub(re.compile('k'), '', upvotes) 516 | upvotes = int(float(upvotes) * 1000) 517 | itemans['upvotes'] = upvotes 518 | else: 519 | itemans['upvotes'] = int(upvotes) 520 | else: 521 | itemans['upvotes'] = 0 522 | 523 | itemans['url'] = ans.find_element_by_class_name( 524 | 'answer_permalink').get_attribute('href') \ 525 | .encode('utf8') 526 | 527 | i += 1 528 | item_list.append(itemans) 529 | 530 | # Release of the items instantiated 531 | for item in item_list: 532 | yield item 533 | print "\n" 534 | -------------------------------------------------------------------------------- /quora/Project_Quora/Project_Quora/spiders/topic/Computer-Programming.txt: -------------------------------------------------------------------------------- 1 | npm (package manager) 2 | GPU Computation 3 | Ruby on Rails (web framework) 4 | contentEditable 5 | Glasgow Haskell Compiler 6 | Object-Oriented Software Construction 7 | Practice of Computer Programming 8 | Specific Projects Using Clojure 9 | Browser Compatibility 10 | Mercurial (software) 11 | Smalltalk (programming language) 12 | Learning C++ 13 | CSS Rotate 14 | JavaScript Minification 15 | Iframes 16 | Facebook Bots 17 | PySparse 18 | Fluid Layout 19 | Psyco (Python compiler) 20 | Appectual IT Solutions 21 | OpenGL 22 | HipHop for PHP 23 | USA Computing Olympiad (USACO) 24 | Meteor (Javascript platform) 25 | JavaScript Libraries 26 | PHP Performance 27 | Learning Ruby 28 | Jackson JSON Processor 29 | DirectCompute 30 | Capybara (testing framework) 31 | Python 3 32 | Debugging 33 | ECMAScript 3 34 | ECMAScript 2 35 | ECMAScript 1 36 | ECMAScript 7 37 | ECMAScript 6 38 | ECMAScript 5 39 | Sencha Touch 40 | JIT 41 | Groovy (programming language) 42 | Loop (programming) 43 | HBase 44 | Akka 45 | Learning BASIC 46 | JSON-LD 47 | Sample Code 48 | DataMapper 49 | Java Mobile Apps 50 | Hydra 51 | Pure (Programming Language) 52 | URL Rewriting 53 | Spring Framework 54 | Scalaz 55 | Nitro (JavaScript engine) 56 | Recurse Center 57 | Web Application Frameworks 58 | Programming JavaScript Applications (2014 book) 59 | Codeforces 60 | Backbone.js 61 | Blockly 62 | JavaScript Application Development 63 | 
ECMAScript Proxies 64 | Python Programming 65 | GitHub Student Developer Pack 66 | Django 1.4 67 | EventMachine 68 | Learning Processing 69 | ECMAScript 70 | SpiderMonkey (JavaScript engine) 71 | Pyramid (web framework) 72 | NowJS 73 | Head First JavaScript Programming (2014 book) 74 | GNU grep 75 | AMQP 76 | Programming Bootcamps 77 | Quixey Challenge 78 | Web Scraping 79 | Berkeley DB 80 | Ruby Blocks 81 | HTML 82 | Cascalog 83 | Twilio 84 | Serialization 85 | CouchApps 86 | Threading in Python 87 | NOLOH 88 | Modula-3 89 | YUI (Yahoo! User Interface) Library 90 | Fabric (software) 91 | Microdata 92 | Competitive Programmers 93 | Haskell in Industry 94 | Node.js Modules 95 | jclouds 96 | HTML5 Canvas Element 97 | Titan (graph database) 98 | Brian Bi 99 | Brogrammers 100 | Apache 2.0 License 101 | Esotech 102 | Bit Manipulations 103 | DirectX 104 | Verilog 105 | Regular Expressions in Python 106 | Cucumber (BDD framework) 107 | double (data type) 108 | MetroTwit 109 | Hour of Code 2013 110 | YourKit 111 | Orange (Python library) 112 | Python (programming language) 113 | Scaloid (library) 114 | WHATWG 115 | Scratch (programming language) 116 | Web Development Comparison 117 | Twilio Apps 118 | Semantic MediaWiki 119 | TopCoder 120 | D3.js (JavaScript library) 121 | C (programming language) 122 | APL (programming language) 123 | Knockout (JavaScript framework) 124 | Scripting Languages 125 | WSGI Middleware 126 | Pastek 127 | Fast Inverse Square Root 128 | Yii 129 | Computer Programmers 130 | Training for Competitive Programming 131 | Music APIs 132 | MashupXFeed 133 | Dnode 134 | PyCascading 135 | CherryPy 136 | Parallel Patterns Library (Visual C++) 137 | DirectX 11 138 | YUIDoc 139 | Carakan (JavaScript engine) 140 | Language-Specific Cloud APIs 141 | Codecademy 142 | Programming Libraries 143 | JSP 144 | AngularJS 145 | Emacs Lisp 146 | Rexster (Tinkerpop) 147 | Prolog 148 | ECMAScript Implementations 149 | Erlang (programming language) 150 | Indonesia's Selection Process for IOI 151 | Google Scholar API 152 | Mashery 153 | Flatiron School 154 | Xoops 155 | ASP.NET 156 | Static Code Analysis 157 | JavaScript Engines 158 | API Management 159 | Tawesoft 160 | Tomorrow People (company) 161 | Learning COBOL 162 | Rhino (JavaScript) 163 | C++11 (programming language) 164 | ABAP 165 | Microformats 166 | Yukihiro Matsumoto 167 | WebLogic 168 | Learning to Program 169 | JSONP 170 | Neo4j 171 | Major Concepts in Programming Languages 172 | ScriptRock 173 | PyQt 174 | Objective-C (programming language) 175 | fmdb 176 | enStratus API 177 | HTML5 Mobile 178 | XHP 179 | Programming Advice 180 | Online Judges 181 | Processing.js 182 | Twitter Streaming API 183 | Boot Loaders 184 | SWIG (software) 185 | Regular Expressions (computing) 186 | Groovy Frameworks 187 | lxml 188 | Web Programming Languages 189 | Twitter OAuth 190 | Apache Qpid 191 | Suggestions for an Ideal Website 192 | Sexism and Turmoil at GitHub (March 2014) 193 | Github Corporate Affairs 194 | MailChimp (product) 195 | Software Transactional Memory 196 | PHP Developers 197 | Google+ API 198 | vCloud 199 | Learning Java 200 | Prototype (framework) 201 | Principles of Object-Oriented Programming in JavaScript (2014 book) 202 | ACM-ICPC 203 | StackBlaze 204 | Beautiful Soup 205 | Hack Reactor 206 | Regular Expressions in JavaScrpt 207 | Semantics (computer science) 208 | Investing in Github 209 | ECMAScript 4 210 | RequireJS 211 | WebSockets 212 | LuaJIT 213 | CSS3 214 | Python 2.6 215 | Python 2.7 216 | Git (revision control) 217 | Dirty 
Checking (programming) 218 | CSS Shadows 219 | Programming Interview Questions 220 | MadMimi 221 | JavaScript Application Design (2015 book) 222 | Dojo (JavaScript toolkit) 223 | Zope Object Database 224 | cascading.jruby 225 | MS Access VBA 226 | Rails 3.1 227 | Pearson APIs 228 | Jinja 229 | TurboGears 230 | ECMAScript 5.1 231 | Capistrano 232 | HTML5 Video 233 | Microsoft Application Programming Interface 234 | Using JavaScript with .NET 235 | Google Programming Contest 236 | How to Code X 237 | Netflix API 238 | Learning HTML 239 | Io (programming language) 240 | Test::Unit 241 | Ruby (programming language) 242 | Toronto SEO 243 | Lua (programming language) 244 | Drupal Commerce 245 | Java Specification Request 246 | Browser-based Games 247 | Web Application Architecture 248 | CodeEval 249 | Scikits 250 | Media Queries 251 | Object Oriented Data Technology 252 | Facebook and HTML5 253 | XSL FO 254 | Silverlight 255 | OpenCL 256 | Github Products and Services 257 | C-Based Programming Languages 258 | PhantomJS 259 | Ext JS 260 | Qubole 261 | Dylan (programming language) 262 | Functional Programming in Industry 263 | Competitive Programming 264 | WAMP 265 | Needlebase 266 | IronPython 267 | Autohotkey 268 | Topic Maps 269 | Programming Bootcamps in Boston 270 | Object (programming concept) 271 | Zotonic 272 | JavaScript (programming language) 273 | Zend Framework 274 | SimpleXML 275 | Kohana 276 | Flickr API 277 | Cloud9 IDE 278 | Webmachine 279 | Parsing (computer science) 280 | Learn You a Haskell (2011 book) 281 | jQuery 282 | Indonesia's National Olympiad in Informatics (OSN Informatika) 283 | Web Development Companies 284 | Velocity (JavaScript Animation Frame... 285 | ClojureScript 286 | Learning Swift 287 | eZ Teamroom 288 | App Academy 289 | Code Year 290 | Bottle (web framework) 291 | Clojure (programming language) 292 | Arrays (programming) 293 | Redev 294 | Play Framework 295 | SGML 296 | Software Libraries 297 | Ramaze 298 | WebSphere MQ 299 | Codecademy JavaScript Exercises 300 | Java Frameworks 301 | Method (computer programming) 302 | GitHub Issues 303 | Schizophrenia (programming concept) 304 | HTML5 WYSIWYG Editors 305 | Scraping Technology 306 | uWSGI 307 | Source Code 308 | ICEfaces 309 | Application Binary Interface 310 | Cassandra (database) 311 | ECMAScript Features 312 | .NET Framework 313 | LAMP (software bundle) 314 | Learning SQL 315 | Inheritance (Programming concept) 316 | Future of Web Development 317 | Mod_rewrite 318 | Twisted (software) 319 | Visual Basic 320 | Context.IO 321 | Elegant Code 322 | Gems (Ruby) 323 | eZ Find 324 | Cloud APIs 325 | Functional Programming 326 | Semantic HTML 327 | Alternatives to Twilio 328 | Nexmo API 329 | Delphi (Programming Language) Forums 330 | Hibernate (Java) 331 | Newbox Solutions 332 | Sammy 333 | WS-Factory 334 | Terrastore 335 | IDLs 336 | jQuery Mobile 337 | Siri API 338 | Testtopicforcodeblocks 339 | Pinax 340 | Scala Actors 341 | Learning C# 342 | ASP.NET MVC 343 | Preparing for International Olympiad in Informatics 344 | Quadrax (Tetris clone) 345 | eZ Community 346 | Tiny Frameworks 347 | Arduino and Processing 348 | Testing Frameworks 349 | Distributed Revision Control Systems 350 | Chakra (JavaScript engine) 351 | X10 (programming language) 352 | Ruby Koans 353 | Compojure 354 | Perl 5 355 | Responsive HTML5 Web Templates 356 | Markup Languages 357 | Mutexes 358 | Tumult, Inc. 
359 | F# (programming language) 360 | WebCreators.in 361 | Pointers (computer programming) 362 | SQLAlchemy 363 | Pylons (web framework) 364 | PHP (programming language) 365 | JRuby 366 | Commerce Kickstart 367 | Outlook VBA 368 | Mutual Exclusion (software) 369 | MusicBrainz 370 | Specific Countries' Selection Process for IOI 371 | Sequel (software) 372 | Paperclip Rails 373 | 140 Proof 374 | Go (programming language) 375 | Indexer (programming) 376 | HTML Email 377 | Sign in with Twitter 378 | Sinatra (software) 379 | github3.py 380 | Cake Software Foundation 381 | Google APIs 382 | Apache Thrift 383 | JavaScript Books 384 | Monads 385 | RubyMotion 386 | ActiveRecord 387 | Libcloud 388 | Open Graph 389 | CasperJS 390 | SFINAE (Substitution Failure is not an Error) 391 | Ruby vs. Groovy 392 | V8 (JavaScript engine) 393 | FP Complete 394 | Qt (framework) 395 | TestNG 396 | Subversion 397 | Browser Cookies 398 | D (programming language) 399 | Brogramming 400 | Web Architects 401 | Meta Tags 402 | Semantic Advertising 403 | Gremlin 404 | PeakStream 405 | Web Development Educational Resources 406 | Techtic Solutions 407 | CSS3 Animations 408 | Dasein Cloud 409 | JSLint 410 | Dynamic Code Analysis 411 | ECMAScript Classes 412 | Code Composer Studio (CCS) 413 | Dapper 414 | Gensim 415 | Learning Scala 416 | LayerVault 417 | Amiral Agence Web 418 | LLVM 419 | Visual Basic for Applications (VBA) 420 | GitHub Pages 421 | gitignore 422 | ELF 423 | Unladen Swallow 424 | EJB 425 | Building Social Networking Sites 426 | Syllabontes 427 | Erepublik 428 | Celery (distributed task queue) 429 | Zope 430 | RPython 431 | Real World Haskell (2008 book) 432 | Programming Frameworks 433 | C vs. C++ 434 | Web Consultants 435 | Compute Unified Device Architecture (CUDA) 436 | char (data type) 437 | GitHub Raises Venture Capital (July 2012) 438 | Avro (software) 439 | CSS Sprites 440 | Twitter API 441 | Chrome Frame 442 | R versus Python 443 | Crocodoc 444 | Type Theory 445 | Learning Delphi 446 | Claim Soluciones 447 | WebPageTest 448 | Google Earth API 449 | Java Libraries 450 | Lift (web framework) 451 | Racket (programming language) 452 | Coding Conventions 453 | Java Developers 454 | GCC (compiler) 455 | Mobile Recharge API 456 | Assembly Language 457 | Node.js Web Frameworks 458 | web.py 459 | JavaScript Frameworks 460 | Java Platform, Enterprise Edition 461 | Functional Programming in Scala (2014 book) 462 | Lisp (programming language) 463 | TypeScript 464 | Dbpedia 465 | Web Testing Framework 466 | string (data type) 467 | 2600hz 468 | ZeroMQ 469 | PHP Frameworks 470 | Programming Syntax 471 | Compiler Optimization 472 | FORTRAN (programming language) 473 | Grok 474 | Pinterest API 475 | BigDecimal 476 | Native Extensions for Microsoft Silverlight 477 | Datomic 478 | Moonstalk 479 | Object Inheritance 480 | Matplotlib 481 | Routes (Software) 482 | Opa 483 | QBasic 484 | Common Data Types in Computer Programming 485 | Typica 486 | Kivy 487 | Agda 488 | Netduino 489 | Groupcache 490 | MongoKit 491 | Google Maps API 492 | Learning Lisp 493 | China's Selection Process for IOI 494 | LOGO (programming language) 495 | Lithium Framework 496 | Major Concepts in Computer Programming 497 | Processing (programming language) 498 | India's Selection Process for IOI 499 | OCaml (programming language) 500 | RabbitMQ 501 | Mirah 502 | Dryad 503 | XHTML 504 | Web Development on Mac OS X 505 | Breezi 506 | Cramp 507 | Bluestar Applications 508 | Types of Computer Programming 509 | Uber API 510 | C++ (programming language) 
511 | Computer Programming 512 | Protocol Buffers 513 | Dart (programming language) 514 | SymPy 515 | Mecury (programming language) 516 | nginx 517 | Git Merge Tools 518 | Syntactic Sugar (programming) 519 | Chess Programming 520 | Stackless Python 521 | Sphinx (Python documentation generator) 522 | Python Versions 523 | Void Pointer 524 | APIs, How To 525 | Hyperlinks 526 | Java Specifications 527 | Ruby on Rails Professionals 528 | SproutCore 529 | jemalloc 530 | CodeChef 531 | Bulbs (programming library) 532 | Ruby on Rails Plugins 533 | HTML5 Document Viewer 534 | Revision Control Systems 535 | Apple Swift (programming language) 536 | RESTful APIs 537 | CoffeeScript 538 | Open APIs 539 | Attract Group 540 | Foursquare API 541 | Hour of Code 542 | Table Tags 543 | CouchDB 544 | Criticism of PHP 545 | Selenium (testing framework) 546 | WebFont Loader 547 | 3scale 548 | CloudStack API 549 | Prototypal Inheritance 550 | Web APIs 551 | Ruby 1.9 552 | Memoization 553 | GitEnterprise 554 | Online Programming Bootcamps 555 | Read-Eval-Print Loops 556 | DreamFace Interactive 557 | Alembic 558 | Programming Language Design 559 | Pandas (Python Library) 560 | Java Applets 561 | Zero-day Attacks 562 | Anti-Patterns 563 | Windows Presentation Foundation 564 | Rubber Duck Debugging 565 | Webix 566 | Programming Language Adoption 567 | SML/NJ 568 | Ruby on Rails 4 569 | eZ Publish 570 | Functional Programming Principles in Scala (Coursera course) 571 | D.Labs 572 | WOEID 573 | LaCroix Design Company 574 | MooTools 575 | WSGI 576 | This (programming concept) 577 | Intel Acquires Mashery (April 2013) 578 | Tag Management 579 | Pygame 580 | OpenGL ES 2.0 581 | Adodb 582 | Eiffel (programming language) 583 | jQuery Plugins 584 | GitHub 585 | Ember.js 586 | Node.io 587 | list (data type) 588 | humans.txt 589 | BASIC (programming language) 590 | Wand (ImageMagick binding) 591 | JSON 592 | Twilio Revenue 593 | Gennady Korotkevich (competitive programmer) 594 | Enums 595 | XML 596 | Starter League 597 | Java Virtual Machine (JVM) 598 | Learning Perl 599 | OpenGL on iOS 600 | Garbage Collection (programming) 601 | HotSpot (JVM) 602 | Linus Torvalds 603 | Typesafe (company) 604 | Web IM 605 | Qt Quick 606 | FDT 607 | Scalding 608 | Tornado (web framework) 609 | Flask (Python framework) 610 | Programming for Kids 611 | CSS Frameworks 612 | oXygen XML 613 | Programming in C++ 614 | DataNucleus 615 | SMS API 616 | Programming Competitions 617 | Freebase 618 | Gosu (programming language) 619 | Java (programming language) 620 | Aspect-Oriented Programming 621 | Pointers in Structures 622 | malloc 623 | Core Data 624 | Facebook Hacker Cup 625 | Steak (Ruby gem) 626 | Scheme (programming language) 627 | GeoCouch 628 | Darcs 629 | Concurrency (computer science) 630 | Hackbright Academy 631 | JATS (Journal Article Tag Suite) 632 | Object-Oriented Programming 633 | Andrew Tridgell 634 | Mobile UI Design 635 | Jython 636 | Title Tags 637 | Memory Management (computer programming) 638 | Google's Polymer 639 | Learning PHP 640 | Constraint Programming 641 | Facebook Graph API 642 | Struts (for web apps) 643 | Delphi (programming language) 644 | mod_wsgi 645 | The Public Knowledge Workshop (NGO, Israel) - 646 | Node Version Management 647 | GPGPU 648 | Web Developers 649 | Web Components 650 | Programming Language Comparisons 651 | CodeGuard 652 | HTML5 653 | Containment (programming) 654 | Perl 6 655 | Bloc 656 | Jasmine (framework) 657 | Blueprint (CSS Framework) 658 | HTML Tags 659 | WaveMaker 660 | Learning MATLAB 661 | 
Programming Languages 662 | MAPI 663 | Memory Management Units 664 | Debuggers 665 | Windows Communication Foundation 666 | wxPython 667 | NotifyMyAndroid 668 | Padrino 669 | Visual FoxPro 670 | Apache Tapestry 671 | Plone 672 | CodeUnion 673 | phpMyAdmin 674 | Devise (Rails authentication Framework) 675 | Google Hosted Libraries 676 | Metaprogramming 677 | Python Web Frameworks 678 | PHP Libraries 679 | Servlets 680 | Message Queuing 681 | Famo.us (JavaScript Framework) 682 | Learning Visual Basic 683 | Apple FaceTime API 684 | Programming Bootcamps in New York 685 | AspectJ 686 | Interweb Systems 687 | The Coder Factory 688 | Python GIL 689 | STL (C++) 690 | VHDL 691 | AppleScript 692 | GitHub Gists 693 | Excel VBA 694 | Facebook API 695 | Study of Computer Programming 696 | Visual Programming 697 | MLton 698 | Front-End Web Development 699 | Bootcamps.in 700 | Semantic Annotation 701 | Django (web framework) 702 | CommonJS 703 | Xapian 704 | PySide 705 | Link Rot 706 | libc 707 | Apprentice.io 708 | Grails 709 | Cascading 710 | Algorithms for Competitive Programming 711 | ECMAScript Operators 712 | Standard ML 713 | AJAX 714 | Learning JavaScript 715 | OrientDB 716 | JSON Web Token 717 | CakePHP 718 | DerbyJS 719 | Learning to Build Websites 720 | Nilecode 721 | DataSift (product) 722 | Unicorn (Ruby gem) 723 | Concatenative Programming Languages 724 | Visual Impact Systems 725 | Scripting (programming) 726 | Email API 727 | Wakanda Server 728 | Django 1.3 729 | CSS Lint 730 | NLP API 731 | Core Java 732 | KML (File Format) 733 | Rails Rumble 734 | JUnit 735 | The Software Guild 736 | Github-Fi 737 | Twitter Cards 738 | Specific Programming Languages 739 | Cascading Style Sheets 740 | Scala 741 | scikit-learn 742 | Koding 743 | Hardware Description Languages (HDL) 744 | PostScript 745 | The Echo Nest 746 | DevBatch - Mobile Apps Development Company 747 | Haxe 748 | Learning Haskell 749 | Path API 750 | Compilers 751 | Ceylon (programming language) 752 | Spin Locks 753 | ooc (programming language) 754 | PyPy 755 | Tumult Hype 756 | Twilio Connect 757 | MQTT 758 | Studio Kudos 759 | Scrapinghub 760 | Windows Identity Foundation 761 | ECMAScript Versions 762 | Sphere Online Judge (SPOJ) 763 | Class (programming) 764 | WITSML 765 | SEO Beaver | Beaver Marketing Inc 766 | Inter-Process Communication 767 | Indirection 768 | Error Messages 769 | Application Programming Interfaces (API) 770 | Plotly 771 | eZ Flow 772 | Gemstone Object Database 773 | MagLev (Ruby interpreter) 774 | Gunicorn 775 | LiveNode 776 | JDBC 777 | Audio Tags 778 | NumPy 779 | Ruby vs. 
Python 780 | reStructuredText 781 | Node.js 782 | Domain-Specific Languages 783 | Mashape 784 | goto statement (programming) 785 | Xoops Engine 786 | Prototype-based Programming 787 | Type Inference 788 | Hour of Code 2014 789 | AlchemyAPI 790 | Plivo SMS API 791 | Programming Bootcamps in the San Francisco Bay Area 792 | Mako 793 | Haskell (programming language) 794 | Flot (JavaScript library) 795 | Programming Loops 796 | Guice 797 | Java Interview Questions 798 | GitHub for Mac 799 | Rascal 800 | Bobik Scraping Service 801 | Raphael-js 802 | Express (Node.js web framework) 803 | Sass (stylesheet language) 804 | Mediawiki API 805 | IDL (programming language) 806 | International Olympiad in Informatics (IOI) 807 | Python Imaging Library 808 | theLIFT 809 | CodeIgniter 810 | Twilio API 811 | Human-powered APIs 812 | Silex 813 | SWFObject 814 | Systems Programming 815 | ScraperWiki 816 | ACM-ICPC 2016 817 | Learning Specific Programming Languages 818 | Web Development 819 | CodeBenders 820 | Specific Problems in I 821 | React (JS Library) 822 | Brokly API 823 | Semantic Wiki 824 | Multilingual Websites 825 | C# (programming language) 826 | Learning R 827 | Simplify, Advance (company) 828 | QuickBASIC 829 | OOCSS 830 | SailsJS 831 | Alley Interactive 832 | Programming Interviews 833 | Python Libraries 834 | Regular Expressions in Programming Languages 835 | GDB 836 | Coroutines 837 | Functional Programming Languages 838 | PHP Classes 839 | Learning FORTRAN 840 | Plone Products 841 | TensorFlow (software library) 842 | Learning Node.js 843 | Learning Assembly 844 | Cake Development Corporation 845 | int (data type) 846 | Software Bugs 847 | Dev Bootcamp 848 | Semantic Web 849 | Lightstreamer 850 | Learning Python 851 | pip 852 | Funding of Github 853 | Extreme Programming (XP) 854 | CartoDB 855 | Semaphores 856 | CPython 857 | Is There an API for X? 
858 | float (data type) 859 | Specific Problems in International Olympiad in Informatics 860 | HTML5 Boilerplate 861 | Common Lisp 862 | Decompilation 863 | Python Implementations 864 | Archetype Definition Language (ADL) 865 | Lcuma Labs (company) 866 | Streaming APIs 867 | Programming Paradigms 868 | Web Architecture 869 | MacRuby 870 | Laravel 871 | Server-Side JavaScript 872 | Active Admin 873 | XML Literals 874 | NDjango 875 | jQuery UI 876 | RSpec 877 | Java Native Interface 878 | SciPy 879 | Multiple Inheritance 880 | Perl (programming language) 881 | Enterprise Message Bus 882 | Java Message Service (JMS) 883 | Mobile Web Development 884 | Memory Leaks (computer programming) 885 | Symfony 886 | Apache Wicket 887 | Apache Hive 888 | Automatic Memoization 889 | Bazaar DVCS 890 | Twitter Firehose 891 | Worldmate 892 | Pyglet 893 | Imperative Programming 894 | Sockets 895 | -------------------------------------------------------------------------------- /quora/Project_Quora/Project_Quora/spiders/topic/topic.py: -------------------------------------------------------------------------------- 1 | import time 2 | import codecs 3 | import platform 4 | import sys 5 | from selenium.webdriver.common.by import By 6 | from selenium.webdriver.support.ui import WebDriverWait 7 | from selenium.webdriver.support import expected_conditions as ec 8 | from selenium import webdriver 9 | from selenium.common.exceptions import NoSuchElementException 10 | 11 | 12 | class Topic(object): 13 | # Arguments passed through the batch file topic.bat 14 | email, passw, url = sys.argv[1:] 15 | 16 | # Opening PhantomJS webdriver 17 | options = ['--proxy-type=none'] 18 | if "Windows" == platform.system(): 19 | driver = webdriver.PhantomJS('..\phantomjs.exe', service_args=options) 20 | else: 21 | driver = webdriver.PhantomJS(executable_path='../phantomjs', 22 | service_args=options) 23 | wait = WebDriverWait(driver, 60) 24 | 25 | # Access to Quora and Login 26 | driver.get("http://www.quora.com/") 27 | driver.refresh() 28 | time.sleep(2) 29 | 30 | print ('Login to Quora..') 31 | while True: 32 | # Entering your username and password 33 | form = driver.find_element_by_class_name('login') 34 | 35 | username = form.find_element_by_name('email') 36 | username.send_keys(email) 37 | time.sleep(2) 38 | password = form.find_element_by_name('password') 39 | password.send_keys(passw) 40 | 41 | time.sleep(2) 42 | form.find_element_by_xpath( 43 | ".//input[contains(@value, 'Login')]").click() 44 | time.sleep(2) 45 | 46 | try: 47 | if driver.find_element_by_css_selector( 48 | 'div[id*="_error"]').is_displayed(): 49 | driver.refresh() 50 | print ('Login Error.Retry') 51 | email = raw_input("Insert username: ") 52 | passw = raw_input("Insert password: ") 53 | except NoSuchElementException: 54 | break 55 | 56 | # Open Section Organize of a Topic 57 | while True: 58 | try: 59 | driver.get(url) 60 | if driver.find_element_by_xpath( 61 | '//div[contains(@class, "TopicNavigationChildTree' + 62 | ' section_top")]').is_displayed(): 63 | break 64 | except Exception: 65 | print ('Error, page not avaible or wrong url') 66 | url = raw_input("Re-Insert URL-ORGANIZE_TOPIC:") 67 | 68 | filename = url.replace('https://www.quora.com/topic/', '') 69 | filename = filename.replace('/organize', '') 70 | filename += ".txt" 71 | target = codecs.open(filename, 'w+', encoding='utf-8') 72 | target.truncate() 73 | 74 | top = driver.find_element_by_xpath( 75 | '//div[contains(@class, "TopicNavigationChildTree section_top")]') 76 | topics = 
top.find_elements_by_xpath( 77 | './/span[contains(@class, "TopicNameSpan TopicName")]') 78 | show_more_list = top.find_elements_by_xpath( 79 | '//div[contains(@class, "TopicTreeItemToggled SimpleToggle Toggle")]' + 80 | '//small/span[not(contains(@class,"hidden"))]' + 81 | '/a[contains(text(), "Show ")]') 82 | 83 | # Expansion of the hierarchy of topics with Selenium 84 | while True: 85 | 86 | if len(show_more_list) > 0: 87 | 88 | for elem in show_more_list: 89 | driver.execute_script("arguments[0].scrollIntoView(true);", 90 | elem) 91 | driver.execute_script("window.scrollBy(0,-250);") 92 | time.sleep(0.5) 93 | 94 | # Click on "Show more" button 95 | webdriver.ActionChains(driver).move_to_element(elem).click( 96 | elem).perform() 97 | wait.until(ec.invisibility_of_element_located( 98 | (By.CLASS_NAME, 'loading'))) 99 | 100 | while len(topics) == len(top.find_elements_by_xpath( 101 | './/span[contains(@class, "TopicNameSpan TopicName")]')): 102 | time.sleep(1) 103 | time.sleep(2) 104 | 105 | print "Topic found: " + str(len(driver.find_elements_by_xpath( 106 | '//div[contains(@class, "TopicNavigationChildTree ' + 107 | 'section_top")]//span[contains(@class, ' + 108 | '"TopicNameSpan TopicName")]'))) 109 | 110 | show_more_list = top.find_elements_by_xpath( 111 | '//div[contains(@class, "TopicTreeItemToggled ' 112 | 'SimpleToggle Toggle")]//small/' + 113 | 'span[not(contains(@class,"hidden"))]' + 114 | '/a[contains(text(), "Show ")]') 115 | 116 | print "Other " + str(len(show_more_list)) + " to expand" 117 | else: 118 | break 119 | 120 | topics = top.find_elements_by_xpath( 121 | './/span[contains(@class, "TopicNameSpan TopicName")]') 122 | topics_text = [] 123 | 124 | print ('Please Wait..') 125 | for topic in topics: 126 | topics_text.append(topic.text.encode('ascii', 'ignore')) 127 | 128 | print ('Number of different Topic: ' + str(len(set(topics_text)))) 129 | 130 | print ('Writing on file the list of Topic..') 131 | for topic in set(topics_text): 132 | target.write(topic + '\n') 133 | 134 | print ('Finish') 135 | 136 | target.close() 137 | driver.close() 138 | -------------------------------------------------------------------------------- /quora/Project_Quora/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = Project_Quora.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = Project_Quora 12 | -------------------------------------------------------------------------------- /quora/README.md: -------------------------------------------------------------------------------- 1 | # Quora Scraper 2 | A Python script for downloading the questions and answers available on Quora and storing them in a database. 3 | Specifically, it focuses on extracting the questions and answers of a Quora topic. 4 | 5 | # How does it work 6 | This project contains two different scripts: 7 | * `topic.py` 8 | The smaller part of the project, which allows the scraper to get the list of sub-topics of a particular topic. 9 | In this way the scraper stays within the topics related to the starting Quora topic. 10 | For example, see the Organize section of the [Computer Programming](https://www.quora.com/topic/Computer-Programming/organize) topic and its topic hierarchy. 
11 | * `quora.py` 12 | More substantial than the previous script. It parses questions and answers while always remaining within the related topics. 13 | It is based on Scrapy, which issues the requests to parse question threads, and on the Selenium WebDriver framework for web automation. 14 | By combining these two frameworks it is possible to obtain a large number of questions and answers, useful for studying and analyzing the contents of Quora. 15 | 16 | # Installation 17 | 1. Download the content of this directory 18 | 2. Install all the requirements with: `pip install -r requirements.txt` 19 | 3. Download [PhantomJS](http://phantomjs.org/) (for Windows or OSX) and unzip 20 | 4. Move `phantomjs.exe`(Windows) or `phantomjs`(OSX) into the `spiders` directory 21 | 22 | # Getting Started 23 | 1. Start the first launcher, `topic.bat`, which takes as a parameter the organize-URL of the topic. 24 | This launcher lets you obtain, in a .txt file, a list of sub-topics of a given topic. 25 | 2. Start the second launcher, `quora.bat`, which activates the scraping and produces a database and a JSON file with all the items. 26 | This launcher takes as a parameter the name of the database in which to save the extracted items. 27 | 28 | Both scripts (topic.py and quora.py) require you to be logged in to work. Therefore, you will be asked for the username and password of a Quora account when you execute either launcher. 29 | 30 | # Notes 31 | The `topic` directory of this project already contains a list of topics related to [Computer Programming](https://www.quora.com/topic/Computer-Programming) in a .txt file. 32 | So you may directly execute `quora.bat` to obtain a database of questions and answers related to the Computer Programming topic on Quora. 33 | As time passes, however, this list of sub-topics may be updated by Quora, so it would be useful to re-run `topic.bat` in the future. 34 | -------------------------------------------------------------------------------- /quora/quora.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | CLS 3 | 4 | IF "%~1"=="-h" GOTO Help 5 | 6 | :begin 7 | ECHO 1.Help 8 | ECHO 2.Insert Parameters 9 | ECHO. 10 | 11 | CHOICE /C 12 /M "Enter your choice:" 12 | 13 | :: Note - list ERRORLEVELS in decreasing order 14 | IF ERRORLEVEL 2 GOTO Param 15 | IF ERRORLEVEL 1 GOTO Help 16 | 17 | :Help 18 | ECHO List of Parameters: 19 | ECHO first:'EMAIL' related to Quora account 20 | ECHO second:'PASSWORD' related to Quora account 21 | ECHO third:'DB' choose a name for your database of items 22 | ECHO.
23 | GOTO begin 24 | 25 | :Param 26 | SET /P EMAIL=Enter EMAIL: 27 | SET /P PASSW=Enter PASSWORD: 28 | SET /P DB=Enter database: 29 | 30 | 31 | cd Project_Quora 32 | cd Project_Quora 33 | cd spiders 34 | scrapy crawl quora -a database=%DB% -a email=%EMAIL% -a password=%PASSW% 35 | pause 36 | 37 | -------------------------------------------------------------------------------- /quora/requirements.txt: -------------------------------------------------------------------------------- 1 | cffi==1.2.1 2 | characteristic==14.3.0 3 | cryptography==1.0.1 4 | cssselect==0.9.1 5 | enum34==1.0.4 6 | html2text==2015.6.21 7 | idna==2.0 8 | ipaddress==1.0.14 9 | parsedatetime==1.5 10 | pyasn1==0.1.8 11 | pyasn1-modules==0.0.7 12 | pycparser==2.14 13 | PyDbLite==3.0.2 14 | pyOpenSSL==0.15.1 15 | pytz==2015.6 16 | pywin32==219 17 | queuelib==1.4.2 18 | Scrapy==1.0.3 19 | selenium==2.47.3 20 | service-identity==14.0.0 21 | six==1.9.0 22 | Twisted==15.4.0 23 | virtualenv==13.1.2 24 | w3lib==1.12.0 25 | wheel==0.26.0 26 | zope.interface==4.1.2 27 | -------------------------------------------------------------------------------- /quora/topic.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | CLS 3 | 4 | IF "%~1"=="-h" GOTO Help 5 | 6 | :begin 7 | ECHO 1.Help 8 | ECHO 2.Insert Parameters 9 | ECHO. 10 | 11 | CHOICE /C 12 /M "Enter your choice:" 12 | 13 | 14 | :: Note - list ERRORLEVELS in decreasing order 15 | IF ERRORLEVEL 2 GOTO Param 16 | IF ERRORLEVEL 1 GOTO Help 17 | 18 | :Help 19 | ECHO List of Parameters: 20 | ECHO first:'EMAIL' related to Quora account 21 | ECHO second:'PASSWORD' related to Quora account 22 | ECHO third:'URL' url of Organize-Topic to obtain the list of topics (Example:https://www.quora.com/topic/Computer-Programming/organize) 23 | ECHO. 24 | GOTO begin 25 | 26 | :Param 27 | SET /P EMAIL=Enter EMAIL: 28 | SET /P PASSW=Enter PASSWORD: 29 | SET /P URL=Enter URL-ORGANIZE_TOPIC (Example:https://www.quora.com/topic/Computer-Programming/organize): 30 | 31 | cd Project_Quora 32 | cd Project_Quora 33 | cd spiders 34 | cd topic 35 | python topic.py %EMAIL% %PASSW% %URL% 36 | pause -------------------------------------------------------------------------------- /scn/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # SAP Community Network scraper 4 | -------- 5 | An implementation of a scraper that extracts items from each permissible discussion of SCN platform by scanning each page of ["ABAP Development"](http://scn.sap.com/community/abap/content?filterID=contentstatus[published]~objecttype~objecttype[thread]) category. 6 | 7 | Because of the problems caused to uploads of several contents, the software is subject to errors caused by loading page. 8 | Therefore it was thought to implement a mechanism for saving the state of execution, to retrieve it again from where it stopped. 
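In practice, the resume mechanism boils down to persisting the index of the last page scraped and reading it back at start-up. The following is only a minimal sketch of that idea, mirroring what `main.py` and `dataStoring.py` (described below) actually do:

```python
# Minimal sketch of the resume mechanism: the page index is written to a plain
# text file after every page, so a later run can restart from where it stopped.
import os.path

INDEX_FILE = 'scnscraper/index.txt'

def read_index():
    # First run: no index file yet, so start from page 2 as main.py does.
    if not os.path.exists(INDEX_FILE):
        return 2
    with open(INDEX_FILE) as f:
        return int(f.readline())

def write_index(i):
    with open(INDEX_FILE, 'w') as f:
        f.write(str(i))
```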
9 | 10 | ### Version 11 | 2.0 12 | 13 | ### How does it work 14 | There is one main script that contains the core of the scraper: 15 | - `scraper.py` 16 | 17 | and two support scripts: 18 | - `main.py` 19 | - `dataStoring.py` 20 | 21 | ##### `scraper.py` 22 | It takes the `STARTURL` as input and, with [Selenium](http://www.seleniumhq.org/) support, runs three phases: 23 | * verify that the content of the page (the threads) has been loaded, refreshing the page until it is; 24 | * count the links that need to be considered, excluding links of discussions that are not 25 | marked as 'answered' or 'not answered' and links of discussions that may raise problems; 26 | * for each discussion in the page, extract all the questions and answers and store them in a data structure; 27 | 28 | ##### `main.py` 29 | The program starts from this script, which reads the `PAGE INDEX` from a file to start the scraping process; 30 | on the first execution the program starts from page 2 and, for each page, updates the index file with the current `PAGE INDEX`; 31 | on subsequent executions it loads the `PAGE INDEX` from the index file and resumes from the last page. 32 | 33 | After loading the current state of execution, it builds the `STARTURL` from the `PAGE INDEX` and passes it to the scraper. 34 | After calling the scraper, it saves the extracted threads into a ".json" file and into a "pdl" ([PyDbLite](http://www.pydblite.net/en/)) file, and repeats the process. 35 | 36 | ##### `dataStoring.py` 37 | It provides mechanisms to store the extracted data into the ".json" and "pdl" ([PyDbLite](http://www.pydblite.net/en/)) files without overwriting the existing content, 38 | and to read and update the "index.txt" file containing the `PAGE INDEX`. 39 | 40 | ### Installation 41 | 1. Download the content of this directory 42 | 2. Install all the requirements with: `pip install -r requirements.txt` 43 | 3. Download [PhantomJS](http://phantomjs.org/) (for Windows or OSX) and unzip 44 | 4. Move `phantomjs.exe`(Windows) or `phantomjs`(OSX) into the `scnscraper` directory 45 | 46 | ### Getting Started 47 | To start the software, execute the `Run.bat` file in the main directory. It provides 2 execution alternatives: 48 | 49 | * New Execution, to start a new scraping process or to restart the execution from scratch. Be careful not to choose this command right after a data extraction, 50 | because it deletes the output files that contain the extracted threads. 51 | * Resume Execution, which resumes the execution from where it left off in the last run. 52 | 53 | ### Endnotes 54 | SAP Community Network poses many problems from the point of view of the scraping process. 55 | 56 | - It is a very slow platform in loading discussions from the server db, which causes a continuous loop of refreshes until the content loads; 57 | - It is a very heavy site that contains a lot of scripts and content that slow down the loading of web pages, 58 | causing selectors to miss content and raising exceptions at runtime. 59 | 60 | For this reason, the program often ends its run with errors; this is why the saving and loading process was implemented, 61 | to restart the execution from the last step. For an optimal execution we suggest a good Internet connection speed. 62 | 63 | *... 
HAPPY SCRAPING!* 64 | -------------------------------------------------------------------------------- /scn/RUN.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | CLS 3 | 4 | IF "%~1"=="-h" GOTO Help 5 | 6 | :begin 7 | ECHO. 8 | ECHO. 9 | ECHO ---- SCN Scraper ---- 10 | ECHO. 11 | ECHO 1. NEW EXECUTION 12 | ECHO 2. RESUME EXECUTION 13 | ECHO 3. HELP 14 | ECHO. 15 | CHOICE /C 123 /M "Enter your choice: " 16 | 17 | :: Note - list ERRORLEVELS in decreasing order 18 | IF ERRORLEVEL 3 GOTO Help 19 | IF ERRORLEVEL 2 GOTO Resume 20 | IF ERRORLEVEL 1 GOTO New 21 | 22 | :Help 23 | ECHO. 24 | ECHO -- HELP -- 25 | ECHO. 26 | ECHO - If you would begin a new scraping process, press [1] 27 | ECHO. 28 | ECHO - If you want to delete the saved data of a previous execution beginning a new one, press [1] 29 | ECHO. 30 | ECHO - If you want to load a previous execution from the last page scraped, press [2]. 31 | ECHO. 32 | pause 33 | GOTO begin 34 | 35 | :Resume 36 | c:\python27\python.exe "%~dp0scnscraper\main.py" %* 37 | 38 | :New 39 | if exist "%~dp0scnscraper\abap.pydb" ( 40 | del "%~dp0scnscraper\abap.pydb" 41 | del "%~dp0scnscraper\abap.json" 42 | del "%~dp0scnscraper\index.txt" ) 43 | :: Edit index file with start URL PAGE 44 | c:\python27\python.exe "%~dp0scnscraper\main.py" %* 45 | PAUSE 46 | 47 | 48 | -------------------------------------------------------------------------------- /scn/discretizer/RUN.bat: -------------------------------------------------------------------------------- 1 | 2 | c:\python27\python.exe "%~dp0scn_discretizer.py" all 3 | pause 4 | -------------------------------------------------------------------------------- /scn/discretizer/discretization.py: -------------------------------------------------------------------------------- 1 | """ 2 | Compatible with Python 2 and Python 3 3 | """ 4 | 5 | import csv 6 | import logging 7 | import os 8 | import re 9 | from math import log 10 | 11 | from dateutil.parser import parse as parse_date 12 | from nltk import FreqDist 13 | from nltk import PorterStemmer 14 | from nltk import tokenize 15 | from nltk import word_tokenize 16 | from pydblite.pydblite import Base 17 | 18 | 19 | class Discretizer: 20 | logging.basicConfig(level=logging.DEBUG) 21 | logger = logging.getLogger(__name__) 22 | linesep = '\n' 23 | 24 | def __init__(self, db_name, db_files): 25 | self.db_name = db_name 26 | self.db_files = db_files 27 | self.db = dict() 28 | 29 | def log(self, msg, level=logging.DEBUG): 30 | self.logger.log(level, msg) 31 | 32 | def load_db(self, check=True, fix=False, save_to_file=False): 33 | self.log('Opening {0} database(s)'.format(len(self.db_files)), logging.INFO) 34 | for db_name, db_file in self.db_files.items(): 35 | _db = Base(db_file, save_to_file=save_to_file) 36 | _db.open() 37 | self.log('Database {0} opened, records #: {1}'.format(db_name, len(_db)), logging.DEBUG) 38 | self.db.update({db_name: _db}) 39 | _db.create_index('uid') 40 | _db.create_index('type') 41 | self.log("Db {0}: printing simple strawman prediction accuracy for answers with max upvotes as best answer:".format(db_name), logging.INFO) 42 | self._strawman(_db) 43 | if check is True: 44 | self.check_db(fix) 45 | 46 | """ 47 | * fix answers_count with actual # of answers exported 48 | * if an answer has tag != N/A, the tags must be applied to the question in the same thread 49 | * if a question is marked as resolved True, then one of the answers in the thread must have been marked as solution; 50 | and viceversa; 
51 | * check if Q or A text is '' 52 | * turn question uid from int to unicode string 53 | """ 54 | 55 | def check_db(self, fix=False): 56 | self.log('Checking consistency for databases.', logging.INFO) 57 | for name, _db in self.db.items(): 58 | for question in _db._type['Question']: 59 | expected_answers_count = int(question['answers']) 60 | actual_answers_count = 0 61 | for i in range(1, expected_answers_count + 1): 62 | try: 63 | _db._uid[question['uid'][:-1] + str(i)][0] 64 | actual_answers_count += 1 65 | except IndexError: 66 | break 67 | if actual_answers_count < expected_answers_count: 68 | self.log('Fixing answers count mismatch in thread id {0}, expected {1}, found {2}'. 69 | format(question['uid'], expected_answers_count, actual_answers_count)) 70 | _db.update(question, answers=actual_answers_count) 71 | 72 | for record in (_db('text') == ''): 73 | self.log('Warning on record {0} from db {1}: empty text!'.format(record['uid'], name), 74 | logging.WARNING) 75 | 76 | for record in (_db('type') == 'Answer') & (_db('tags') != 'null'): 77 | self.log('Warning on record {0} from db {1}: tags in answer!'.format(record['uid'], name), 78 | logging.WARNING) 79 | question_uid = record['uid'].split('.')[0] 80 | question = _db._uid[question_uid][0] 81 | question_tags = question['tags'] + '.' + record['tags'] 82 | _db.update(question, tags=question_tags) 83 | 84 | if fix is True: 85 | _db.commit() 86 | 87 | def load_threads(self): 88 | self.log('Loading threads from {0} db(s)'.format(len(self.db_files)), logging.INFO) 89 | overall_threads = list() 90 | for name, _db in self.db.items(): 91 | db_threads = list() 92 | questions = _db._type['Question'] # use db index 93 | self.log('Loaded {0} questions (threads) from db {1}, attaching answers...'.format(len(questions), name), 94 | logging.DEBUG) 95 | for question in questions: 96 | answers = self._get_answers(question['uid'], int(question['answers']), _db) 97 | db_threads.append({'question': question, 'question_uid': question['uid'], 98 | 'date_time': question['date_time'], 'answers_count': question['answers'], 99 | 'resolved': question['resolve'], 'tags': question['tags'], 'answers': answers}) 100 | 101 | overall_threads.extend(db_threads) 102 | self.log('Overall threads loaded: {0} from {1} database(s)'.format(len(overall_threads), len(self.db_files))) 103 | return overall_threads 104 | 105 | def _get_answers(self, question_id, answers_count, _db): 106 | self.log('Getting {0} answers for thread id {1}'.format(answers_count, question_id), logging.DEBUG) 107 | answers = list() 108 | if answers_count > 0: 109 | for i in range(1, answers_count + 1): 110 | answer_id = str(question_id)[:-1] + str(i) 111 | for answer in (_db._uid[answer_id]): # use index 112 | answers.append(answer) 113 | if answers_count != len(answers): 114 | self.log('Warning in thread id {0}: loaded {1} answers, expected {2}. 
Please, run a check db with ' 115 | 'fix=True'.format(question_id, len(answers), answers_count), 116 | logging.WARNING) 117 | return answers 118 | 119 | @staticmethod 120 | def _strawman(_db): 121 | # assumes index on uid already exists 122 | # db.create_index('uid') 123 | questions_with_answers = (_db("type") == 'Question') & (_db("answers") > 0) 124 | a = 0 125 | b = 0 126 | c = 0 127 | d = 0 128 | 129 | total_answer_count = 0 130 | for q in questions_with_answers: 131 | thread_answers = list() 132 | answers_count = int(q['answers']) 133 | total_answer_count += answers_count 134 | if answers_count > 0: 135 | for i in range(1, answers_count + 1): 136 | answer_id = q['uid'][:-1] + str(i) 137 | for answer in (_db._uid[answer_id]): # use index 138 | print(answer_id) 139 | thread_answers.append(answer) 140 | # compute upvotes 141 | max_upvote = 0 142 | for answer in thread_answers: 143 | if (answer['upvotes'] == '---'): 144 | count = 0 145 | else: 146 | count = int(answer['upvotes']) 147 | if count > max_upvote: 148 | max_upvote = count 149 | 150 | output = list() 151 | prediction = None 152 | for answer in thread_answers: 153 | if (answer['upvotes'] == '---'): 154 | count = 0 155 | else: 156 | count = int(answer['upvotes']) 157 | if count == max_upvote: 158 | prediction = 'solution' 159 | else: 160 | prediction = '' 161 | 162 | output.append((answer['uid'], answer['resolve'], prediction)) 163 | if prediction == 'solution' and answer['resolve'] == 'solution': 164 | a += 1 165 | if prediction == '' and answer['resolve'] == '---': 166 | b += 1 167 | if prediction == 'solution' and answer['resolve'] == '---': 168 | c += 1 169 | if prediction == '' and answer['resolve'] == 'solution': 170 | d += 1 171 | 172 | print(_db.name) 173 | print("a = {0} | b = {1}\nc = {2} | d = {3}".format(a, b, c, d)) 174 | print("Total answers %s" % total_answer_count) 175 | print("Accuracy {0}".format((float(a + b) / float(total_answer_count)))) 176 | 177 | def compute_features(self, threads, stemmed_vocabulary, distrib_matrix): 178 | self.log('Computing features. Please, wait. This will take some serious time...', logging.INFO) 179 | for thread in threads: 180 | self.log('Computing features for thread id {0}'.format(thread['question_uid']), logging.INFO) 181 | try: 182 | base_date = parse_date(thread['date_time']) 183 | except ValueError: 184 | base_date = parse_date('1970-01-01') 185 | except AttributeError: 186 | base_date = thread['date_time'] 187 | answers = thread['answers'] 188 | try: 189 | tag_list = thread['tags'].split('.') 190 | except AttributeError: 191 | tag_list = thread['tags'] # there is no '.' 
used as tag separator 192 | if '' in tag_list: 193 | tag_list.remove('') 194 | for answer in answers: 195 | # compute thread tags 196 | answer_tags = answer['tags'].split() 197 | if 'null' in answer_tags: 198 | answer_tags.remove('null') 199 | tag_list.extend(answer_tags) 200 | thread['tags'] = sorted(set(tag_list)) 201 | 202 | # compute len in chars and words 203 | alen = len(answer['text']) 204 | answer['len'] = alen 205 | wordcount = Discretizer._count_words(answer['text']) 206 | answer['wordcount'] = wordcount 207 | if wordcount == 0: 208 | answer['avg_chars_per_word'] = 0 209 | else: 210 | answer['avg_chars_per_word'] = "{0:.2f}".format(alen / float(wordcount)) # float with 2 decimals 211 | try: 212 | sentences = tokenize.sent_tokenize(answer['text'].decode('utf-8', 'replace').encode('ascii', 'replace'), 213 | language='english') 214 | except (AttributeError, TypeError) as e: 215 | sentences = tokenize.sent_tokenize(str(answer['text']), language='english') 216 | sentence_count = len(sentences) 217 | answer['sentences'] = sentence_count 218 | if sentence_count == 0: 219 | words_per_sentence = 0 220 | else: 221 | words_per_sentence = "{0:.2f}".format(wordcount / float(sentence_count)) 222 | answer['avg_words_per_sentence'] = words_per_sentence 223 | longest_sentence = 0 224 | for s in sentences: 225 | l = Discretizer._count_words(s) 226 | if l > longest_sentence: 227 | longest_sentence = l 228 | answer['longest_sentence'] = longest_sentence 229 | try: 230 | creation_date = parse_date(answer['date_time']) 231 | except AttributeError: 232 | creation_date = answer['date_time'] 233 | except Exception: 234 | print('\nInvalid date_time') 235 | time_difference = abs((creation_date - base_date).total_seconds()) 236 | answer['time_difference'] = time_difference 237 | 238 | #answer['upvotes'] = thread['upvotes'] 239 | 240 | # check for urls and code snippets 241 | match = re.search(r'http(s)?://', str(answer['text']), re.MULTILINE) 242 | if match: 243 | answer['has_links'] = True 244 | else: 245 | answer['has_links'] = False 246 | 247 | answer['has_code_snippet'] = self._has_codesnippet(str(answer['text'])) 248 | try: 249 | LL = Discretizer._log_likelihood(answer['text'].decode('utf-8', 'replace').encode('ascii', 'replace'), 250 | stemmed_vocabulary, distrib_matrix) 251 | except (AttributeError, TypeError) as e: 252 | LL = Discretizer._log_likelihood(str(answer['text']), stemmed_vocabulary, distrib_matrix) 253 | answer['loglikelihood'] = LL 254 | answer['loglikelihood_descending'] = LL 255 | answer['loglikelihood_ascending'] = LL 256 | try: 257 | aspw = Discretizer._ASPW(answer['text'].decode('utf-8', 'replace').encode('ascii', 'replace')) 258 | except (AttributeError, TypeError) as e: 259 | aspw = Discretizer._ASPW(str(answer['text'])) 260 | fk = Discretizer._FK(answer['avg_words_per_sentence'], aspw) 261 | answer['F-K'] = fk 262 | answer['F-K_descending'] = fk 263 | answer['F-K_ascending'] = fk 264 | 265 | # compute ranks 266 | #answers = Discretizer._sort_rank(answers, 'upvotes', reverse=True) 267 | answers = Discretizer._sort_rank(answers, 'sentences', reverse=True) 268 | answers = Discretizer._sort_rank(answers, 'len', reverse=True) 269 | answers = Discretizer._sort_rank(answers, 'views', reverse=True) 270 | answers = Discretizer._sort_rank(answers, 'wordcount', reverse=True) 271 | answers = Discretizer._sort_rank(answers, 'avg_chars_per_word', reverse=True) 272 | answers = Discretizer._sort_rank(answers, 'avg_words_per_sentence', reverse=True) 273 | answers = Discretizer._sort_rank(answers, 
'longest_sentence', reverse=True) 274 | answers = Discretizer._sort_rank(answers, 'time_difference', reverse=False) 275 | answers = Discretizer._sort_rank(answers, 'loglikelihood_descending', reverse=True) 276 | answers = Discretizer._sort_rank(answers, 'loglikelihood_ascending', reverse=False) 277 | answers = Discretizer._sort_rank(answers, 'F-K_descending', reverse=True) 278 | answers = Discretizer._sort_rank(answers, 'F-K_ascending', reverse=False) 279 | thread['answers'] = answers 280 | 281 | self.log('Done computing features for {0} threads'.format(len(threads)), logging.INFO) 282 | return threads 283 | 284 | @staticmethod 285 | def _ASPW(text): 286 | aspw = 0 287 | for word in text.split(): 288 | s = Discretizer._count_syllables(word) 289 | aspw += s 290 | return aspw 291 | 292 | @staticmethod 293 | def _count_syllables(word): 294 | vowels = ['a', 'e', 'i', 'o', 'u', 'y'] 295 | currentWord = list(word) 296 | numVowels = 0 297 | lastWasVowel = False 298 | for wc in currentWord: 299 | foundVowel = False 300 | for v in vowels: 301 | # don't count diphthongs 302 | if (v == wc) and lastWasVowel is True: 303 | foundVowel = True 304 | lastWasVowel = True 305 | break 306 | elif (v == wc) and lastWasVowel is False: 307 | numVowels += 1 308 | foundVowel = True 309 | lastWasVowel = True 310 | break 311 | 312 | # If full cycle and no vowel found, set lastWasVowel to false; 313 | if not foundVowel: 314 | lastWasVowel = False 315 | 316 | # Remove es, it's _usually? silent 317 | if (len(word) > 2) and (word[len(word)-2:] == "es"): 318 | numVowels -= 1 319 | # remove silent e 320 | elif (len(word) > 1) and (word[len(word)-1:] == "e"): 321 | numVowels -= 1 322 | return numVowels 323 | 324 | @staticmethod 325 | def _FK(awps, asps): 326 | fk = (0.39 * float(awps)) + (11.8 * float(asps)) - 15.59 327 | return fk 328 | 329 | @staticmethod 330 | def _log_likelihood(answer_text, stemmed_vocabulary, distrib_matrix): 331 | LL = 0 332 | if answer_text is not '': 333 | tokens = word_tokenize(str(answer_text), language='english') 334 | porter_stemmer = PorterStemmer() 335 | unique_wordcount = len(stemmed_vocabulary) 336 | """ 337 | per ogni w unica print_function words 338 | Cw = conta w in answer_text 339 | PwM = self.distrib_matrix[stemmer(w)] 340 | unique_wordcount = len(tokenize(answer_text) 341 | """ 342 | for w in tokens: 343 | _w = w.strip().lower() 344 | Cw = 0 345 | for _ in answer_text.split(): 346 | if _w == _.strip().lower(): 347 | Cw += 1 348 | 349 | try: 350 | w_stem = porter_stemmer.stem(_w.decode('utf-8', 'replace').encode('ascii', 'replace')) 351 | except AttributeError: 352 | w_stem = porter_stemmer.stem(_w) 353 | try: 354 | PwM = distrib_matrix[w_stem] 355 | except KeyError: # key error means frequency is equal to cutoff point 1 356 | PwM = 1 357 | LL += (Cw * log(float(PwM))) 358 | 359 | try: 360 | LL = "{0:.2f}".format(LL / float(unique_wordcount)) 361 | except ZeroDivisionError: 362 | LL = 0 363 | 364 | return LL 365 | 366 | @staticmethod 367 | def _count_words(text): 368 | wordcount = 0 369 | for word in text.split(): 370 | wordcount += 1 371 | return wordcount 372 | 373 | @staticmethod 374 | def _sort_rank(answers, key, reverse=True): 375 | try: 376 | new_list = sorted(answers, key=lambda x: float(x[key]), reverse=reverse) 377 | ranks = dict() 378 | for i in range(0, len(answers)): 379 | ranks[new_list[i]['uid']] = i + 1 380 | 381 | # fix rank ties 382 | for i in range(0, len(answers)-1): 383 | if new_list[i][key] == new_list[i+1][key]: 384 | ranks[new_list[i+1]['uid']] = 
ranks[new_list[i]['uid']] 385 | 386 | for k, v in ranks.items(): 387 | for a in answers: 388 | if a['uid'] == k: 389 | a['{0}_rank'.format(key)] = v 390 | except ValueError as e: 391 | logging.log(level=logging.ERROR, msg="Error computing rank for feature %s" % key) 392 | pass 393 | 394 | return answers 395 | 396 | def _has_codesnippet(self, text): 397 | code = False 398 | if re.search(r'({|}| package |\.jar| class | namespace |exception |<<| end | def |<\?php| soap | cutoff} 503 | return reduced 504 | 505 | def save_csv(self, threads): 506 | fout = '{0}_features.csv'.format(self.db_name) 507 | self.log('Saving features into {0}'.format(fout), logging.INFO) 508 | csvf = open(fout, 'wt') 509 | fields = ('resolved', 'question_uid', 'answers_count', 'answer_uid', 510 | 'date_time', 'time_difference', 'time_difference_rank', 'solution', 'len', 'len_rank', 'wordcount', 511 | 'wordcount_rank', 'avg_chars_per_word', 'avg_chars_per_word_rank', 'sentences', 'sentences_rank', 512 | 'avg_words_per_sentence', 'avg_words_per_sentence_rank', 'longest_sentence', 'longest_sentence_rank', 513 | 'views', 'views_rank', 'loglikelihood', 'loglikelihood_ascending_rank', 514 | 'loglikelihood_descending_rank', 'F-K', 'F-K_ascending_rank', 'F-K_descending_rank', 'upvotes', 515 | 'upvotes_rank', 'has_links', 'has_code_snippet', 'has_tags') 516 | writer = csv.DictWriter(csvf, dialect=csv.excel, fieldnames=fields, delimiter=',', lineterminator=self.linesep) 517 | writer.writeheader() 518 | # empty_line = dict.fromkeys(fields) 519 | for t in threads: 520 | row = dict() 521 | row.fromkeys(fields) 522 | answers = t['answers'] 523 | # question with no answers are excluded 524 | i = 0 525 | for a in answers: 526 | i += 1 527 | if i == 1: 528 | row['resolved'] = t['resolved'] 529 | row['question_uid'] = t['question_uid'] 530 | if len(t['tags']) > 0: 531 | row['has_tags'] = True 532 | else: 533 | row['has_tags'] = False 534 | else: 535 | row['resolved'] = '' 536 | row['question_uid'] = '' 537 | row['answers_count'] = t['answers_count'] 538 | row['answer_uid'] = a['uid'] 539 | row['time_difference'] = a['time_difference'] 540 | row['time_difference_rank'] = a['time_difference_rank'] 541 | if a['resolve'] == 'solution': 542 | row['solution'] = True 543 | else: 544 | row['solution'] = False 545 | row['len'] = a['len'] 546 | row['len_rank'] = a['len_rank'] 547 | row['wordcount'] = a['wordcount'] 548 | row['wordcount_rank'] = a['wordcount_rank'] 549 | row['avg_chars_per_word'] = a['avg_chars_per_word'] 550 | row['avg_chars_per_word_rank'] = a['avg_chars_per_word_rank'] 551 | row['sentences'] = a['sentences'] 552 | row['sentences_rank'] = a['sentences_rank'] 553 | row['avg_words_per_sentence'] = a['avg_words_per_sentence'] 554 | row['avg_words_per_sentence_rank'] = a['avg_words_per_sentence_rank'] 555 | row['longest_sentence'] = a['longest_sentence'] 556 | row['longest_sentence_rank'] = a['longest_sentence_rank'] 557 | row['views'] = a['views'] 558 | try: 559 | row['views_rank'] = a['views_rank'] 560 | except KeyError: 561 | pass 562 | row['loglikelihood'] = a['loglikelihood'] 563 | row['loglikelihood_descending_rank'] = a['loglikelihood_descending_rank'] 564 | row['loglikelihood_ascending_rank'] = a['loglikelihood_ascending_rank'] 565 | row['F-K'] = a['F-K'] 566 | row['F-K_descending_rank'] = a['F-K_descending_rank'] 567 | row['F-K_ascending_rank'] = a['F-K_ascending_rank'] 568 | row['upvotes'] = a['upvotes'] 569 | #row['upvotes_rank'] = a['upvotes_rank'] 570 | row['has_links'] = a['has_links'] 571 | row['has_code_snippet'] = 
a['has_code_snippet'] 572 | row['date_time'] = a['date_time'] 573 | writer.writerow(row) 574 | #writer.writerow(empty_line) 575 | csvf.close() 576 | -------------------------------------------------------------------------------- /scn/discretizer/scn_discretizer.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | from discretization import Discretizer 5 | 6 | 7 | def main(): 8 | program_name = os.path.basename(sys.argv[0]) 9 | db_files = {'abap': 'abap.pydb'} 10 | try: 11 | db_names = sys.argv[1] 12 | except IndexError: 13 | raise Exception('No db name. Please, re-run as "{0} dbname.pydb"'.format(program_name)) 14 | 15 | if db_names == 'all': 16 | discretizer = Discretizer(db_names, db_files) 17 | else: 18 | try: 19 | discretizer = Discretizer(db_names, {db_names: db_files.get(db_names)}) 20 | except KeyError: 21 | raise Exception('Invalid db name {0}. Please, check the name and re-run.'.format(db_names)) 22 | 23 | discretizer.load_db(check=False, fix=False, save_to_file=False) 24 | 25 | corpus = discretizer.build_corpus() 26 | stems = discretizer.build_stems(corpus) 27 | stemmed_vocabulary = discretizer.build_vocabulary(stems) 28 | distib_matrix = discretizer.build_distribution_matrix(stems) 29 | 30 | # grouping 31 | threads = discretizer.load_threads() 32 | # discretization and sorting 33 | threads = discretizer.compute_features(threads, stemmed_vocabulary, distib_matrix) 34 | discretizer.save_csv(threads) 35 | 36 | 37 | if __name__ == "__main__": 38 | sys.exit(main()) 39 | -------------------------------------------------------------------------------- /scn/requirements.txt: -------------------------------------------------------------------------------- 1 | BeautifulSoup==3.2.1 2 | cffi==1.2.1 3 | characteristic==14.3.0 4 | cryptography==1.0.2 5 | cssselect==0.9.1 6 | enum34==1.0.4 7 | html2text==2015.6.21 8 | idna==2.0 9 | ipaddress==1.0.14 10 | lxml==3.4.4 11 | pyasn1==0.1.9 12 | pyasn1-modules==0.0.8 13 | pycparser==2.14 14 | PyDbLite==3.0.2 15 | pyOpenSSL==0.15.1 16 | queuelib==1.4.2 17 | Scrapy==1.0.3 18 | selenium==2.48.0 19 | service-identity==14.0.0 20 | six==1.10.0 21 | Twisted==15.4.0 22 | w3lib==1.12.0 23 | wheel==0.24.0 24 | zope.interface==4.1.3 25 | -------------------------------------------------------------------------------- /scn/scnscraper/dataStoring.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Salvatore Cassano' 2 | 3 | from pydblite.pydblite import Base 4 | from items import SapItem 5 | import re 6 | import os.path 7 | 8 | class DataStoring(): 9 | 10 | #Inizialize an instantiated object by opening json file and the database 11 | def __init__(self): 12 | self.out_file = open("scnscraper/abap.json", "a") 13 | self.out_file.close() 14 | self.db = Base("scnscraper/abap.pydb") 15 | if self.db.exists(): 16 | self.db.open() 17 | else: 18 | self.db.create('url', 'uid', 'type', 'author', 'title', 'date_time', 'tags', 19 | 'views', 'answers', 'resolve', 'upvotes', 'text') 20 | 21 | #for each thread scraped, insert it into db 22 | def insert_items_into_db(self, threads): 23 | for thread in threads: 24 | item = SapItem() # New Item instance 25 | item = thread 26 | try: 27 | # Insert into db 28 | self.db.insert(url = str(item["url"]), uid = str(item["uid"]), type= str(item["type"] ), 29 | author=str(item["author"]), title = str(item["title"]), 30 | date_time = str(item["date_time"] ),tags = str(item["tags"] ), 31 | views = str(item["views"] ), answers 
= str(item["answers"] ), 32 | resolve = str(item["resolve"] ), upvotes = str(item["upvotes"] ), 33 | text = str(item["text"])) 34 | except UnicodeEncodeError: 35 | print("Unicode Encode Exception!") 36 | #save changes on disk 37 | self.db.commit() 38 | 39 | # for each thread scraped, initialize the string to insert into json file 40 | def threads_to_str(self, threads): 41 | out_string = "[ " 42 | if threads.__len__() == 0: 43 | return "" 44 | for thread in threads: 45 | item = SapItem() 46 | item = thread 47 | try: 48 | out_string += "{ url: '" + str(item["url"] ) + "', " + "uid: '" + str(item["uid"] ) + "', "\ 49 | "type: '" + str(item["type"] ) + "', "\ 50 | "author: '"+ str(item["author"]) + "', " \ 51 | "title: '"+ str(item["title"]) + "', "\ 52 | "date_time: '"+ str(item["date_time"] ) + "', " \ 53 | "tags: '"+ str(item["tags"] ) + "', " \ 54 | "views: '"+ str(item["views"] ) + "', "\ 55 | "answers: '"+ str(item["answers"] ) + "', " \ 56 | "resolve: '"+ str(item["resolve"] ) + "', " \ 57 | "upvotes: '"+ str(item["upvotes"] ) + "', "\ 58 | "text: '" + str(item["text"]) + "' }\n" 59 | except UnicodeEncodeError: 60 | print("Unicode Encode Exception!") 61 | 62 | out_string += " ]\n\n" 63 | return out_string 64 | 65 | 66 | #for each thread scraped, insert it into json file 67 | def insert_items_into_file(self, threads): 68 | try: 69 | self.out_file = open("scnscraper/abap.json", "a") # open in append mode 70 | #convert into string and insert into file 71 | self.out_file.write(self.threads_to_str(threads)) 72 | self.out_file.close() 73 | except: 74 | print('Exception in writing file') 75 | self.out_file.close() 76 | 77 | 78 | # read the web page index 79 | def read_index_from_file(self): 80 | if os.path.exists('scnscraper/index.txt'): 81 | with open('scnscraper/index.txt') as f: 82 | index = int(f.readline()) 83 | f.close() 84 | else: 85 | f = open('scnscraper/index.txt', 'w') 86 | index = 2 87 | f.write(str(index)) 88 | f.close() 89 | return index 90 | 91 | # Write the web page index 92 | def write_index_into_file(self, i): 93 | f = open('scnscraper/index.txt', 'w') 94 | f.write(str(i)) 95 | f.close() 96 | 97 | 98 | # Convert the content of json file into a new db 99 | def from_json_to_db(self): 100 | thread = '' 101 | db = Base("scnscraper/abap.pydb", save_to_file= True) 102 | # create new base with field names 103 | db.create('url', 'uid', 'type', 'author', 104 | 'title', 'date_time', 'tags', 'views', 105 | 'answers', 'resolve', 'upvotes', 'text', mode='override') 106 | i=0 107 | with open('scnsraper/threads.json', 'r') as file: 108 | for line in file: 109 | if(line.endswith(" }\n")): 110 | thread += line 111 | tokens = re.search(r"url:\s'(.*?)',\suid:\s'(.*?)',\stype:\s'(.*?)',\sauthor:\s'(.*?)',\stitle:\s'(.*?)',\sdate_time:\s'(.*?)',\stags:\s'(.*?)',\sviews:\s'(.*?)',\sanswers:\s'(.*?)',\sresolve:\s'(.*?)',\supvotes:\s'(.*?)', text:\s'((.|\n)*)'\s}", str(thread)) 112 | if tokens is not None: 113 | db.insert(url = tokens.group(1), uid = tokens.group(2), type= tokens.group(3), 114 | author=tokens.group(4), title = tokens.group(5), date_time = tokens.group(6), 115 | tags = tokens.group(7), views = tokens.group(8), answers = tokens.group(9), 116 | resolve = tokens.group(10), upvotes = tokens.group(11), text = tokens.group(12)) 117 | db.commit() 118 | print ('\n--------------------------------------------\n') 119 | thread = '' 120 | if(line.startswith(" ]")): 121 | print("new page") 122 | thread = '' 123 | if(line.endswith('\n') and (not line.startswith(" ]\n\n")) and (not line.endswith(" 
}\n"))): 124 | thread += line 125 | 126 | 127 | def state_extraction(): 128 | db = Base("scnscraper/abap.pydb") 129 | if db.exists(): 130 | db.open() 131 | record = db(type = "Question") 132 | print("# discussion scraped: " + str(record.__len__())) 133 | print("Answered: " + str(db(resolve = "Answered.").__len__())) 134 | print("Answered with solution: "+ str(db(resolve = "solution").__len__())) 135 | print("Not Answered: " + str(db(resolve = "Not Answered.").__len__())) 136 | print("Assumed Answered: " + str(db(resolve = "Assumed Answered.").__len__())) 137 | 138 | state_extraction = staticmethod(state_extraction) 139 | 140 | if __name__ == '__main__': 141 | DataStoring.state_extraction() 142 | -------------------------------------------------------------------------------- /scn/scnscraper/items.py: -------------------------------------------------------------------------------- 1 | 2 | from scrapy.item import Item, Field 3 | 4 | class SapItem(Item): 5 | uid = Field() # user id, unique and identifier for each post 6 | type = Field() # question, answer 7 | author = Field() 8 | title = Field() 9 | text = Field() 10 | date_time = Field() 11 | tags = Field() 12 | views = Field() 13 | answers = Field() # #answers 14 | resolve = Field() 15 | upvotes = Field() # likes 16 | url = Field() 17 | 18 | def __str__(self): 19 | return "Item(" + str(self['type']) + ") #" + str(self['uid']) 20 | -------------------------------------------------------------------------------- /scn/scnscraper/main.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Salvatore Cassano' 2 | 3 | from scraper import Scraper 4 | from dataStoring import DataStoring 5 | 6 | class MainApp(): 7 | 8 | 9 | if __name__ == '__main__': 10 | startUrl = "http://scn.sap.com/community/abap/content?filterID=contentstatus[published]~objecttype~objecttype[thread]&start=" 11 | storing = DataStoring() 12 | #read the input param 13 | i = storing.read_index_from_file() 14 | completeUrl = "" 15 | print("\n\n-------- SCRAPER STARTED ---\n") 16 | while (i<5000): 17 | #string concatenation to get the complete URL 18 | completeUrl = startUrl + str(20*i) 19 | #threads scraped from URL 20 | threads = [] 21 | print("------ SCRAPING NEW WEB PAGE (PAGE " + str(i) +") ---\n") 22 | SCNScraper = Scraper(completeUrl) 23 | #get threads 24 | threads = SCNScraper.scraping() 25 | #save content into json file 26 | storing.insert_items_into_file(threads) 27 | #save content into db 28 | storing.insert_items_into_db(threads) 29 | i = i+1 30 | #update index file 31 | storing.write_index_into_file(i) 32 | 33 | -------------------------------------------------------------------------------- /scn/scnscraper/scraper.py: -------------------------------------------------------------------------------- 1 | 2 | __author__ = 'Salvatore Cassano' 3 | 4 | import re 5 | from selenium import webdriver 6 | from selenium.webdriver.support.ui import WebDriverWait 7 | from selenium.webdriver.support import expected_conditions as EC 8 | from selenium.common.exceptions import TimeoutException 9 | from selenium.webdriver.common.by import By 10 | import time 11 | from items import SapItem 12 | 13 | class Scraper(): 14 | 15 | #Inizialize an instantiated object setting Firefox as browser and setting the url 16 | def __init__(self, url): 17 | #self.driver = webdriver.Firefox() 18 | try: 19 | self.driver = webdriver.PhantomJS('scnscraper/phantomjs.exe') 20 | except: 21 | print('Please insert Phantomjs into directory and try again. 
PRESS ENTER TO CONTINUE...\n') 22 | raw_input() 23 | self.driver.get(url) 24 | 25 | 26 | def scraping(self): 27 | driver = self.driver 28 | delay = 100 # number of seconds 29 | linkOccurrences = 0 # number of link to scrape in the page 30 | start_url = str(driver.current_url) 31 | page_state = self.driver.execute_script('return document.readyState;') #wait until page is ready 32 | print("Loading page content...") 33 | while True: #repeat until content is loaded from the server db 34 | try: 35 | #find and click on previous button 36 | web_page = driver.find_element_by_class_name('j-pagination-prev') 37 | web_page.click() 38 | #wait until the loading is ultimated 39 | time.sleep(WebDriverWait(driver, delay).until_not( 40 | EC.presence_of_element_located((By.CLASS_NAME, 'j-loading-container')))) 41 | except TimeoutException: 42 | print "Loading took too much time!" 43 | #takes the number of link that need to be considered, it excludes link of discussions that are not marked as 44 | #answered or not answered and link of discussions that have ANONYMOUS user. 45 | linkOccurrences = driver.find_elements_by_xpath("//tr[td[@class='j-td-icon' and " 46 | ".//img[@class = 'jive-icon-discussion-question jive-icon-med']] " 47 | "or td[@class='j-td-icon' and " 48 | ".//img[@class = 'jive-icon-discussion-correct jive-icon-med']]]" 49 | "[td[@class='j-td-author']/a]//td[@class = 'j-td-title']//a").__len__() 50 | if (linkOccurrences!=0): 51 | break 52 | print("Content loaded with success!\n") 53 | index = 0 #link occurrences iterator 54 | print("--- Scraping threads from web page's link ---\n") 55 | items = [] #items scraped, initializing output 56 | while index < linkOccurrences: 57 | #check if the url have an error, then stop the program 58 | if 'http://scn.sap.com/community/abap/content?start=' in str(driver.current_url): 59 | print("--- ERROR IN PAGE LOADING ---") 60 | return 61 | #takes the reference of link that need to be scape, it excludes link of discussions that are not marked as 62 | #answered or not answered and link of discussions that have ANONYMOUS user. 63 | link = driver.find_elements_by_xpath("//tr[td[@class='j-td-icon' and " 64 | ".//img[@class = 'jive-icon-discussion-question jive-icon-med']] " 65 | "or td[@class='j-td-icon' and " 66 | ".//img[@class = 'jive-icon-discussion-correct jive-icon-med']]]" 67 | "[td[@class='j-td-author']/a]//td[@class = 'j-td-title']//a")[index] 68 | web_page = link.click() #click the link selected 69 | #wait until page is loaded 70 | WebDriverWait(driver, delay).until(EC.presence_of_element_located((By.CLASS_NAME, 'js-original-header'))) 71 | resolve = [] # says if element is answered or not answered 72 | url = str(driver.current_url) 73 | try: 74 | #select the element [answered, not answered, assumed answered] 75 | element = driver.find_element_by_xpath("//header[@class='js-original-header']//p/strong").text.encode('utf8') 76 | except: 77 | time.sleep(4) 78 | #sleep until element is completely loaded 79 | try: 80 | #repeat the selection 81 | element = driver.find_element_by_xpath("//header[@class='js-original-header']//p/strong").text.encode('utf8') 82 | except: 83 | print('Element not Found') 84 | element = "Not Answered." 
85 | resolve.append(element) 86 | if(str(element).__eq__("Answered.")): 87 | #take the date of solution 88 | date = str(driver.find_element_by_xpath("//span[@class='font-color-meta j-line2']").text.encode('utf8')) 89 | solution_date = str(re.sub('by.*?on ', "", date)) 90 | try: 91 | #take the solution user 92 | solution_user = str(driver.find_element_by_xpath( 93 | "//span[@class='font-color-meta j-line2']/a").text.encode('utf8')) 94 | except: 95 | solution_user = 'ANONYMOUS' 96 | else: 97 | solution_date = "---" 98 | solution_user = "---" 99 | resolve.append(solution_date) 100 | resolve.append(solution_user) 101 | #select the number of post in a thread 102 | postOccurrences = driver.find_elements_by_xpath("//a[@class='jiveTT-hover-user jive-username-link']").__len__() 103 | i = 0 # number of occurrences iterator 104 | while i < postOccurrences: 105 | item = SapItem() # new Item instance 106 | try: 107 | # select the author in i position 108 | item["author"] = driver.find_elements_by_xpath("//a[@class='jiveTT-hover-user jive-username-link']")\ 109 | .pop(i).text.encode('utf8') 110 | except: 111 | item["author"] = 'ANONYMOUS' 112 | # select the url in i position 113 | item["url"] = url 114 | # generate the uid in i position 115 | item["uid"] = (str(url.replace("http://scn.sap.com/thread/", ""))) + "." + str(i+1) 116 | # select the title 117 | title = driver.find_element_by_xpath("//header[@class='js-original-header']//h1//a").text.encode('utf8') 118 | if(i==0): 119 | item["type"] = "Question" 120 | item["title"] = title 121 | else: 122 | item["type"] = "Answer" 123 | item["title"] = "re: " + title 124 | # select the text in i position 125 | if (str(element).__eq__("Answered.")) and (i>0): 126 | item["text"] = driver.find_elements_by_class_name("jive-rendered-content").pop(i+1).text.encode('utf8') 127 | else: 128 | item["text"] = driver.find_elements_by_class_name("jive-rendered-content").pop(i).text.encode('utf8') 129 | if (i==0): 130 | try: 131 | # select the date_time for question 132 | item["date_time"] = driver.find_elements_by_xpath("//span[@class='j-post-author']" 133 | ).pop(0).text.encode('utf8').split('\n', 1)[-1] 134 | except IndexError: 135 | #select and obtain the date_time from selector 136 | item["date_time"] = "" 137 | stringXpath = driver.find_elements_by_class_name('j-post-author ') 138 | date_extracted = stringXpath[i].text.encode('utf8') 139 | #regular expression to get from string selected the date_time 140 | list_of_re = re.findall('(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) (.*?) (AM|PM) ', 141 | str(date_extracted)) 142 | item["date_time"] = list_of_re.pop().__str__().replace("('", "").replace("', '", " ").replace("')", "") 143 | else: 144 | #select and obtain the date_time from selector 145 | item["date_time"] = "" 146 | stringXpath = driver.find_elements_by_class_name('j-post-author ') 147 | date_extracted = stringXpath[i].text.encode('utf8') 148 | try: 149 | #regular expression to get from string selected the date_time 150 | list_of_re = re.findall('(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) (.*?) 
(AM|PM) ', 151 | str(date_extracted)) 152 | item["date_time"] = list_of_re.pop().__str__().replace("('", "").replace("', '", " ").replace("')", "") 153 | except UnicodeEncodeError: 154 | item["date_time"] = date_extracted 155 | except IndexError: 156 | print("Index Exception") 157 | item["date_time"] = driver.find_elements_by_xpath("//span[@class='j-post-author']" 158 | ).pop(1).text.encode('utf8').split('\n', 1)[-1] 159 | if (i==0): 160 | # select the tags, if exists, for a question 161 | tags = driver.find_elements_by_class_name("jive-thread-post-details-tags") 162 | if len(tags) != 0: 163 | list_of_tags = [] 164 | for tags in tags: 165 | list_of_tags.append(tags.text.encode('utf8')) 166 | item["tags"] = list_of_tags 167 | else: 168 | item["tags"] = "null" 169 | else: 170 | item["tags"] = "null" 171 | if (i==0): 172 | # select the views for a question 173 | item["views"] = driver.find_elements_by_xpath("//span[@class='jive-content-footer-item']" 174 | ).pop(i).text.encode('utf8').replace(" Views", "") 175 | # select the answers for a question 176 | item["answers"] = postOccurrences-1 177 | # this attribute isn't available for answers, then it's set with a null value 178 | item["upvotes"] = "---" 179 | item["resolve"] = resolve[0] 180 | else: 181 | # this attribute isn't available for answers, then it's set with a null value 182 | item["views"] = 0 183 | # this attribute isn't available for answers, then it's set with a null value 184 | item["answers"] = "---" 185 | # select the upvotes for an answer 186 | item["upvotes"] = driver.find_element_by_class_name(" jive-acclaim-likedlink").text.encode('utf8') 187 | # check the resolve value 188 | if(str(resolve[0]).__eq__("Not Answered.")): 189 | # when discussion is Not Answered the solution not exists 190 | item["resolve"] = "---" 191 | else: 192 | # when the solution is Answered, check if the post i is solution by comparing 193 | # the author and the date_time with the author and the date of solution 194 | try: 195 | if (str(item["author"]).__eq__(resolve[2])) and (str(item["date_time"]).__eq__(resolve[1])): 196 | item["resolve"] = "solution" 197 | else: 198 | item["resolve"] = "---" 199 | except UnicodeEncodeError: 200 | item["resolve"] = "---" 201 | # append the thread scraped 202 | items.append(item) 203 | print("--- " + str(item) + " scraped ---") 204 | # go to the next link 205 | i=i+1 206 | # come back to the previous page (link's page) 207 | web_page = driver.back() 208 | # wait until the page element required is loaded 209 | WebDriverWait(driver, delay).until(EC.presence_of_element_located((By.CLASS_NAME, 'j-pagination-prev'))) 210 | while True: #repeat until content is loaded from the server db 211 | try: 212 | # find and click on previous button 213 | web_page = driver.find_element_by_class_name('j-pagination-prev') 214 | web_page.click() 215 | # wait until the loading is ultimated 216 | time.sleep(WebDriverWait(driver, delay).until_not(EC.presence_of_element_located 217 | ((By.CLASS_NAME, 'j-loading-container')))) 218 | except TimeoutException: 219 | print "Loading took too much time!" 
220 | condition = driver.find_elements_by_xpath("//tr[td[@class='j-td-icon' and " 221 | ".//img[@class = 'jive-icon-discussion-question jive-icon-med']]" 222 | " or td[@class='j-td-icon' and " 223 | ".//img[@class = 'jive-icon-discussion-correct jive-icon-med']]]" 224 | "[td[@class='j-td-author']/a]//td[@class = 'j-td-title']//a").__len__() 225 | # check is the loading is terminated with success, then go next 226 | if (condition!=0): 227 | break 228 | #repeat until content is loaded from the server db 229 | while True: 230 | try: 231 | # find and click on previous button 232 | web_page = driver.find_element_by_class_name('j-pagination-next') 233 | web_page.click() 234 | # wait until the loading is ultimated 235 | time.sleep(WebDriverWait(driver, delay).until_not(EC.presence_of_element_located 236 | ((By.CLASS_NAME, 'j-loading-container')))) 237 | except TimeoutException: 238 | print "Loading took too much time!" 239 | condition = driver.find_elements_by_xpath("//tr[td[@class='j-td-icon' and " 240 | ".//img[@class = 'jive-icon-discussion-question jive-icon-med']] " 241 | "or td[@class='j-td-icon' and " 242 | ".//img[@class = 'jive-icon-discussion-correct jive-icon-med']]]" 243 | "[td[@class='j-td-author']/a]//td[@class = 'j-td-title']//a").__len__() 244 | # check is the loading is terminated with success, then go next 245 | if (condition!=0): 246 | break 247 | # increment the links page iterator 248 | index = index + 1 249 | print("\n--- Threads scraped with success! ---") 250 | print("\n--- Going to another page... ---\n") 251 | #close the web page 252 | driver.close() 253 | return(items) 254 | -------------------------------------------------------------------------------- /yahoo-answers/README.md: -------------------------------------------------------------------------------- 1 |

Yahoo! Answer scraper

2 | -------- 3 |

This work provides web-scraping scripts developed in Python 2.7. They aim to extract questions and answers from the "Programming & Design" category of the Yahoo! Answers website.

4 | 5 |

There are two main scripts:

6 | * yahoourlextractor 7 | * yahooscraper 8 | 9 |
yahoourlextractor
10 | Provides the crawling mechanics needed to collect as many URLs of Programming & Design question threads as possible. 11 | The script uses Selenium WebDriver to handle the "Infinite Scroll" on the P&D homepage and Scrapy to scrape URLs from the other elements available in question-thread pages. 12 | All the URLs are stored in a PyDbLite database, together with the question insertion date when present. 13 | 14 |
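The infinite-scroll handling is, in essence, a scroll-and-wait loop that stops when the page height no longer grows. This is only an illustrative sketch, not the actual spider code; the category URL is a placeholder:

```python
# Illustrative sketch of the "Infinite Scroll" handling with Selenium + PhantomJS.
import time
from selenium import webdriver

driver = webdriver.PhantomJS('phantomjs')          # path to the PhantomJS binary
driver.get('https://answers.yahoo.com/dir/index')  # placeholder for the P&D category URL

last_height = driver.execute_script('return document.body.scrollHeight;')
while True:
    # Scroll to the bottom and give the page time to append more question threads.
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
    time.sleep(2)
    new_height = driver.execute_script('return document.body.scrollHeight;')
    if new_height == last_height:  # nothing new was loaded: the scroll is exhausted
        break
    last_height = new_height

# Collect the candidate question-thread URLs exposed so far.
links = [a.get_attribute('href') for a in driver.find_elements_by_tag_name('a')]
driver.quit()
```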
yahooscraper
15 | This script uses the database produced by yahoourlextractor to start the scraping process of the questions and answers. 16 | For every URL read from the database, it sends multiple Scrapy requests. Every question and answer becomes a Scrapy Item with a precise structure and is processed by a Scrapy pipeline, which stores the items in a new database called QuestionExtracted.pdl. 17 | 18 |
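Conceptually, the spider turns each stored URL into a Scrapy request. Below is a minimal sketch of that step only, not the real YahooScraper spider; the record field name `url` is an assumption made for illustration:

```python
# Minimal sketch: read the URLs collected by yahoourlextractor from the
# PyDbLite base and turn each one into a Scrapy request.
import scrapy
from pydblite.pydblite import Base

class ThreadSketchSpider(scrapy.Spider):
    name = 'thread_sketch'

    def start_requests(self):
        db = Base('URL_Database.pdl')
        db.open()
        for record in db:  # one record per question-thread URL ('url' field assumed)
            yield scrapy.Request(record['url'], callback=self.parse)

    def parse(self, response):
        # The real spider builds one item per question and per answer here and
        # lets the pipeline store them into the output .pdl database.
        pass
```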

Installation

19 | -------- 20 | 21 | 1. Download the content of this directory 22 | 2. Install all the requirements with: `pip install -r requirements.txt` 23 | 3. Download [PhantomJS](http://phantomjs.org/) (for Windows or OSX) and unzip 24 | 4. Move the PhantomJS binary into `yahoourlextractor/YahooUrlSearcher/spiders` 25 | 26 |

Start with the scripts

27 | --- 28 | 29 | 1. Start the first shell script `/yahoo-answer/yahoourlextractor.sh` to obtain the URL database, called `URL_Database.pdl` 30 | 2. Move `URL_Database.pdl`, or another database obtained with the yahoourlextractor script, into /yahoo-answer/yahooscraper/spiders 31 | 3. Start the second shell script `/yahoo-answer/yahooscraper.sh`; this script needs one argument, the name of the URL database. 32 | 33 | In `yahoourlextractor/YahooUrlSearcher/spiders` you obtain the database containing the questions and answers scraped from Yahoo! Answers. By default the name of this DB is `QuestionThreadExtracted.pdl`. The script also provides a .txt log about the amount of scraped data and a JSON file with the items stored in the DB. 34 | 35 |
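After a run, the output database can be sanity-checked directly with PyDbLite. A small sketch follows; the lowercase 'question'/'answer' type values match what the discretizer below expects:

```python
# Quick inspection of the scraping output; run it from the directory that
# contains the .pdl file produced by yahooscraper.
from pydblite.pydblite import Base

db = Base('QuestionThreadExtracted.pdl')
db.open()
print('records scraped: %d' % len(db))
print('questions: %d' % len(db(type='question')))
print('answers:   %d' % len(db(type='answer')))
```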

Notes

36 | --- 37 | In the `yahoourlextractor/YahooUrlSearcher/spiders` dir are present an example URL database called `example_database.pdl`. So it's possibile run a test from command line using `cd /yahoo-answer/yahooscraper.sh` and `./yahooscraper.sh example_database.pdl` command. 38 | 39 | 40 | -------------------------------------------------------------------------------- /yahoo-answers/discretizer/discretizer.py: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | Compatible with Python 2 and Python 3 4 | """ 5 | 6 | import csv 7 | import logging 8 | import os 9 | import re 10 | from math import log 11 | 12 | from dateutil.parser import parse as parse_date 13 | from nltk import FreqDist 14 | from nltk import PorterStemmer 15 | from nltk import tokenize 16 | from nltk import word_tokenize 17 | from pydblite.pydblite import Base 18 | 19 | 20 | class Discretizer: 21 | logging.basicConfig(level=logging.DEBUG) 22 | logger = logging.getLogger(__name__) 23 | linesep = '\n' 24 | 25 | def __init__(self, db_name, db_files): 26 | self.db_name = db_name 27 | self.db_files = db_files 28 | self.db = dict() 29 | 30 | def log(self, msg, level=logging.DEBUG): 31 | self.logger.log(level, msg) 32 | 33 | def load_db(self, check=True, fix=False, save_to_file=False): 34 | self.log('Opening {0} database(s)'.format(len(self.db_files)), logging.INFO) 35 | for db_name, db_file in self.db_files.items(): 36 | _db = Base(db_file, save_to_file=save_to_file) 37 | _db.open() 38 | self.log('Database {0} opened, records #: {1}'.format(db_name, len(_db)), logging.DEBUG) 39 | self.db.update({db_name: _db}) 40 | _db.create_index('uid') 41 | _db.create_index('type') 42 | if check is True: 43 | self.check_db(fix) 44 | 45 | """ 46 | * fix answers_count with actual # of answers exported 47 | * if an answer has tag != N/A, the tags must be applied to the question in the same thread 48 | * if a question is marked as resolved True, then one of the answers in the thread must have been marked as solution; 49 | and viceversa; 50 | * check if Q or A text is '' 51 | * turn question uid from int to unicode string 52 | """ 53 | 54 | def check_db(self, fix=False): 55 | self.log('Checking consistency for databases.', logging.INFO) 56 | for name, _db in self.db.items(): 57 | for question in _db._type['question']: 58 | expected_answers_count = int(question['answers']) 59 | actual_answers_count = 0 60 | for i in range(1, expected_answers_count + 1): 61 | try: 62 | _db._uid[str(question['uid']) + '.' + str(i)][0] 63 | actual_answers_count += 1 64 | except IndexError: 65 | break 66 | if actual_answers_count < expected_answers_count: 67 | self.log('Fixing answers count mismatch in thread id {0}, expected {1}, found {2}'. 68 | format(question['uid'], expected_answers_count, actual_answers_count)) 69 | _db.update(question, answers=actual_answers_count) 70 | 71 | for record in (_db('text') == ''): 72 | self.log('Warning on record {0} from db {1}: empty text!'.format(record['uid'], name), 73 | logging.WARNING) 74 | 75 | for record in (_db('type') == 'answer') & (_db('tags') != 'N/A'): 76 | self.log('Warning on record {0} from db {1}: tags in answer!'.format(record['uid'], name), 77 | logging.WARNING) 78 | question_uid = record['uid'].split('.')[0] 79 | question = _db._uid[question_uid][0] 80 | question_tags = question['tags'] + '.' 
+ record['tags'] 81 | _db.update(question, tags=question_tags) 82 | 83 | if fix is True: 84 | _db.commit() 85 | 86 | def load_threads(self): 87 | self.log('Loading threads from {0} db(s)'.format(len(self.db_files)), logging.INFO) 88 | overall_threads = list() 89 | for name, _db in self.db.items(): 90 | db_threads = list() 91 | questions = _db._type['question'] # use db index 92 | self.log('Loaded {0} questions (threads) from db {1}, attaching answers...'.format(len(questions), name), 93 | logging.DEBUG) 94 | for question in questions: 95 | answers = self._get_answers(question['uid'], int(question['answers']), _db) 96 | db_threads.append({'question': question, 'question_uid': question['uid'], 97 | 'date_time': question['date_time'], 'answers_count': question['answers'], 98 | 'resolved': question['resolve'], 'tags': question['tags'], 'answers': answers}) 99 | 100 | overall_threads.extend(db_threads) 101 | self.log('Overall threads loaded: {0} from {1} database(s)'.format(len(overall_threads), len(self.db_files))) 102 | return overall_threads 103 | 104 | def _get_answers(self, question_id, answers_count, _db): 105 | self.log('Getting {0} answers for thread id {1}'.format(answers_count, question_id), logging.DEBUG) 106 | answers = list() 107 | if answers_count > 0: 108 | for i in range(1, answers_count + 1): 109 | answer_id = '{0}.{1}'.format(question_id, i) 110 | for answer in (_db._uid[answer_id]): # use index 111 | answers.append(answer) 112 | if answers_count != len(answers): 113 | self.log('Warning in thread id {0}: loaded {1} answers, expected {2}. Please, run a check db with ' 114 | 'fix=True'.format(question_id, len(answers), answers_count), 115 | logging.WARNING) 116 | return answers 117 | 118 | def compute_features(self, threads, stemmed_vocabulary, distrib_matrix): 119 | self.log('Computing features. Please, wait. 
This will take some serious time...', logging.INFO) 120 | for thread in threads: 121 | self.log('Computing features for thread id {0}'.format(thread['question_uid']), logging.INFO) 122 | try: 123 | base_date = parse_date(thread['date_time']) 124 | except ValueError: 125 | base_date = parse_date('1970-01-01') 126 | except AttributeError: 127 | base_date = thread['date_time'] 128 | answers = thread['answers'] 129 | tag_list = thread['tags'].split('.') 130 | if '' in tag_list: 131 | tag_list.remove('') 132 | for answer in answers: 133 | # compute thread tags 134 | answer_tags = answer['tags'].split() 135 | if 'N/A' in answer_tags: 136 | answer_tags.remove('N/A') 137 | tag_list.extend(answer_tags) 138 | thread['tags'] = sorted(set(tag_list)) 139 | 140 | # compute len in chars and words 141 | alen = len(answer['text']) 142 | answer['len'] = alen 143 | wordcount = Discretizer._count_words(answer['text']) 144 | answer['wordcount'] = wordcount 145 | if wordcount == 0: 146 | answer['avg_chars_per_word'] = 0 147 | else: 148 | answer['avg_chars_per_word'] = "{0:.2f}".format(alen / float(wordcount)) # float with 2 decimals 149 | try: 150 | sentences = tokenize.sent_tokenize(answer['text'].decode('utf-8', 'replace').encode('ascii', 'replace'), 151 | language='english') 152 | except (AttributeError, TypeError) as e: 153 | sentences = tokenize.sent_tokenize(str(answer['text']), language='english') 154 | sentence_count = len(sentences) 155 | answer['sentences'] = sentence_count 156 | if sentence_count == 0: 157 | words_per_sentence = 0 158 | else: 159 | words_per_sentence = "{0:.2f}".format(wordcount / float(sentence_count)) 160 | answer['avg_words_per_sentence'] = words_per_sentence 161 | longest_sentence = 0 162 | for s in sentences: 163 | l = Discretizer._count_words(s) 164 | if l > longest_sentence: 165 | longest_sentence = l 166 | answer['longest_sentence'] = longest_sentence 167 | try: 168 | creation_date = parse_date(answer['date_time']) 169 | except AttributeError: 170 | creation_date = answer['date_time'] 171 | time_difference = abs((creation_date - base_date).total_seconds()) 172 | answer['time_difference'] = time_difference 173 | 174 | # TODO upvotes score 175 | 176 | # check for urls and code snippets 177 | match = re.search(r'http(s)?://', str(answer['text']), re.MULTILINE) 178 | if match: 179 | answer['has_links'] = True 180 | else: 181 | answer['has_links'] = False 182 | 183 | answer['has_code_snippet'] = self._has_codesnippet(str(answer['text'])) 184 | try: 185 | LL = Discretizer._log_likelihood(answer['text'].decode('utf-8', 'replace').encode('ascii', 'replace'), 186 | stemmed_vocabulary, distrib_matrix) 187 | except (AttributeError, TypeError) as e: 188 | LL = Discretizer._log_likelihood(str(answer['text']), stemmed_vocabulary, distrib_matrix) 189 | answer['loglikelihood'] = LL 190 | answer['loglikelihood_descending'] = LL 191 | answer['loglikelihood_ascending'] = LL 192 | try: 193 | aspw = Discretizer._ASPW(answer['text'].decode('utf-8', 'replace').encode('ascii', 'replace')) 194 | except (AttributeError, TypeError) as e: 195 | aspw = Discretizer._ASPW(str(answer['text'])) 196 | fk = Discretizer._FK(answer['avg_words_per_sentence'], aspw) 197 | answer['F-K'] = fk 198 | answer['F-K_descending'] = fk 199 | answer['F-K_ascending'] = fk 200 | 201 | # compute ranks 202 | answers = Discretizer._sort_rank(answers, 'upvotes', reverse=True) 203 | answers = Discretizer._sort_rank(answers, 'sentences', reverse=True) 204 | answers = Discretizer._sort_rank(answers, 'len', reverse=True) 205 | answers 
= Discretizer._sort_rank(answers, 'views', reverse=True) 206 | answers = Discretizer._sort_rank(answers, 'wordcount', reverse=True) 207 | answers = Discretizer._sort_rank(answers, 'avg_chars_per_word', reverse=True) 208 | answers = Discretizer._sort_rank(answers, 'avg_words_per_sentence', reverse=True) 209 | answers = Discretizer._sort_rank(answers, 'longest_sentence', reverse=True) 210 | answers = Discretizer._sort_rank(answers, 'time_difference', reverse=False) 211 | answers = Discretizer._sort_rank(answers, 'loglikelihood_descending', reverse=True) 212 | answers = Discretizer._sort_rank(answers, 'loglikelihood_ascending', reverse=False) 213 | answers = Discretizer._sort_rank(answers, 'F-K_descending', reverse=True) 214 | answers = Discretizer._sort_rank(answers, 'F-K_ascending', reverse=False) 215 | thread['answers'] = answers 216 | 217 | self.log('Done computing features for {0} threads'.format(len(threads)), logging.INFO) 218 | return threads 219 | 220 | @staticmethod 221 | def _ASPW(text): 222 | aspw = 0 223 | for word in text.split(): 224 | s = Discretizer._count_syllables(word) 225 | aspw += s 226 | return aspw 227 | 228 | @staticmethod 229 | def _count_syllables(word): 230 | vowels = ['a', 'e', 'i', 'o', 'u', 'y'] 231 | currentWord = list(word) 232 | numVowels = 0 233 | lastWasVowel = False 234 | for wc in currentWord: 235 | foundVowel = False 236 | for v in vowels: 237 | # don't count diphthongs 238 | if (v == wc) and lastWasVowel is True: 239 | foundVowel = True 240 | lastWasVowel = True 241 | break 242 | elif (v == wc) and lastWasVowel is False: 243 | numVowels += 1 244 | foundVowel = True 245 | lastWasVowel = True 246 | break 247 | 248 | # If full cycle and no vowel found, set lastWasVowel to false; 249 | if not foundVowel: 250 | lastWasVowel = False 251 | 252 | # Remove es, it's _usually? 
silent 253 | if (len(word) > 2) and (word[len(word)-2:] == "es"): 254 | numVowels -= 1 255 | # remove silent e 256 | elif (len(word) > 1) and (word[len(word)-1:] == "e"): 257 | numVowels -= 1 258 | return numVowels 259 | 260 | @staticmethod 261 | def _FK(awps, asps): 262 | fk = (0.39 * float(awps)) + (11.8 * float(asps)) - 15.59 263 | return fk 264 | 265 | @staticmethod 266 | def _log_likelihood(answer_text, stemmed_vocabulary, distrib_matrix): 267 | LL = 0 268 | if answer_text is not '': 269 | tokens = word_tokenize(str(answer_text), language='english') 270 | porter_stemmer = PorterStemmer() 271 | unique_wordcount = len(stemmed_vocabulary) 272 | """ 273 | per ogni w unica print_function words 274 | Cw = conta w in answer_text 275 | PwM = self.distrib_matrix[stemmer(w)] 276 | unique_wordcount = len(tokenize(answer_text) 277 | """ 278 | for w in tokens: 279 | _w = w.strip().lower() 280 | Cw = 0 281 | for _ in answer_text.split(): 282 | if _w == _.strip().lower(): 283 | Cw += 1 284 | 285 | try: 286 | w_stem = porter_stemmer.stem(_w.decode('utf-8', 'replace').encode('ascii', 'replace')) 287 | except AttributeError: 288 | w_stem = porter_stemmer.stem(_w) 289 | try: 290 | PwM = distrib_matrix[w_stem] 291 | except KeyError: # key error means frequency is equal to cutoff point 1 292 | PwM = 1 293 | LL += (Cw * log(float(PwM))) 294 | 295 | try: 296 | LL = "{0:.2f}".format(LL / float(unique_wordcount)) 297 | except ZeroDivisionError: 298 | LL = 0 299 | 300 | return LL 301 | 302 | @staticmethod 303 | def _count_words(text): 304 | wordcount = 0 305 | for word in text.split(): 306 | wordcount += 1 307 | return wordcount 308 | 309 | @staticmethod 310 | def _sort_rank(answers, key, reverse=True): 311 | new_list = sorted(answers, key=lambda x: float(x[key]), reverse=reverse) 312 | ranks = dict() 313 | for i in range(0, len(answers)): 314 | ranks[new_list[i]['uid']] = i + 1 315 | 316 | # fix rank ties 317 | for i in range(0, len(answers)-1): 318 | if new_list[i][key] == new_list[i+1][key]: 319 | ranks[new_list[i+1]['uid']] = ranks[new_list[i]['uid']] 320 | 321 | for k, v in ranks.items(): 322 | for a in answers: 323 | if a['uid'] == k: 324 | a['{0}_rank'.format(key)] = v 325 | return answers 326 | 327 | def _has_codesnippet(self, text): 328 | code = False 329 | if re.search(r'({|}| package |\.jar| class | namespace |exception |<<| end | def |<\?php| soap | cutoff} 434 | return reduced 435 | 436 | def save_csv(self, threads): 437 | fout = '{0}_features.csv'.format(self.db_name) 438 | self.log('Saving features into {0}'.format(fout), logging.INFO) 439 | csvf = open(fout, 'wt') 440 | fields = ('resolved', 'question_uid', 'answers_count', 'answer_uid', 441 | 'date_time', 'time_difference', 'time_difference_rank', 'solution', 'len', 'len_rank', 'wordcount', 442 | 'wordcount_rank', 'avg_chars_per_word', 'avg_chars_per_word_rank', 'sentences', 'sentences_rank', 443 | 'avg_words_per_sentence', 'avg_words_per_sentence_rank', 'longest_sentence', 'longest_sentence_rank', 444 | 'views', 'views_rank', 'loglikelihood', 'loglikelihood_ascending_rank', 445 | 'loglikelihood_descending_rank', 'F-K', 'F-K_ascending_rank', 'F-K_descending_rank', 'upvotes', 446 | 'upvotes_rank', 'has_links', 'has_code_snippet', 'has_tags') 447 | writer = csv.DictWriter(csvf, dialect=csv.excel, fieldnames=fields, delimiter=';', lineterminator=self.linesep) 448 | writer.writeheader() 449 | # empty_line = dict.fromkeys(fields) 450 | for t in threads: 451 | row = dict() 452 | row.fromkeys(fields) 453 | answers = t['answers'] 454 | # question with 
no answers are excluded 455 | i = 0 456 | for a in answers: 457 | i += 1 458 | if i == 1: 459 | row['resolved'] = t['resolved'] 460 | row['question_uid'] = t['question_uid'] 461 | if len(t['tags']) > 0: 462 | row['has_tags'] = True 463 | else: 464 | row['has_tags'] = False 465 | else: 466 | row['resolved'] = '' 467 | row['question_uid'] = '' 468 | row['answers_count'] = t['answers_count'] 469 | row['answer_uid'] = a['uid'] 470 | row['time_difference'] = a['time_difference'] 471 | row['time_difference_rank'] = a['time_difference_rank'] 472 | if a['resolve'] == 'solution': 473 | row['solution'] = True 474 | else: 475 | row['solution'] = False 476 | row['len'] = a['len'] 477 | row['len_rank'] = a['len_rank'] 478 | row['wordcount'] = a['wordcount'] 479 | row['wordcount_rank'] = a['wordcount_rank'] 480 | row['avg_chars_per_word'] = a['avg_chars_per_word'] 481 | row['avg_chars_per_word_rank'] = a['avg_chars_per_word_rank'] 482 | row['sentences'] = a['sentences'] 483 | row['sentences_rank'] = a['sentences_rank'] 484 | row['avg_words_per_sentence'] = a['avg_words_per_sentence'] 485 | row['avg_words_per_sentence_rank'] = a['avg_words_per_sentence_rank'] 486 | row['longest_sentence'] = a['longest_sentence'] 487 | row['longest_sentence_rank'] = a['longest_sentence_rank'] 488 | row['views'] = a['views'] 489 | row['views_rank'] = a['views_rank'] 490 | row['loglikelihood'] = a['loglikelihood'] 491 | row['loglikelihood_descending_rank'] = a['loglikelihood_descending_rank'] 492 | row['loglikelihood_ascending_rank'] = a['loglikelihood_ascending_rank'] 493 | row['F-K'] = a['F-K'] 494 | row['F-K_descending_rank'] = a['F-K_descending_rank'] 495 | row['F-K_ascending_rank'] = a['F-K_ascending_rank'] 496 | row['upvotes'] = a['upvotes'] 497 | row['upvotes_rank'] = a['upvotes_rank'] 498 | row['has_links'] = a['has_links'] 499 | row['has_code_snippet'] = a['has_code_snippet'] 500 | row['date_time'] = a['date_time'] 501 | writer.writerow(row) 502 | #writer.writerow(empty_line) 503 | csvf.close() -------------------------------------------------------------------------------- /yahoo-answers/discretizer/main.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | from discretizer import Discretizer 5 | 6 | def main(): 7 | program_name = os.path.basename(sys.argv[0]) 8 | #Database name 9 | db_files = {'yahoo': 'no_date_database.pdl'} 10 | try: 11 | db_names = sys.argv[1] 12 | except IndexError: 13 | raise Exception('No db name. Please, re-run as {0} dbname.pdl'.format(program_name)) 14 | 15 | if db_names == 'all': 16 | discretizer = Discretizer(db_names, db_files) 17 | else: 18 | try: 19 | discretizer = Discretizer(db_names, {db_names: db_files.get(db_names)}) 20 | except KeyError: 21 | raise Exception('Invalid db name {0}. 
Please, check the name and re-run.'.format(db_names)) 22 | 23 | discretizer.load_db(check=False, fix=False, save_to_file=False) 24 | 25 | corpus = discretizer.build_corpus() 26 | stems = discretizer.build_stems(corpus) 27 | stemmed_vocabulary = discretizer.build_vocabulary(stems) 28 | distib_matrix = discretizer.build_distribution_matrix(stems) 29 | 30 | # grouping 31 | threads = discretizer.load_threads() 32 | # discretization and sorting 33 | threads = discretizer.compute_features(threads, stemmed_vocabulary, distib_matrix) 34 | discretizer.save_csv(threads) 35 | 36 | 37 | if __name__ == "__main__": 38 | sys.exit(main()) 39 | """db = Base('dotnet-v1.pydb', save_to_file=False) 40 | db.open() 41 | #recs = [r for r in db if r('type') == 'question' and r('answers') > 0] 42 | rec = (db("type") == 'question') & (db("answers") > 0) 43 | print len(rec)""" 44 | 45 | -------------------------------------------------------------------------------- /yahoo-answers/requirements.txt: -------------------------------------------------------------------------------- 1 | appnope==0.1.0 2 | backports.ssl-match-hostname==3.4.0.2 3 | certifi==2015.9.6.2 4 | cffi==1.2.1 5 | characteristic==14.3.0 6 | ChatterBot==0.2.5 7 | cryptography==1.0.1 8 | cssselect==0.9.1 9 | decorator==4.0.2 10 | enum34==1.0.4 11 | funcsigs==0.4 12 | functools32==3.2.3.post2 13 | fuzzywuzzy==0.6.2 14 | gnureadline==6.3.3 15 | html2text==2015.6.21 16 | idna==2.0 17 | ipaddress==1.0.14 18 | ipykernel==4.0.3 19 | ipython==4.0.0 20 | ipython-genutils==0.1.0 21 | ipywidgets==4.0.3 22 | Jinja2==2.8 23 | jsondatabase==0.0.6 24 | jsonschema==2.5.1 25 | jupyter==1.0.0 26 | jupyter-client==4.0.0 27 | jupyter-console==4.0.2 28 | jupyter-core==4.0.6 29 | lxml==3.5.0b1 30 | MarkupSafe==0.23 31 | matplotlib==1.4.3 32 | mechanize==0.2.5 33 | mistune==0.7.1 34 | mock==1.3.0 35 | nbconvert==4.0.0 36 | nbformat==4.0.0 37 | nltk==3.1 38 | nose==1.3.7 39 | notebook==4.0.5 40 | numpy==1.10.1 41 | oauthlib==1.0.3 42 | parsedatetime==1.5 43 | path.py==8.1.1 44 | pbr==1.8.1 45 | pexpect==3.3 46 | pickleshare==0.5 47 | ptyprocess==0.5 48 | pyasn1==0.1.8 49 | pyasn1-modules==0.0.7 50 | pycparser==2.14 51 | PyDbLite==3.0.2 52 | Pygments==2.0.2 53 | pyOpenSSL==0.15.1 54 | pyparsing==2.0.3 55 | python-dateutil==2.4.2 56 | pytz==2015.6 57 | pyzmq==14.7.0 58 | qtconsole==4.0.1 59 | queuelib==1.4.2 60 | requests==2.7.0 61 | requests-oauthlib==0.5.0 62 | Scrapy==1.0.3 63 | selenium==2.47.3 64 | service-identity==14.0.0 65 | simplegeneric==0.8.1 66 | six==1.9.0 67 | stripogram==1.5 68 | terminado==0.5 69 | tornado==4.2.1 70 | traitlets==4.0.0 71 | Twisted==15.4.0 72 | virtualenv==13.1.2 73 | w3lib==1.12.0 74 | wheel==0.24.0 75 | zope.interface==4.1.2 76 | -------------------------------------------------------------------------------- /yahoo-answers/yahooscraper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # sh yahooscraper.sh 3 | 4 | if [ -z "$1" ] 5 | then 6 | echo "ERROR you must enter one arg related to the Yahoo URL DB use -h for Help" 7 | else 8 | if [ "$1" = "-h" ] 9 | then 10 | echo "This script need the name of the database containing question URLs" 11 | echo "- sh yahooscraper.sh " 12 | else 13 | echo "Reading from $1 database " 14 | cd yahooscraper/yahooscraper/yahooscraper/spiders 15 | scrapy crawl yahoo -o question-and-answer-report.json -a database_name=$1 16 | fi 17 | fi -------------------------------------------------------------------------------- 
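Note: after `yahooscraper.sh` completes, the scraped items are persisted by the pipeline into a PyDbLite file (`QuestionThreadExtracted.pdl`, see `pipelines.py` below), which is the same kind of `.pdl` database the discretizer reads. A minimal sketch of how such a file can be inspected from Python is shown here; the script name and the fields printed are illustrative, and only PyDbLite calls already used elsewhere in this repository are assumed:

```python
# inspect_pdl.py -- minimal sketch for peeking into a scraped .pdl database (illustrative)
from pydblite import Base

db = Base('QuestionThreadExtracted.pdl')  # file written by DBPipeline
db.open()                                 # load the existing records
db.create_index('type')                   # enables the db._type[...] lookup used by the discretizer

questions = db._type['question']          # all question records, via the index
answers = (db('type') == 'answer')        # filter syntax, as in Discretizer.check_db()
print('{0} questions, {1} answers'.format(len(questions), len(answers)))

for q in questions[:3]:                   # show a few threads
    print('{0}: {1} answers, resolved={2}'.format(q['uid'], q['answers'], q['resolve']))
```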
/yahoo-answers/yahooscraper/yahooscraper/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = yahooscraper.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = yahooscraper 12 | -------------------------------------------------------------------------------- /yahoo-answers/yahooscraper/yahooscraper/yahooscraper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collab-uniba/qa-scrapers/b26ece3f210d3dcdfd7f2045193e3258cae5b4b4/yahoo-answers/yahooscraper/yahooscraper/yahooscraper/__init__.py -------------------------------------------------------------------------------- /yahoo-answers/yahooscraper/yahooscraper/yahooscraper/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class YahooItem(scrapy.Item): 12 | uid = scrapy.Field() 13 | type = scrapy.Field() 14 | author = scrapy.Field() 15 | title = scrapy.Field() 16 | text = scrapy.Field() 17 | date_time = scrapy.Field() 18 | tags = scrapy.Field() 19 | views = scrapy.Field() 20 | answers = scrapy.Field() 21 | resolve = scrapy.Field() 22 | upvotes = scrapy.Field() 23 | url = scrapy.Field() 24 | -------------------------------------------------------------------------------- /yahoo-answers/yahooscraper/yahooscraper/yahooscraper/pipelines.py: -------------------------------------------------------------------------------- 1 | from pydblite import Base 2 | from scrapy.xlib.pydispatch import dispatcher 3 | from scrapy import signals 4 | import codecs 5 | import datetime 6 | 7 | class DBPipeline(object): 8 | def __init__(self): 9 | 10 | #Creating log file 11 | filename = "session_log.txt" 12 | self.log_target = codecs.open(filename, 'a+', encoding='utf-8') 13 | self.log_target.truncate() 14 | self.log_target.write("***New session started at: "+ str(datetime.datetime.strftime(datetime.datetime.now(), ' %Y-%m-%d %H:%M:%S ')) + " ***" +"\n") 15 | 16 | #Creating database for items 17 | self.db = Base('QuestionThreadExtracted.pdl') 18 | self.db.create('uid', 'type', 'author', 'title', 'text', 'date_time', 19 | 'tags', 'views', 'answers', 'resolve', 'upvotes', 'url', mode="open") 20 | 21 | #Some data for the log file 22 | self.number_of_questions = 0 23 | self.number_of_answers = 0 24 | self.last_id=0 25 | dispatcher.connect(self.spider_closed, signals.spider_closed) 26 | 27 | 28 | def process_item(self, item, spider): 29 | 30 | self.db.insert(uid=item['uid'], 31 | type=item['type'], 32 | author=item['author'], 33 | title=item['title'], 34 | text=item['text'], 35 | date_time=item['date_time'], 36 | tags=item['tags'], 37 | views=item['views'], 38 | answers=item['answers'], 39 | resolve=item['resolve'], 40 | upvotes=item['upvotes'], 41 | url=item['url'] 42 | ) 43 | #Count questions and answers 44 | if "question" in item['type']: 45 | self.number_of_questions+=1 46 | if self.last_id