├── trademarks
│   ├── __init__.py
│   ├── spiders
│   │   ├── __init__.py
│   │   └── foreign_spider.py
│   ├── pipelines.py
│   ├── items.py
│   └── settings.py
├── requirements.txt
├── .gitignore
├── README.md
└── scrapy.cfg

/trademarks/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
Scrapy>=1.2.0
selenium<4  # the spider uses the pre-Selenium-4 find_element_by_* API
--------------------------------------------------------------------------------
/trademarks/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.DS_Store
*.pyc
*.swp
*.swo
.tox
*.egg-info
docs/_build
dist
.coverage
build
.proof
.ipynb_checkpoints
.idea
__pycache__
.scrapy
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# trademarks

Scraper for tech company trademarks filed in foreign nations

## Setup

The `foreign` spider drives a real Firefox browser through Selenium, so Firefox
must be installed locally (along with geckodriver if your Selenium version
requires it).

```
mkvirtualenv trademarks
pip install -r requirements.txt
```

## Running

```
rm -f foreign.csv && scrapy crawl foreign -o foreign.csv
```

The CSV is removed first because `scrapy crawl -o` appends to an existing file
rather than overwriting it.
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = trademarks.settings

[deploy]
#url = http://localhost:6800/
project = trademarks
--------------------------------------------------------------------------------
/trademarks/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class TrademarksPipeline(object):
    def process_item(self, item, spider):
        return item
--------------------------------------------------------------------------------
/trademarks/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class TrademarksItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
--------------------------------------------------------------------------------
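Note: the spider (foreign_spider.py, below) yields plain dicts rather than TrademarksItem instances, so the item class above is left as the startproject stub. If typed items were wanted instead, a minimal sketch — assuming the same field names the spider fills in, which match FEED_EXPORT_FIELDS in settings.py — could look like this:

```python
import scrapy


class TrademarksItem(scrapy.Item):
    # One Field per column exported to foreign.csv (see FEED_EXPORT_FIELDS).
    company = scrapy.Field()
    owner_name = scrapy.Field()
    mark = scrapy.Field()
    application_date = scrapy.Field()
    foreign_country = scrapy.Field()
    foreign_date = scrapy.Field()
    url = scrapy.Field()
    owner_address = scrapy.Field()
```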
/trademarks/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for trademarks project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'trademarks'

SPIDER_MODULES = ['trademarks.spiders']
NEWSPIDER_MODULE = 'trademarks.spiders'

# Column order for the exported CSV.
FEED_EXPORT_FIELDS = [
    'company',
    'owner_name',
    'mark',
    'application_date',
    'foreign_country',
    'foreign_date',
    'url',
    'owner_address',
]


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'trademarks (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 1

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'trademarks.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'trademarks.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'trademarks.pipelines.SomePipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 60 * 60 * 24  # cache responses for one day
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
--------------------------------------------------------------------------------
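Note: ITEM_PIPELINES is left commented out above, so the TrademarksPipeline stub in pipelines.py never runs. If the pipeline were ever fleshed out (for example, to normalize dates before export), registering it would be one extra setting; a minimal sketch, assuming the existing class name:

```python
# In trademarks/settings.py -- not enabled in this project, shown only as a sketch.
ITEM_PIPELINES = {
    'trademarks.pipelines.TrademarksPipeline': 300,
}
```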
/trademarks/spiders/foreign_spider.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

import scrapy
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException


class ForeignSpider(scrapy.Spider):
    name = "foreign"

    companies = [
        'amazon',
        'apple',
        'google',
        'microsoft',
        'alphabet',
        'ibm',
        'intel',
    ]

    start_urls = ['http://tess2.uspto.gov/']

    def __init__(self, *args, **kwargs):
        super(ForeignSpider, self).__init__(*args, **kwargs)
        # TESS is driven through a real browser; the TSDR detail pages are
        # fetched with plain Scrapy requests (and go through the HTTP cache).
        self.driver = webdriver.Firefox()

    def parse(self, response):
        for company in self.companies:
            # Start a fresh free-form search for each company.
            self.driver.get(response.url)

            search_link = self.driver.find_element_by_link_text('Word and/or Design Mark Search (Free Form)')
            search_link.click()

            search_input = self.driver.find_element_by_name('p_s_ALL')
            submit_button = self.driver.find_element_by_name('a_search')

            # TESS free-form query: filing-basis records tagged 44d ([ob])
            # owned by this company ([on]).
            search_input.clear()
            search_input.send_keys('44d[ob] and %s[on]' % company)
            submit_button.click()

            more = True

            while more:
                # Each result row links to its TSDR status page; hand those
                # URLs off to Scrapy for parsing.
                links = self.driver.find_elements_by_link_text('TSDR')

                for link in links:
                    href = link.get_attribute('href')
                    serial = href.split('caseNumber=')[1].split('&')[0]
                    url = 'http://tsdr.uspto.gov/statusview/sn%s' % serial

                    request = scrapy.Request(url, callback=self.parse_tsdr)
                    request.meta['company'] = company
                    request.meta['url'] = href

                    yield request

                # Follow the "next list" arrow until it disappears.
                try:
                    next_link = self.driver.find_element_by_xpath("//img[@src='/webaka/icon/reg/list_n.gif']/..")
                    next_link.click()
                except NoSuchElementException:
                    more = False

    def parse_tsdr(self, response):
        data = {
            'company': response.meta['company'],
            'url': response.meta['url'],
        }

        self.log(response.url)

        data['application_date'] = response.xpath('//*[@id="summary"]/div[3]/div/div[4]/text()').extract_first().strip().replace('.', '')
        data['mark'] = response.xpath('//*[@id="summary"]/div[2]/div/div[2]/text()').extract_first().strip()
        data['owner_name'] = response.xpath('//*[@id="relatedProp-section"]/div[1]/div/div[2]/text()').extract_first().strip()
        data['owner_address'] = ' '.join(map(str.strip, response.xpath('//*[@id="relatedProp-section"]/div[2]/div/div[2]/div/text()').extract())).replace('\r\n', ' ')

        foreign_fields = len(response.xpath('//*[@id="markInfo-section"]/div[2]/div'))

        # TSDR pages do not all have the same number of rows in this section,
        # so the foreign country can appear in either the second or third row.
        if foreign_fields == 3:
            foreign_country = response.xpath('//*[@id="markInfo-section"]/div[2]/div[3]/div[2]/text()').extract_first()
        else:
            foreign_country = response.xpath('//*[@id="markInfo-section"]/div[2]/div[2]/div[2]/text()').extract_first()

        if foreign_country is not None:
            data['foreign_country'] = foreign_country.strip()

        foreign_date = response.xpath('//*[@id="markInfo-section"]/div[2]/div[1]/div[4]/text()').extract_first()

        if foreign_date is not None:
            data['foreign_date'] = foreign_date.strip().replace('.', '')

        yield data
--------------------------------------------------------------------------------
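Note: one loose end in foreign_spider.py is that the Firefox instance created in __init__ is never shut down, so a browser process lingers after each crawl. A minimal cleanup sketch, using Scrapy's closed() shortcut (called once when the spider finishes):

```python
# Sketch: add to ForeignSpider in trademarks/spiders/foreign_spider.py
def closed(self, reason):
    # Scrapy calls closed() when the spider finishes; quit Selenium's
    # Firefox here so no browser process is left running after the crawl.
    self.driver.quit()
```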