├── trademarks
│   ├── __init__.py
│   ├── spiders
│   │   ├── __init__.py
│   │   └── foreign_spider.py
│   ├── pipelines.py
│   ├── items.py
│   └── settings.py
├── requirements.txt
├── .gitignore
├── README.md
└── scrapy.cfg

/trademarks/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
Scrapy>=1.2.0
selenium<4  # the spider uses the pre-Selenium-4 find_element_by_* API
--------------------------------------------------------------------------------
/trademarks/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.DS_Store
*.pyc
*.swp
*.swo
.tox
*.egg-info
docs/_build
dist
.coverage
build
.proof
.ipynb_checkpoints
.idea
__pycache__
.scrapy
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# trademarks

Scraper for tech company trademarks filed in foreign nations

## Setup

The `foreign` spider drives a real Firefox browser through Selenium, so Firefox
must be installed locally (along with geckodriver if your Selenium version
requires it).

```
mkvirtualenv trademarks
pip install -r requirements.txt
```

## Running

```
rm -f foreign.csv && scrapy crawl foreign -o foreign.csv
```

The CSV is removed first because `scrapy crawl -o` appends to an existing file
rather than overwriting it.
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = trademarks.settings

[deploy]
#url = http://localhost:6800/
project = trademarks
--------------------------------------------------------------------------------
/trademarks/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class TrademarksPipeline(object):
    def process_item(self, item, spider):
        return item
--------------------------------------------------------------------------------
/trademarks/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class TrademarksItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
--------------------------------------------------------------------------------
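Note: the spider (foreign_spider.py, below) yields plain dicts rather than TrademarksItem instances, so the item class above is left as the startproject stub. If typed items were wanted instead, a minimal sketch — assuming the same field names the spider fills in, which match FEED_EXPORT_FIELDS in settings.py — could look like this:

```python
import scrapy


class TrademarksItem(scrapy.Item):
    # One Field per column exported to foreign.csv (see FEED_EXPORT_FIELDS).
    company = scrapy.Field()
    owner_name = scrapy.Field()
    mark = scrapy.Field()
    application_date = scrapy.Field()
    foreign_country = scrapy.Field()
    foreign_date = scrapy.Field()
    url = scrapy.Field()
    owner_address = scrapy.Field()
```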
/trademarks/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for trademarks project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'trademarks'

SPIDER_MODULES = ['trademarks.spiders']
NEWSPIDER_MODULE = 'trademarks.spiders'

# Column order for the exported CSV.
FEED_EXPORT_FIELDS = [
    'company',
    'owner_name',
    'mark',
    'application_date',
    'foreign_country',
    'foreign_date',
    'url',
    'owner_address',
]


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'trademarks (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 1

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'trademarks.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'trademarks.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'trademarks.pipelines.SomePipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 60 * 60 * 24  # cache responses for one day
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
--------------------------------------------------------------------------------
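Note: ITEM_PIPELINES is left commented out above, so the TrademarksPipeline stub in pipelines.py never runs. If the pipeline were ever fleshed out (for example, to normalize dates before export), registering it would be one extra setting; a minimal sketch, assuming the existing class name:

```python
# In trademarks/settings.py -- not enabled in this project, shown only as a sketch.
ITEM_PIPELINES = {
    'trademarks.pipelines.TrademarksPipeline': 300,
}
```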
/trademarks/spiders/foreign_spider.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

import scrapy
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException


class ForeignSpider(scrapy.Spider):
    name = "foreign"

    companies = [
        'amazon',
        'apple',
        'google',
        'microsoft',
        'alphabet',
        'ibm',
        'intel',
    ]

    start_urls = ['http://tess2.uspto.gov/']

    def __init__(self, *args, **kwargs):
        super(ForeignSpider, self).__init__(*args, **kwargs)
        # TESS is driven through a real browser; the TSDR detail pages are
        # fetched with plain Scrapy requests (and go through the HTTP cache).
        self.driver = webdriver.Firefox()

    def parse(self, response):
        for company in self.companies:
            # Start a fresh free-form search for each company.
            self.driver.get(response.url)

            search_link = self.driver.find_element_by_link_text('Word and/or Design Mark Search (Free Form)')
            search_link.click()

            search_input = self.driver.find_element_by_name('p_s_ALL')
            submit_button = self.driver.find_element_by_name('a_search')

            # TESS free-form query: filing-basis records tagged 44d ([ob])
            # owned by this company ([on]).
            search_input.clear()
            search_input.send_keys('44d[ob] and %s[on]' % company)
            submit_button.click()

            more = True

            while more:
                # Each result row links to its TSDR status page; hand those
                # URLs off to Scrapy for parsing.
                links = self.driver.find_elements_by_link_text('TSDR')

                for link in links:
                    href = link.get_attribute('href')
                    serial = href.split('caseNumber=')[1].split('&')[0]
                    url = 'http://tsdr.uspto.gov/statusview/sn%s' % serial

                    request = scrapy.Request(url, callback=self.parse_tsdr)
                    request.meta['company'] = company
                    request.meta['url'] = href

                    yield request

                # Follow the "next list" arrow until it disappears.
                try:
                    next_link = self.driver.find_element_by_xpath("//img[@src='/webaka/icon/reg/list_n.gif']/..")
                    next_link.click()
                except NoSuchElementException:
                    more = False

    def parse_tsdr(self, response):
        data = {
            'company': response.meta['company'],
            'url': response.meta['url'],
        }

        self.log(response.url)

        data['application_date'] = response.xpath('//*[@id="summary"]/div[3]/div/div[4]/text()').extract_first().strip().replace('.', '')
        data['mark'] = response.xpath('//*[@id="summary"]/div[2]/div/div[2]/text()').extract_first().strip()
        data['owner_name'] = response.xpath('//*[@id="relatedProp-section"]/div[1]/div/div[2]/text()').extract_first().strip()
        data['owner_address'] = ' '.join(map(str.strip, response.xpath('//*[@id="relatedProp-section"]/div[2]/div/div[2]/div/text()').extract())).replace('\r\n', ' ')

        foreign_fields = len(response.xpath('//*[@id="markInfo-section"]/div[2]/div'))

        # TSDR pages do not all have the same number of rows in this section,
        # so the foreign country can appear in either the second or third row.
        if foreign_fields == 3:
            foreign_country = response.xpath('//*[@id="markInfo-section"]/div[2]/div[3]/div[2]/text()').extract_first()
        else:
            foreign_country = response.xpath('//*[@id="markInfo-section"]/div[2]/div[2]/div[2]/text()').extract_first()

        if foreign_country is not None:
            data['foreign_country'] = foreign_country.strip()

        foreign_date = response.xpath('//*[@id="markInfo-section"]/div[2]/div[1]/div[4]/text()').extract_first()

        if foreign_date is not None:
            data['foreign_date'] = foreign_date.strip().replace('.', '')

        yield data
--------------------------------------------------------------------------------
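Note: one loose end in foreign_spider.py is that the Firefox instance created in __init__ is never shut down, so a browser process lingers after each crawl. A minimal cleanup sketch, using Scrapy's closed() shortcut (called once when the spider finishes):

```python
# Sketch: add to ForeignSpider in trademarks/spiders/foreign_spider.py
def closed(self, reason):
    # Scrapy calls closed() when the spider finishes; quit Selenium's
    # Firefox here so no browser process is left running after the crawl.
    self.driver.quit()
```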