├── isna
│   ├── kaggle
│   │   └── dataset-metadata.json
│   ├── update_kaggle.sh
│   └── run_isna.sh
├── tarjoman
│   ├── kaggle
│   │   └── dataset-metadata.json
│   ├── update_kaggle.sh
│   └── run_tarjoman.sh
├── virgool
│   ├── kaggle
│   │   └── dataset-metadata.json
│   ├── update_kaggle.sh
│   └── run_virgool.sh
├── asriran
│   ├── kaggle
│   │   └── dataset-metadata.json
│   ├── update_kaggle.sh
│   └── run_asriran.sh
├── ensani
│   ├── kaggle
│   │   └── dataset-metadata.json
│   ├── update_kaggle.sh
│   └── run_ensani.sh
├── tasnim
│   ├── kaggle
│   │   └── dataset-metadata.json
│   ├── update_kaggle.sh
│   └── run_tasnim.sh
├── wikipedia
│   ├── kaggle
│   │   └── dataset-metadata.json
│   ├── update_kaggle.sh
│   └── run_wikipedia.sh
├── requirements.txt
├── .vscode
│   └── launch.json
├── logger.py
├── LICENSE.md
├── README.md
├── tarjoman.py
├── ensani.py
├── asriran.py
├── tasnim.py
├── virgool.py
├── wikipedia.py
├── .gitignore
└── isna.py

/isna/kaggle/dataset-metadata.json:
--------------------------------------------------------------------------------
{
    "id": "amirpourmand/isna-news"
}
--------------------------------------------------------------------------------
/isna/update_kaggle.sh:
--------------------------------------------------------------------------------
kaggle datasets version -p isna/kaggle/ -m "add newer version"
--------------------------------------------------------------------------------
/tarjoman/kaggle/dataset-metadata.json:
--------------------------------------------------------------------------------
{
    "id": "amirpourmand/tarjoman"
}
--------------------------------------------------------------------------------
/virgool/kaggle/dataset-metadata.json:
--------------------------------------------------------------------------------
{
    "id": "amirpourmand/virgool"
}
--------------------------------------------------------------------------------
/asriran/kaggle/dataset-metadata.json:
--------------------------------------------------------------------------------
{
    "id": "amirpourmand/asriran-news"
}
--------------------------------------------------------------------------------
/ensani/kaggle/dataset-metadata.json:
--------------------------------------------------------------------------------
{
    "id": "amirpourmand/ensani-abstracts"
}
--------------------------------------------------------------------------------
/ensani/update_kaggle.sh:
--------------------------------------------------------------------------------
kaggle datasets version -p ensani/kaggle/ -m "add newer version"
--------------------------------------------------------------------------------
/tasnim/kaggle/dataset-metadata.json:
--------------------------------------------------------------------------------
{
    "id": "amirpourmand/tasnimdataset"
}
--------------------------------------------------------------------------------
/tasnim/update_kaggle.sh:
--------------------------------------------------------------------------------
kaggle datasets version -p tasnim/kaggle/ -m "add newer version"
--------------------------------------------------------------------------------
/wikipedia/kaggle/dataset-metadata.json:
--------------------------------------------------------------------------------
{
    "id": "amirpourmand/fa-wikipedia"
}
--------------------------------------------------------------------------------
/asriran/update_kaggle.sh:
--------------------------------------------------------------------------------
kaggle datasets version -p asriran/kaggle/ -m "add newer version"
--------------------------------------------------------------------------------
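
The `update_kaggle.sh` scripts above only push a new version of a dataset that already exists on Kaggle. As a hedged sketch of the first-time publish (assuming the standard Kaggle CLI is installed and authenticated via `~/.kaggle/kaggle.json`, which this repo does not set up for you), the ISNA folder would be created like this:

```bash
# One-time publish: the folder must contain dataset-metadata.json plus the crawled CSV.
kaggle datasets create -p isna/kaggle/

# Later refreshes then go through the matching update_kaggle.sh, e.g.:
kaggle datasets version -p isna/kaggle/ -m "add newer version"
```

The same pattern applies to every other `*/kaggle/` folder in the tree above.
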
/tarjoman/update_kaggle.sh:
--------------------------------------------------------------------------------
kaggle datasets version -p tarjoman/kaggle/ -m "add newer version"
--------------------------------------------------------------------------------
/virgool/update_kaggle.sh:
--------------------------------------------------------------------------------
kaggle datasets version -p virgool/kaggle/ -m "add newer version"
--------------------------------------------------------------------------------
/wikipedia/update_kaggle.sh:
--------------------------------------------------------------------------------
kaggle datasets version -p wikipedia/kaggle/ -m "add newer version"
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
rich==12.5.1
Scrapy==2.6.1
pyopenssl==22.0.0
cryptography==38.0.4
trafilatura==1.6.1
--------------------------------------------------------------------------------
/ensani/run_ensani.sh:
--------------------------------------------------------------------------------
while true; do
    read -p "Do you wish to remove the previous output and run the crawler again? " yn
    case $yn in
        [Yy]* ) rm ensani/kaggle/ensani.csv
                scrapy runspider --set FEED_EXPORT_ENCODING=utf-8 ensani.py -o ensani/kaggle/ensani.csv ; break;;
        [Nn]* ) exit;;
        * ) echo "Please answer yes or no.";;
    esac
done

--------------------------------------------------------------------------------
/tasnim/run_tasnim.sh:
--------------------------------------------------------------------------------
while true; do
    read -p "Do you wish to remove the previous output and run the crawler again? " yn
    case $yn in
        [Yy]* ) rm tasnim/kaggle/tasnim.csv
                scrapy runspider --set FEED_EXPORT_ENCODING=utf-8 tasnim.py -o tasnim/kaggle/tasnim.csv ; break;;
        [Nn]* ) exit;;
        * ) echo "Please answer yes or no.";;
    esac
done

--------------------------------------------------------------------------------
/isna/run_isna.sh:
--------------------------------------------------------------------------------
while true; do
    read -p "Do you wish to remove the previous output and run the crawler again? " yn
    case $yn in
        [Yy]* ) rm isna/kaggle/isna.csv
                scrapy runspider --set FEED_EXPORT_ENCODING=utf-8 isna.py -o isna/kaggle/isna.csv -a from_year=1378 -a to_year=1401 ; break;;
        [Nn]* ) exit;;
        * ) echo "Please answer yes or no.";;
    esac
done

--------------------------------------------------------------------------------
/.vscode/launch.json:
--------------------------------------------------------------------------------
{
    "version": "0.1.0",
    "configurations": [
        {
            "name": "Python: Launch Scrapy Spider",
            "type": "python",
            "request": "launch",
            "module": "scrapy",
            "args": [
                "runspider",
                "${file}"
            ],
            "console": "integratedTerminal"
        }
    ]
}
--------------------------------------------------------------------------------
/tarjoman/run_tarjoman.sh:
--------------------------------------------------------------------------------
while true; do
    echo "Please put your extracted urls into tarjoman/index.txt"
    read -p "Do you wish to remove the previous output and run the crawler again? " yn
" yn 4 | case $yn in 5 | [Yy]* ) rm tarjoman/kaggle/tarjoman.csv 6 | scrapy runspider --set FEED_EXPORT_ENCODING=utf-8 tarjoman.py -o tarjoman/kaggle/tarjoman.csv ; break;; 7 | [Nn]* ) exit;; 8 | * ) echo "Please answer yes or no.";; 9 | esac 10 | done 11 | 12 | -------------------------------------------------------------------------------- /asriran/run_asriran.sh: -------------------------------------------------------------------------------- 1 | while true; do 2 | read -p "Do you wish to remove this run and run crawler again? " yn 3 | case $yn in 4 | [Yy]* ) rm asriran/kaggle/asriran.csv 5 | scrapy runspider --set FEED_EXPORT_ENCODING=utf-8 asriran.py -o asriran/kaggle/asriran.csv \ 6 | -a from_page=1 -a to_page=8313 \ 7 | -a from_date='1384/01/01' -a to_date='1401/04/24'; break;; 8 | [Nn]* ) exit;; 9 | * ) echo "Please answer yes or no.";; 10 | esac 11 | done -------------------------------------------------------------------------------- /logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from rich.logging import RichHandler 4 | 5 | logger = logging.getLogger(__name__) 6 | # the handler determines where the logs go: stdout/file 7 | shell_handler = RichHandler() 8 | file_handler = logging.FileHandler("debug.log") 9 | 10 | logger.setLevel(logging.DEBUG) 11 | shell_handler.setLevel(logging.DEBUG) 12 | file_handler.setLevel(logging.DEBUG) 13 | 14 | # the formatter determines what our logs will look like 15 | 16 | fmt_shell = "%(message)s" 17 | fmt_file = ( 18 | "%(levelname)s %(asctime)s [%(filename)s:%(funcName)s:%(lineno)d] %(message)s" 19 | ) 20 | 21 | shell_formatter = logging.Formatter(fmt_shell) 22 | file_formatter = logging.Formatter(fmt_file) 23 | # here we hook everything together 24 | shell_handler.setFormatter(shell_formatter) 25 | file_handler.setFormatter(file_formatter) 26 | 27 | logger.addHandler(shell_handler) 28 | logger.addHandler(file_handler) -------------------------------------------------------------------------------- /virgool/run_virgool.sh: -------------------------------------------------------------------------------- 1 | while true; do 2 | read -p "Do you want to build index.txt file? (This is required for the first time. Crawler uses index.txt as starting point) " yn 3 | case $yn in 4 | [Yy]* ) rm virgool/index.txt 5 | scrapy runspider --set FEED_EXPORT_ENCODING=utf-8 virgool.py -o virgool/kaggle/virgool.csv -a gather_index_pages=True; exit;; 6 | [Nn]* ) echo "You chose not to build index.txt file. Going forwrd to crawl the virgool"; break;; 7 | * ) echo "Please answer yes or no.";; 8 | esac 9 | done 10 | 11 | while true; do 12 | read -p "Do you wish to crawl from scratch? " yn 13 | case $yn in 14 | [Yy]* ) rm virgool/kaggle/virgool.csv 15 | scrapy runspider --set FEED_EXPORT_ENCODING=utf-8 virgool.py -o virgool/kaggle/virgool.csv; break;; 16 | [Nn]* ) exit;; 17 | * ) echo "Please answer yes or no.";; 18 | esac 19 | done 20 | 21 | -------------------------------------------------------------------------------- /wikipedia/run_wikipedia.sh: -------------------------------------------------------------------------------- 1 | while true; do 2 | read -p "Do you want to build index.txt file? (This is required for the first time. 
    case $yn in
        [Yy]* ) rm wikipedia/index.txt
                scrapy runspider --set FEED_EXPORT_ENCODING=utf-8 wikipedia.py -o wikipedia/kaggle/wikipedia.csv -a gather_index_pages=True; exit;;
        [Nn]* ) echo "You chose not to build the index.txt file. Going forward to crawl Wikipedia"; break;;
        * ) echo "Please answer yes or no.";;
    esac
done

while true; do
    read -p "Do you wish to crawl from scratch? " yn
    case $yn in
        [Yy]* ) rm wikipedia/kaggle/wikipedia.csv
                scrapy runspider --set FEED_EXPORT_ENCODING=utf-8 wikipedia.py -o wikipedia/kaggle/wikipedia.csv; break;;
        [Nn]* ) exit;;
        * ) echo "Please answer yes or no.";;
    esac
done

--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2022 Amir Pourmand

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
[![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://www.kaggle.com/amirpourmand/datasets)

# Crawler
Open-source crawler for Persian websites. Websites crawled so far:
- [Asriran](https://www.kaggle.com/datasets/amirpourmand/asriran-news)
- [Fa-Wikipedia](https://www.kaggle.com/datasets/amirpourmand/fa-wikipedia)
- [Tasnim](https://www.kaggle.com/datasets/amirpourmand/tasnimdataset)
- [Isna](https://www.kaggle.com/datasets/amirpourmand/isna-news)

### Asriran

```bash
asriran/run_asriran.sh
```

> You can change some parameters of this crawler. See `run_asriran.sh`.

### Fa-Wikipedia

Due to some problems in crawling, I split this job into two stages: first crawling all index pages, and then using those pages for the actual crawl.
```bash
wikipedia/run_wikipedia.sh
```

### Tasnim News
This crawler saves [Tasnim News](https://www.tasnimnews.com/) pages based on category. This is appropriate for text classification tasks, as the data is relatively balanced across all categories. I selected an equal number of pages per category.

> There is a parameter called `number_of_pages` in `tasnim.py` which controls how many pages we crawl in each category (see the sketch below for limiting a quick test run).
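
Since `number_of_pages` is a class attribute rather than a command-line argument, limiting a quick test run means either editing that value in `tasnim.py` or capping the crawl from the CLI. The following is only a sketch of the second option, using Scrapy's standard `CLOSESPIDER_ITEMCOUNT` setting (part of Scrapy's close-spider extension, not something defined in this repo); the output path is just an example:

```bash
# Stop after roughly 200 scraped items instead of crawling every page of every category.
scrapy runspider --set FEED_EXPORT_ENCODING=utf-8 \
    --set CLOSESPIDER_ITEMCOUNT=200 \
    tasnim.py -o /tmp/tasnim-sample.csv
```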

```bash
tasnim/run_tasnim.sh
```

Datasets are all available for download at [Kaggle](https://www.kaggle.com/amirpourmand/datasets).

CSS selectors are mostly extracted via [Copy Css Selector](https://chrome.google.com/webstore/detail/copy-css-selector/kemkenbgbgodoglfkkejbdcpojnodnkg?hl=en).

Troubleshooting references:
- https://stackoverflow.com/questions/73859249/attributeerror-module-openssl-ssl-has-no-attribute-sslv3-method
- https://stackoverflow.com/a/73867925/4201765
--------------------------------------------------------------------------------
/tarjoman.py:
--------------------------------------------------------------------------------
import scrapy
from pathlib import Path
from logger import logger
import json
from trafilatura import extract

class TarjomanSpider(scrapy.Spider):
    name = "Tarjoman"

    custom_settings = {'AUTOTHROTTLE_ENABLED': True,
                       'HTTPCACHE_ENABLED': True,
                       'CONCURRENT_REQUESTS': 100,
                       'CONCURRENT_REQUESTS_PER_DOMAIN': 100,
                       }

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.start_urls = []
        if Path('tarjoman/index.txt').exists():
            text = Path('tarjoman/index.txt').read_text(encoding='utf-8')
            # https://github.com/pourmand1376/PersianCrawler/issues/6
            items = json.loads(text)
            self.start_urls = items

        logger.info(f'{len(self.start_urls)} urls fetched.')

    def parse(self, response, **kwargs):
        try:
            item = {'title': response.css('div h1::text').get(),
                    'text': extract(response.body.decode('utf-8'), deduplicate=True, include_images=False,
                                    include_comments=False, include_links=False),
                    'url': response.url,
                    }
            if item['title'] is None or len(item['title']) == 0:
                return scrapy.Request(url=response.url, callback=self.parse)
            return item
        except Exception:
            logger.error("Parsing Error: ", exc_info=True)

    def handle_error(self, failure):
        logger.warning("Error: %s", failure.request.url)
        yield scrapy.Request(
            url=failure.request.url,
            dont_filter=True,
            callback=self.parse,
            errback=self.handle_error)
--------------------------------------------------------------------------------
/ensani.py:
--------------------------------------------------------------------------------
from logger import logger
import scrapy
import logging
from scrapy.utils.log import configure_logging


class EnsaniSpider(scrapy.Spider):
    name = "Ensani"

    start_urls = []
    custom_settings = {'AUTOTHROTTLE_ENABLED': True,
                       'HTTPCACHE_ENABLED': True,
                       'CONCURRENT_REQUESTS': 30,
                       'CONCURRENT_REQUESTS_PER_DOMAIN': 30,
                       }

    configure_logging(install_root_handler=False)
    logging.basicConfig(
        filename='log.txt',
        format='%(levelname)s: %(message)s',
        level=logging.INFO
    )

    def __init__(self, **kwargs):
        # all pages
        super().__init__(**kwargs)
        try:
            self.start_urls = []
            end_page = 21141
            for i in range(1, end_page + 1):
                url = f"https://ensani.ir/fa/article/field/3363?ArticleSearch%5BpageSize%5D=20&ArticleSearch%5BscientificRank%5D=&ArticleSearch%5BjournalId%5D=&ArticleSearch%5Byear%5D=&ArticleSearch%5Blanguage%5D=1&ArticleSearch%5Btitle%5D=&ArticleSearch%5BsortBy%5D=&page={i}"
                self.start_urls.append(url)
            logger.info('urls are appended')

        except Exception:
            logger.error('error', exc_info=True)

    def parse(self, response, **kwargs):
        try:
            abstract_array = response.css('div.well.collapse *::text')
            for item in abstract_array:
                output = {'text': item.get()}
                yield output
        except Exception:
            logger.error("Parsing Error: ", exc_info=True)

    def handle_error(self, failure):
        logger.warning("Error: %s", failure.request.url)
        yield scrapy.Request(
            url=failure.request.url,
            dont_filter=True,
            callback=self.parse,
            errback=self.handle_error)
--------------------------------------------------------------------------------
/asriran.py:
--------------------------------------------------------------------------------
import scrapy
import re
from logger import logger

class AsriranSpider(scrapy.Spider):
    name = "Asriran"
    number_of_pages = 8313

    main_url = "https://www.asriran.com"

    def __init__(self, from_page, to_page, from_date, to_date):
        self.start_urls = []

        from_page = int(from_page)
        to_page = int(to_page)

        for i in range(from_page, to_page):
            self.start_urls.append(
                f"https://www.asriran.com/fa/archive?rpp=100&p={i}&from_date={from_date}&to_date={to_date}"
            )

        logger.info('urls are appended')


    def parse(self, response):
        try:
            for news in response.css("body#archive div.inner-content a::attr(href)").getall():
                yield scrapy.Request(
                    self.main_url+news,
                    callback=self.parse_news,
                )
        except Exception:
            logger.error("Parsing Error: ", exc_info=True)

    def parse_news(self, response):
        try:
            item = {
                'title': response.css('body#news div.title > h1 > a::text').get().strip(),
                'shortlink': self.main_url+response.css('body#news div.short-link.row > a::attr(href)').get(),
                'time': response.css('body#news div.news_nav.news_pdate_c.iconMobileN::text').getall()[1].strip(),
                # default to '' so the re.sub/strip cleanup below does not fail on missing selectors
                'service': response.css('body#news div:nth-child(5) > div:nth-child(2) > div:nth-child(1) > div > a:nth-child(1)::text').get(''),
                'subgroup': response.css('body#news div:nth-child(5) > div:nth-child(2) > div:nth-child(1) > div > a:nth-child(2)::text').get(''),
                'abstract': ' '.join(response.css('body#news div.subtitle::text').getall()),
                'body': ' '.join(response.css('body#news div.body > *::text').getall()),
            }

            for key in item:
                item[key] = re.sub(' +', ' ', item[key]).strip()

            yield item

        except Exception:
            logger.error("Error", exc_info=True)

--------------------------------------------------------------------------------
/tasnim.py:
--------------------------------------------------------------------------------
import scrapy

class TasnimSpider(scrapy.Spider):

    main_url = "https://www.tasnimnews.com"
    name = "Tasnim"
    base_urls = ['https://www.tasnimnews.com/fa/service/1/',  # politics (سیاسی)
                 'https://www.tasnimnews.com/fa/service/2/',  # social (اجتماعی)
                 'https://www.tasnimnews.com/fa/service/3/',  # sports (ورزشی)
                 'https://www.tasnimnews.com/fa/service/4/',  # culture and arts (فرهنگی هنری)
                 'https://www.tasnimnews.com/fa/service/6/',  # provinces (استان‌ها)
                 'https://www.tasnimnews.com/fa/service/7/',  # economy (اقتصادی)
                 'https://www.tasnimnews.com/fa/service/8/',  # international (بین الملل)
                 'https://www.tasnimnews.com/fa/service/9/',  # media (رسانه ها)
                 ]

    number_of_pages = 400
    def __init__(self):
        pages = [f"?page={i}" for i in range(1, self.number_of_pages)]
        self.start_urls = []
        for item in self.base_urls:
            for page in pages:
                self.start_urls.append(f"{item}{page}")

    def parse(self, response):
        categories = {'https://www.tasnimnews.com/fa/service/1/': 'سیاسی',
                      'https://www.tasnimnews.com/fa/service/2/': 'اجتماعی',
                      'https://www.tasnimnews.com/fa/service/3/': 'ورزشی',
                      'https://www.tasnimnews.com/fa/service/4/': 'فرهنگی هنری',
                      'https://www.tasnimnews.com/fa/service/6/': 'استان‌ها',
                      'https://www.tasnimnews.com/fa/service/7/': 'اقتصادی',
                      'https://www.tasnimnews.com/fa/service/8/': 'بین الملل',
                      'https://www.tasnimnews.com/fa/service/9/': 'رسانه ها',}

        for item in categories.keys():
            if response.url.startswith(item):
                category = categories[item]

        for news in response.css('article.list-item a::attr(href)').getall():
            request = scrapy.Request(
                self.main_url+news,
                callback=self.parse_news,
                cb_kwargs=dict(category=category))
            yield request


    def parse_news(self, response, category):
        item = {
            'category': category,
            'title': response.css('article.single-news h1.title::text').get(),
            'abstract': response.css('article.single-news h3.lead::text').get(),
            'body': ' '.join(response.css('article.single-news div.story p::text').getall()),
            'time': response.css('article.single-news div._sticky ul.list-inline li.time::text').get()
        }

        yield item
--------------------------------------------------------------------------------
/virgool.py:
--------------------------------------------------------------------------------
import scrapy
from pathlib import Path
from logger import logger
import re
from trafilatura import extract

def get_cookie(response):
    content = response.body.decode('utf-8')
    cookie_regex = r"document\.cookie\s*=\s*\'(.+); Max-Age"

    match = re.search(cookie_regex, content)
    if match:
        return match.group(1)
    return None


class VirgoolSpider(scrapy.Spider):
    name = "Virgool"
    number_of_pages = 30000

    custom_settings = {'AUTOTHROTTLE_ENABLED': True,
                       'HTTPCACHE_ENABLED': True,
                       'CONCURRENT_REQUESTS': 100,
                       'CONCURRENT_REQUESTS_PER_DOMAIN': 100,
                       }
    def __init__(self, gather_index_pages=False, **kwargs):
        super().__init__(**kwargs)
        self.start_urls = []
        self.gather_index_pages = gather_index_pages
        if not self.gather_index_pages:
            self.start_urls = Path('virgool/index.txt').read_text().split('\n')
        else:
            for i in range(0, self.number_of_pages + 1):
                self.start_urls.append(
                    f"https://virgool.io/?page={i}"
                )

        logger.info('urls are appended')

    def parse(self, response, **kwargs):
        try:
            if self.gather_index_pages:
                for item in range(1, 20):
                    url = response.css(f'main#app article:nth-child({item}) > div > a::attr(href)').get()
                    if url:
                        with Path('virgool/index.txt').open("a") as f:
                            f.write(url + '\n')
            else:
                item = {'title': response.css('main#app h1::text').get(),
                        'author': response.css('main#app div.module-header > a::text').get(),
                        'text': extract(response.body.decode('utf-8'), deduplicate=True, include_images=False, include_comments=False, include_links=False),
                        'url': response.css('.shorturl-text::text').get()
                        }

                # strip() would fail on None values, so check each one before cleaning it.
                for key in item:
                    if item[key]:
                        item[key] = re.sub(' +', ' ', item[key]).strip()

                return item
        except Exception:
            logger.error("Parsing Error: ", exc_info=True)

    def handle_error(self, failure):
        logger.warning("Error: %s", failure.request.url)
        yield scrapy.Request(
            url=failure.request.url,
            dont_filter=True,
            callback=self.parse,
            errback=self.handle_error)

--------------------------------------------------------------------------------
/wikipedia.py:
--------------------------------------------------------------------------------
import scrapy
import re
from logger import logger

import logging
from scrapy.utils.log import configure_logging
from pathlib import Path

class WikipediaSpider(scrapy.Spider):
    name = "Wikipedia"

    main_url = "https://fa.wikipedia.org/"

    custom_settings = {'AUTOTHROTTLE_ENABLED': True,
                       'HTTPCACHE_ENABLED': True,
                       'CONCURRENT_REQUESTS': 30,
                       'CONCURRENT_REQUESTS_PER_DOMAIN': 30,
                       }


    configure_logging(install_root_handler=False)
    logging.basicConfig(
        filename='log.txt',
        format='%(levelname)s: %(message)s',
        level=logging.INFO
    )

    def __init__(self, gather_index_pages=False):
        # all pages
        # index page of the Persian Wikipedia ("all pages" special page)
        try:
            self.start_urls = ["https://fa.wikipedia.org/w/index.php?title=%D9%88%DB%8C%DA%98%D9%87:%D8%AA%D9%85%D8%A7%D9%85_%D8%B5%D9%81%D8%AD%D9%87%E2%80%8C%D9%87%D8%A7"]
            self.gather_index_pages = gather_index_pages
            if not self.gather_index_pages:
                self.start_urls = Path('wikipedia/index.txt').read_text().split('\n')
        except Exception:
            logger.error('error', exc_info=True)

    def parse(self, response):
        try:
            # "صفحهٔ بعد" means "next page"
            next_page = response.css('div#mw-content-text > div:nth-child(2)> :contains("صفحهٔ بعد")::attr(href)').getall()
            if self.gather_index_pages and next_page:
                logger.info(f"Next page {next_page}")

                with Path('wikipedia/index.txt').open("a") as f:
                    f.write(self.main_url+next_page[0]+'\n')

                yield scrapy.Request(
                    self.main_url+next_page[0],
                    callback=self.parse,
                    dont_filter=True,
                    errback=self.handle_failure,
                )

            if not self.gather_index_pages:
                for article in response.css('div#mw-content-text > div.mw-allpages-body a::attr(href)').getall():
                    yield scrapy.Request(
                        self.main_url+article,
                        callback=self.parse_news,
                    )
        except Exception:
            logger.error("Parsing Error: ", exc_info=True)

    def handle_failure(self, failure):
        logger.warning("Error: %s", failure.request.url)
        yield scrapy.Request(
            url=failure.request.url,
            dont_filter=True,
            callback=self.parse,
            errback=self.handle_failure)

    def parse_news(self, response):
        item = {}
        try:
            item = {
                'title': response.css('#firstHeading *::text').get(),
                'content': ' '.join(response.css('div#mw-content-text > div.mw-parser-output > *:not(style):not(table)::text').getall()),
                'link': self.main_url+response.css('li#t-permalink > a::attr(href)').get(),
            }

            for key in item:
                item[key] = re.sub(' +', ' ', item[key]).strip()

            yield item

        except Exception:
            logger.error(f"Error {item}", exc_info=True)

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
#   For a library or package, you might want to ignore these files since the code is
#   intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# poetry
#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
#   This is especially recommended for binary packages to ensure reproducibility, and is more
#   commonly ignored for libraries.
#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
#   in version control.
#   https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
#  and can be added to the global gitignore or merged into this file. For a more nuclear
#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
*.csv
--------------------------------------------------------------------------------
/isna.py:
--------------------------------------------------------------------------------
import scrapy
import re
from logger import logger


class IsnaSpider(scrapy.Spider):

    name = "Isna"
    start_urls = []
    custom_settings = {'AUTOTHROTTLE_ENABLED': True,
                       #'HTTPCACHE_ENABLED': True,
                       # enabling the http cache quickly filled up the storage!
                       #'CONCURRENT_REQUESTS': 1000,
                       #'CONCURRENT_REQUESTS_PER_DOMAIN': 1000,
                       }


    main_url = "https://www.isna.ir"

    def __init__(self, from_year=1378, to_year=1401):
        # Set the range of values for the mn (month), dy (day), and yr (year) parameters
        mn_range = range(1, 13)   # months 1-12
        dy_range = range(1, 32)   # days 1-31
        yr_range = range(int(from_year), int(to_year)+1)
        pi_range = range(1, 101)
        # Create an empty list to store the start URLs
        self.start_urls = []

        # Loop through the possible values for each parameter
        for mn in mn_range:
            for dy in dy_range:
                for yr in yr_range:
                    for pi in pi_range:
                        # Construct the URL using string formatting
                        url = f"https://www.isna.ir/archive?pi={pi}&ms=0&dy={dy}&mn={mn}&yr={yr}"
                        # Add the URL to the start_urls list
                        self.start_urls.append(url)

        # Log that the start URLs are ready
        logger.info('urls are appended')

    def handle_failure(self, failure):
        logger.warning("Error: %s", failure.request.url)
        yield scrapy.Request(
            url=failure.request.url,
            dont_filter=True,
            callback=self.parse,
            errback=self.handle_failure)


    def parse(self, response):
        try:
            # ISNA's archive has an unresolved quirk: a deep page (page 50, for example) may also
            # show news from previous days, so those entries should really be filtered out.
            for news in response.css("div.items a::attr(href)").getall():
                if len(news.strip()) > 0:
                    yield scrapy.Request(
                        self.main_url+news,
                        callback=self.parse_news,
                    )
                    logger.info('added ' + self.main_url+news)
        except Exception:
            logger.error("Parsing Error: ", exc_info=True)

    def parse_news(self, response):
        try:
            # I used pip install scrapy and scrapy shell to help me generate this content.
            # scrapy shell
            # fetch(url)
            # then use response.css
            # also Copy CSS Selector was useful
            item = {
                'title': response.css('article#item h1::text').get(),
                'shortlink': response.css('input#short-url::attr(value)').get(),
                'time': response.css('article#item li:nth-child(1) > span.text-meta::text').get(),
                'service': response.css('article#item li:nth-child(2) > span.text-meta::text').get(),
                'news_id': response.css('article#item li:nth-child(3) > span.text-meta::text').get(),
                'reporter': response.css('article#item li:nth-child(1) > strong::text').get(),
                'managers': response.css('article#item li:nth-child(2) > strong::text').get(),
                'body': ' '.join(response.css('article#item div.item-body *::text').getall()),
            }

            # strip() would fail on None values, so check each one before cleaning it.
            for key in item:
                if item[key]:
                    item[key] = re.sub(' +', ' ', item[key]).strip()

            yield item

        except Exception:
            logger.error("Error", exc_info=True)

--------------------------------------------------------------------------------
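
Before launching the full 1378-1401 ISNA crawl from `run_isna.sh`, it can help to smoke-test the spider on a single year using the same `-a` arguments; a minimal sketch (the output path here is only an example):

```bash
# Crawl only year 1400 to verify that the CSS selectors still match the site.
scrapy runspider --set FEED_EXPORT_ENCODING=utf-8 isna.py \
    -o /tmp/isna-sample.csv -a from_year=1400 -a to_year=1400

# Rough sanity check of how many rows came back.
wc -l /tmp/isna-sample.csv
```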