├── isna
│   ├── kaggle
│   │   └── dataset-metadata.json
│   ├── update_kaggle.sh
│   └── run_isna.sh
├── tarjoman
│   ├── kaggle
│   │   └── dataset-metadata.json
│   ├── update_kaggle.sh
│   └── run_tarjoman.sh
├── virgool
│   ├── kaggle
│   │   └── dataset-metadata.json
│   ├── update_kaggle.sh
│   └── run_virgool.sh
├── asriran
│   ├── kaggle
│   │   └── dataset-metadata.json
│   ├── update_kaggle.sh
│   └── run_asriran.sh
├── ensani
│   ├── kaggle
│   │   └── dataset-metadata.json
│   ├── update_kaggle.sh
│   └── run_ensani.sh
├── tasnim
│   ├── kaggle
│   │   └── dataset-metadata.json
│   ├── update_kaggle.sh
│   └── run_tasnim.sh
├── wikipedia
│   ├── kaggle
│   │   └── dataset-metadata.json
│   ├── update_kaggle.sh
│   └── run_wikipedia.sh
├── requirements.txt
├── .vscode
│   └── launch.json
├── logger.py
├── LICENSE.md
├── README.md
├── tarjoman.py
├── ensani.py
├── asriran.py
├── tasnim.py
├── virgool.py
├── wikipedia.py
├── .gitignore
└── isna.py

/isna/kaggle/dataset-metadata.json:
--------------------------------------------------------------------------------
{
    "id": "amirpourmand/isna-news"
}
--------------------------------------------------------------------------------
/isna/update_kaggle.sh:
--------------------------------------------------------------------------------
kaggle datasets version -p isna/kaggle/ -m "add newer version"
--------------------------------------------------------------------------------
/tarjoman/kaggle/dataset-metadata.json:
--------------------------------------------------------------------------------
{
    "id": "amirpourmand/tarjoman"
}
--------------------------------------------------------------------------------
/virgool/kaggle/dataset-metadata.json:
--------------------------------------------------------------------------------
{
    "id": "amirpourmand/virgool"
}
--------------------------------------------------------------------------------
/asriran/kaggle/dataset-metadata.json:
--------------------------------------------------------------------------------
{
    "id": "amirpourmand/asriran-news"
}
--------------------------------------------------------------------------------
/ensani/kaggle/dataset-metadata.json:
--------------------------------------------------------------------------------
{
    "id": "amirpourmand/ensani-abstracts"
}
--------------------------------------------------------------------------------
/ensani/update_kaggle.sh:
--------------------------------------------------------------------------------
kaggle datasets version -p ensani/kaggle/ -m "add newer version"
--------------------------------------------------------------------------------
/tasnim/kaggle/dataset-metadata.json:
--------------------------------------------------------------------------------
{
    "id": "amirpourmand/tasnimdataset"
}
--------------------------------------------------------------------------------
/tasnim/update_kaggle.sh:
--------------------------------------------------------------------------------
kaggle datasets version -p tasnim/kaggle/ -m "add newer version"
--------------------------------------------------------------------------------
/wikipedia/kaggle/dataset-metadata.json:
--------------------------------------------------------------------------------
{
    "id": "amirpourmand/fa-wikipedia"
}
--------------------------------------------------------------------------------
/asriran/update_kaggle.sh:
--------------------------------------------------------------------------------
kaggle datasets version -p asriran/kaggle/ -m "add newer version"
--------------------------------------------------------------------------------
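
The `update_kaggle.sh` scripts above only push a new version of a dataset that already exists on Kaggle. As a hedged sketch of the first-time publish (assuming the standard Kaggle CLI is installed and authenticated via `~/.kaggle/kaggle.json`, which this repo does not set up for you), the ISNA folder would be created like this:

```bash
# One-time publish: the folder must contain dataset-metadata.json plus the crawled CSV.
kaggle datasets create -p isna/kaggle/

# Later refreshes then go through the matching update_kaggle.sh, e.g.:
kaggle datasets version -p isna/kaggle/ -m "add newer version"
```

The same pattern applies to every other `*/kaggle/` folder in the tree above.
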
/tarjoman/update_kaggle.sh:
--------------------------------------------------------------------------------
kaggle datasets version -p tarjoman/kaggle/ -m "add newer version"
--------------------------------------------------------------------------------
/virgool/update_kaggle.sh:
--------------------------------------------------------------------------------
kaggle datasets version -p virgool/kaggle/ -m "add newer version"
--------------------------------------------------------------------------------
/wikipedia/update_kaggle.sh:
--------------------------------------------------------------------------------
kaggle datasets version -p wikipedia/kaggle/ -m "add newer version"
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
rich==12.5.1
Scrapy==2.6.1
pyopenssl==22.0.0
cryptography==38.0.4
trafilatura==1.6.1
--------------------------------------------------------------------------------
/ensani/run_ensani.sh:
--------------------------------------------------------------------------------
while true; do
    read -p "Do you wish to remove the previous output and run the crawler again? " yn
    case $yn in
        [Yy]* ) rm ensani/kaggle/ensani.csv
                scrapy runspider --set FEED_EXPORT_ENCODING=utf-8 ensani.py -o ensani/kaggle/ensani.csv ; break;;
        [Nn]* ) exit;;
        * ) echo "Please answer yes or no.";;
    esac
done

--------------------------------------------------------------------------------
/tasnim/run_tasnim.sh:
--------------------------------------------------------------------------------
while true; do
    read -p "Do you wish to remove the previous output and run the crawler again? " yn
    case $yn in
        [Yy]* ) rm tasnim/kaggle/tasnim.csv
                scrapy runspider --set FEED_EXPORT_ENCODING=utf-8 tasnim.py -o tasnim/kaggle/tasnim.csv ; break;;
        [Nn]* ) exit;;
        * ) echo "Please answer yes or no.";;
    esac
done

--------------------------------------------------------------------------------
/isna/run_isna.sh:
--------------------------------------------------------------------------------
while true; do
    read -p "Do you wish to remove the previous output and run the crawler again? " yn
    case $yn in
        [Yy]* ) rm isna/kaggle/isna.csv
                scrapy runspider --set FEED_EXPORT_ENCODING=utf-8 isna.py -o isna/kaggle/isna.csv -a from_year=1378 -a to_year=1401 ; break;;
        [Nn]* ) exit;;
        * ) echo "Please answer yes or no.";;
    esac
done

--------------------------------------------------------------------------------
/.vscode/launch.json:
--------------------------------------------------------------------------------
{
    "version": "0.1.0",
    "configurations": [
        {
            "name": "Python: Launch Scrapy Spider",
            "type": "python",
            "request": "launch",
            "module": "scrapy",
            "args": [
                "runspider",
                "${file}"
            ],
            "console": "integratedTerminal"
        }
    ]
}
--------------------------------------------------------------------------------
/tarjoman/run_tarjoman.sh:
--------------------------------------------------------------------------------
while true; do
    echo "Please put your extracted urls into tarjoman/index.txt"
    read -p "Do you wish to remove the previous output and run the crawler again? " yn
" yn 4 | case $yn in 5 | [Yy]* ) rm tarjoman/kaggle/tarjoman.csv 6 | scrapy runspider --set FEED_EXPORT_ENCODING=utf-8 tarjoman.py -o tarjoman/kaggle/tarjoman.csv ; break;; 7 | [Nn]* ) exit;; 8 | * ) echo "Please answer yes or no.";; 9 | esac 10 | done 11 | 12 | -------------------------------------------------------------------------------- /asriran/run_asriran.sh: -------------------------------------------------------------------------------- 1 | while true; do 2 | read -p "Do you wish to remove this run and run crawler again? " yn 3 | case $yn in 4 | [Yy]* ) rm asriran/kaggle/asriran.csv 5 | scrapy runspider --set FEED_EXPORT_ENCODING=utf-8 asriran.py -o asriran/kaggle/asriran.csv \ 6 | -a from_page=1 -a to_page=8313 \ 7 | -a from_date='1384/01/01' -a to_date='1401/04/24'; break;; 8 | [Nn]* ) exit;; 9 | * ) echo "Please answer yes or no.";; 10 | esac 11 | done -------------------------------------------------------------------------------- /logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from rich.logging import RichHandler 4 | 5 | logger = logging.getLogger(__name__) 6 | # the handler determines where the logs go: stdout/file 7 | shell_handler = RichHandler() 8 | file_handler = logging.FileHandler("debug.log") 9 | 10 | logger.setLevel(logging.DEBUG) 11 | shell_handler.setLevel(logging.DEBUG) 12 | file_handler.setLevel(logging.DEBUG) 13 | 14 | # the formatter determines what our logs will look like 15 | 16 | fmt_shell = "%(message)s" 17 | fmt_file = ( 18 | "%(levelname)s %(asctime)s [%(filename)s:%(funcName)s:%(lineno)d] %(message)s" 19 | ) 20 | 21 | shell_formatter = logging.Formatter(fmt_shell) 22 | file_formatter = logging.Formatter(fmt_file) 23 | # here we hook everything together 24 | shell_handler.setFormatter(shell_formatter) 25 | file_handler.setFormatter(file_formatter) 26 | 27 | logger.addHandler(shell_handler) 28 | logger.addHandler(file_handler) -------------------------------------------------------------------------------- /virgool/run_virgool.sh: -------------------------------------------------------------------------------- 1 | while true; do 2 | read -p "Do you want to build index.txt file? (This is required for the first time. Crawler uses index.txt as starting point) " yn 3 | case $yn in 4 | [Yy]* ) rm virgool/index.txt 5 | scrapy runspider --set FEED_EXPORT_ENCODING=utf-8 virgool.py -o virgool/kaggle/virgool.csv -a gather_index_pages=True; exit;; 6 | [Nn]* ) echo "You chose not to build index.txt file. Going forwrd to crawl the virgool"; break;; 7 | * ) echo "Please answer yes or no.";; 8 | esac 9 | done 10 | 11 | while true; do 12 | read -p "Do you wish to crawl from scratch? " yn 13 | case $yn in 14 | [Yy]* ) rm virgool/kaggle/virgool.csv 15 | scrapy runspider --set FEED_EXPORT_ENCODING=utf-8 virgool.py -o virgool/kaggle/virgool.csv; break;; 16 | [Nn]* ) exit;; 17 | * ) echo "Please answer yes or no.";; 18 | esac 19 | done 20 | 21 | -------------------------------------------------------------------------------- /wikipedia/run_wikipedia.sh: -------------------------------------------------------------------------------- 1 | while true; do 2 | read -p "Do you want to build index.txt file? (This is required for the first time. 
    case $yn in
        [Yy]* ) rm wikipedia/index.txt
                scrapy runspider --set FEED_EXPORT_ENCODING=utf-8 wikipedia.py -o wikipedia/kaggle/wikipedia.csv -a gather_index_pages=True; exit;;
        [Nn]* ) echo "You chose not to build the index.txt file. Going forward to crawl Wikipedia"; break;;
        * ) echo "Please answer yes or no.";;
    esac
done

while true; do
    read -p "Do you wish to crawl from scratch? " yn
    case $yn in
        [Yy]* ) rm wikipedia/kaggle/wikipedia.csv
                scrapy runspider --set FEED_EXPORT_ENCODING=utf-8 wikipedia.py -o wikipedia/kaggle/wikipedia.csv; break;;
        [Nn]* ) exit;;
        * ) echo "Please answer yes or no.";;
    esac
done

--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2022 Amir Pourmand

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
[![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://www.kaggle.com/amirpourmand/datasets)

# Crawler
Open-source crawler for Persian websites. Websites crawled so far:
- [Asriran](https://www.kaggle.com/datasets/amirpourmand/asriran-news)
- [Fa-Wikipedia](https://www.kaggle.com/datasets/amirpourmand/fa-wikipedia)
- [Tasnim](https://www.kaggle.com/datasets/amirpourmand/tasnimdataset)
- [Isna](https://www.kaggle.com/datasets/amirpourmand/isna-news)

### Asriran

```bash
asriran/run_asriran.sh
```

> You can change some parameters of this crawler. See `run_asriran.sh`.

### Fa-Wikipedia

Due to some problems in crawling, I split this job into two stages: first crawling all index pages, and then using those pages for the actual crawl.
```bash
wikipedia/run_wikipedia.sh
```

### Tasnim News
This crawler saves [Tasnim News](https://www.tasnimnews.com/) pages based on category. This is appropriate for text classification tasks, as the data is relatively balanced across all categories. I selected an equal number of pages per category.

> There is a parameter called `number_of_pages` in `tasnim.py` which controls how many pages we crawl in each category (see the sketch below for limiting a quick test run).
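
Since `number_of_pages` is a class attribute rather than a command-line argument, limiting a quick test run means either editing that value in `tasnim.py` or capping the crawl from the CLI. The following is only a sketch of the second option, using Scrapy's standard `CLOSESPIDER_ITEMCOUNT` setting (part of Scrapy's close-spider extension, not something defined in this repo); the output path is just an example:

```bash
# Stop after roughly 200 scraped items instead of crawling every page of every category.
scrapy runspider --set FEED_EXPORT_ENCODING=utf-8 \
    --set CLOSESPIDER_ITEMCOUNT=200 \
    tasnim.py -o /tmp/tasnim-sample.csv
```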

```bash
tasnim/run_tasnim.sh
```

Datasets are all available for download at [Kaggle](https://www.kaggle.com/amirpourmand/datasets).

CSS selectors are mostly extracted via [Copy Css Selector](https://chrome.google.com/webstore/detail/copy-css-selector/kemkenbgbgodoglfkkejbdcpojnodnkg?hl=en).

Troubleshooting references:
- https://stackoverflow.com/questions/73859249/attributeerror-module-openssl-ssl-has-no-attribute-sslv3-method
- https://stackoverflow.com/a/73867925/4201765
--------------------------------------------------------------------------------
/tarjoman.py:
--------------------------------------------------------------------------------
import scrapy
from pathlib import Path
from logger import logger
import json
from trafilatura import extract

class TarjomanSpider(scrapy.Spider):
    name = "Tarjoman"

    custom_settings = {'AUTOTHROTTLE_ENABLED': True,
                       'HTTPCACHE_ENABLED': True,
                       'CONCURRENT_REQUESTS': 100,
                       'CONCURRENT_REQUESTS_PER_DOMAIN': 100,
                       }

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.start_urls = []
        if Path('tarjoman/index.txt').exists():
            text = Path('tarjoman/index.txt').read_text(encoding='utf-8')
            # https://github.com/pourmand1376/PersianCrawler/issues/6
            items = json.loads(text)
            self.start_urls = items

        logger.info(f'{len(self.start_urls)} urls fetched.')

    def parse(self, response, **kwargs):
        try:
            item = {'title': response.css('div h1::text').get(),
                    'text': extract(response.body.decode('utf-8'), deduplicate=True, include_images=False,
                                    include_comments=False, include_links=False),
                    'url': response.url,
                    }
            if item['title'] is None or len(item['title']) == 0:
                return scrapy.Request(url=response.url, callback=self.parse)
            return item
        except Exception:
            logger.error("Parsing Error: ", exc_info=True)

    def handle_error(self, failure):
        logger.warning("Error: %s", failure.request.url)
        yield scrapy.Request(
            url=failure.request.url,
            dont_filter=True,
            callback=self.parse,
            errback=self.handle_error)
--------------------------------------------------------------------------------
/ensani.py:
--------------------------------------------------------------------------------
from logger import logger
import scrapy
import logging
from scrapy.utils.log import configure_logging


class EnsaniSpider(scrapy.Spider):
    name = "Ensani"

    start_urls = []
    custom_settings = {'AUTOTHROTTLE_ENABLED': True,
                       'HTTPCACHE_ENABLED': True,
                       'CONCURRENT_REQUESTS': 30,
                       'CONCURRENT_REQUESTS_PER_DOMAIN': 30,
                       }

    configure_logging(install_root_handler=False)
    logging.basicConfig(
        filename='log.txt',
        format='%(levelname)s: %(message)s',
        level=logging.INFO
    )

    def __init__(self, **kwargs):
        # all pages
        super().__init__(**kwargs)
        try:
            self.start_urls = []
            end_page = 21141
            for i in range(1, end_page + 1):
                url = f"https://ensani.ir/fa/article/field/3363?ArticleSearch%5BpageSize%5D=20&ArticleSearch%5BscientificRank%5D=&ArticleSearch%5BjournalId%5D=&ArticleSearch%5Byear%5D=&ArticleSearch%5Blanguage%5D=1&ArticleSearch%5Btitle%5D=&ArticleSearch%5BsortBy%5D=&page={i}"
                self.start_urls.append(url)
            logger.info('urls are appended')

        except Exception:
            logger.error('error', exc_info=True)

    def parse(self, response, **kwargs):
        try:
            abstract_array = response.css('div.well.collapse *::text')
            for item in abstract_array:
                output = {'text': item.get()}
                yield output
        except Exception:
            logger.error("Parsing Error: ", exc_info=True)

    def handle_error(self, failure):
        logger.warning("Error: %s", failure.request.url)
        yield scrapy.Request(
            url=failure.request.url,
            dont_filter=True,
            callback=self.parse,
            errback=self.handle_error)
--------------------------------------------------------------------------------
/asriran.py:
--------------------------------------------------------------------------------
import scrapy
import re
from logger import logger

class AsriranSpider(scrapy.Spider):
    name = "Asriran"
    number_of_pages = 8313

    main_url = "https://www.asriran.com"

    def __init__(self, from_page, to_page, from_date, to_date):
        self.start_urls = []

        from_page = int(from_page)
        to_page = int(to_page)

        for i in range(from_page, to_page):
            self.start_urls.append(
                f"https://www.asriran.com/fa/archive?rpp=100&p={i}&from_date={from_date}&to_date={to_date}"
            )

        logger.info('urls are appended')


    def parse(self, response):
        try:
            for news in response.css("body#archive div.inner-content a::attr(href)").getall():
                yield scrapy.Request(
                    self.main_url+news,
                    callback=self.parse_news,
                )
        except Exception:
            logger.error("Parsing Error: ", exc_info=True)

    def parse_news(self, response):
        try:
            item = {
                'title': response.css('body#news div.title > h1 > a::text').get().strip(),
                'shortlink': self.main_url+response.css('body#news div.short-link.row > a::attr(href)').get(),
                'time': response.css('body#news div.news_nav.news_pdate_c.iconMobileN::text').getall()[1].strip(),
                # default to '' so the re.sub/strip cleanup below does not fail on missing selectors
                'service': response.css('body#news div:nth-child(5) > div:nth-child(2) > div:nth-child(1) > div > a:nth-child(1)::text').get(''),
                'subgroup': response.css('body#news div:nth-child(5) > div:nth-child(2) > div:nth-child(1) > div > a:nth-child(2)::text').get(''),
                'abstract': ' '.join(response.css('body#news div.subtitle::text').getall()),
                'body': ' '.join(response.css('body#news div.body > *::text').getall()),
            }

            for key in item:
                item[key] = re.sub(' +', ' ', item[key]).strip()

            yield item

        except Exception:
            logger.error("Error", exc_info=True)

--------------------------------------------------------------------------------
/tasnim.py:
--------------------------------------------------------------------------------
import scrapy

class TasnimSpider(scrapy.Spider):

    main_url = "https://www.tasnimnews.com"
    name = "Tasnim"
    base_urls = ['https://www.tasnimnews.com/fa/service/1/',  # politics (سیاسی)
                 'https://www.tasnimnews.com/fa/service/2/',  # social (اجتماعی)
                 'https://www.tasnimnews.com/fa/service/3/',  # sports (ورزشی)
                 'https://www.tasnimnews.com/fa/service/4/',  # culture and arts (فرهنگی هنری)
                 'https://www.tasnimnews.com/fa/service/6/',  # provinces (استان‌ها)
                 'https://www.tasnimnews.com/fa/service/7/',  # economy (اقتصادی)
                 'https://www.tasnimnews.com/fa/service/8/',  # international (بین الملل)
                 'https://www.tasnimnews.com/fa/service/9/',  # media (رسانه ها)
                 ]

    number_of_pages = 400
    def __init__(self):
        pages = [f"?page={i}" for i in range(1, self.number_of_pages)]
        self.start_urls = []
        for item in self.base_urls:
            for page in pages:
                self.start_urls.append(f"{item}{page}")

    def parse(self, response):
        categories = {'https://www.tasnimnews.com/fa/service/1/': 'سیاسی',
                      'https://www.tasnimnews.com/fa/service/2/': 'اجتماعی',
                      'https://www.tasnimnews.com/fa/service/3/': 'ورزشی',
                      'https://www.tasnimnews.com/fa/service/4/': 'فرهنگی هنری',
                      'https://www.tasnimnews.com/fa/service/6/': 'استان‌ها',
                      'https://www.tasnimnews.com/fa/service/7/': 'اقتصادی',
                      'https://www.tasnimnews.com/fa/service/8/': 'بین الملل',
                      'https://www.tasnimnews.com/fa/service/9/': 'رسانه ها',}

        for item in categories.keys():
            if response.url.startswith(item):
                category = categories[item]

        for news in response.css('article.list-item a::attr(href)').getall():
            request = scrapy.Request(
                self.main_url+news,
                callback=self.parse_news,
                cb_kwargs=dict(category=category))
            yield request


    def parse_news(self, response, category):
        item = {
            'category': category,
            'title': response.css('article.single-news h1.title::text').get(),
            'abstract': response.css('article.single-news h3.lead::text').get(),
            'body': ' '.join(response.css('article.single-news div.story p::text').getall()),
            'time': response.css('article.single-news div._sticky ul.list-inline li.time::text').get()
        }

        yield item
--------------------------------------------------------------------------------
/virgool.py:
--------------------------------------------------------------------------------
import scrapy
from pathlib import Path
from logger import logger
import re
from trafilatura import extract

def get_cookie(response):
    content = response.body.decode('utf-8')
    cookie_regex = r"document\.cookie\s*=\s*\'(.+); Max-Age"

    match = re.search(cookie_regex, content)
    if match:
        return match.group(1)
    return None


class VirgoolSpider(scrapy.Spider):
    name = "Virgool"
    number_of_pages = 30000

    custom_settings = {'AUTOTHROTTLE_ENABLED': True,
                       'HTTPCACHE_ENABLED': True,
                       'CONCURRENT_REQUESTS': 100,
                       'CONCURRENT_REQUESTS_PER_DOMAIN': 100,
                       }
    def __init__(self, gather_index_pages=False, **kwargs):
        super().__init__(**kwargs)
        self.start_urls = []
        self.gather_index_pages = gather_index_pages
        if not self.gather_index_pages:
            self.start_urls = Path('virgool/index.txt').read_text().split('\n')
        else:
            for i in range(0, self.number_of_pages + 1):
                self.start_urls.append(
                    f"https://virgool.io/?page={i}"
                )

        logger.info('urls are appended')

    def parse(self, response, **kwargs):
        try:
            if self.gather_index_pages:
                for item in range(1, 20):
                    url = response.css(f'main#app article:nth-child({item}) > div > a::attr(href)').get()
                    if url:
                        with Path('virgool/index.txt').open("a") as f:
                            f.write(url + '\n')
            else:
                item = {'title': response.css('main#app h1::text').get(),
                        'author': response.css('main#app div.module-header > a::text').get(),
                        'text': extract(response.body.decode('utf-8'), deduplicate=True, include_images=False, include_comments=False, include_links=False),
                        'url': response.css('.shorturl-text::text').get()
                        }

                # strip() would fail on None values, so check each one before cleaning it.
                for key in item:
                    if item[key]:
                        item[key] = re.sub(' +', ' ', item[key]).strip()

                return item
        except Exception:
            logger.error("Parsing Error: ", exc_info=True)

    def handle_error(self, failure):
        logger.warning("Error: %s", failure.request.url)
        yield scrapy.Request(
            url=failure.request.url,
            dont_filter=True,
            callback=self.parse,
            errback=self.handle_error)

--------------------------------------------------------------------------------
/wikipedia.py:
--------------------------------------------------------------------------------
import scrapy
import re
from logger import logger

import logging
from scrapy.utils.log import configure_logging
from pathlib import Path

class WikipediaSpider(scrapy.Spider):
    name = "Wikipedia"

    main_url = "https://fa.wikipedia.org/"

    custom_settings = {'AUTOTHROTTLE_ENABLED': True,
                       'HTTPCACHE_ENABLED': True,
                       'CONCURRENT_REQUESTS': 30,
                       'CONCURRENT_REQUESTS_PER_DOMAIN': 30,
                       }


    configure_logging(install_root_handler=False)
    logging.basicConfig(
        filename='log.txt',
        format='%(levelname)s: %(message)s',
        level=logging.INFO
    )

    def __init__(self, gather_index_pages=False):
        # all pages
        # index page of the Persian Wikipedia ("all pages" special page)
        try:
            self.start_urls = ["https://fa.wikipedia.org/w/index.php?title=%D9%88%DB%8C%DA%98%D9%87:%D8%AA%D9%85%D8%A7%D9%85_%D8%B5%D9%81%D8%AD%D9%87%E2%80%8C%D9%87%D8%A7"]
            self.gather_index_pages = gather_index_pages
            if not self.gather_index_pages:
                self.start_urls = Path('wikipedia/index.txt').read_text().split('\n')
        except Exception:
            logger.error('error', exc_info=True)

    def parse(self, response):
        try:
            # "صفحهٔ بعد" means "next page"
            next_page = response.css('div#mw-content-text > div:nth-child(2)> :contains("صفحهٔ بعد")::attr(href)').getall()
            if self.gather_index_pages and next_page:
                logger.info(f"Next page {next_page}")

                with Path('wikipedia/index.txt').open("a") as f:
                    f.write(self.main_url+next_page[0]+'\n')

                yield scrapy.Request(
                    self.main_url+next_page[0],
                    callback=self.parse,
                    dont_filter=True,
                    errback=self.handle_failure,
                )

            if not self.gather_index_pages:
                for article in response.css('div#mw-content-text > div.mw-allpages-body a::attr(href)').getall():
                    yield scrapy.Request(
                        self.main_url+article,
                        callback=self.parse_news,
                    )
        except Exception:
            logger.error("Parsing Error: ", exc_info=True)

    def handle_failure(self, failure):
        logger.warning("Error: %s", failure.request.url)
        yield scrapy.Request(
            url=failure.request.url,
            dont_filter=True,
            callback=self.parse,
            errback=self.handle_failure)

    def parse_news(self, response):
        item = {}
        try:
            item = {
                'title': response.css('#firstHeading *::text').get(),
                'content': ' '.join(response.css('div#mw-content-text > div.mw-parser-output > *:not(style):not(table)::text').getall()),
                'link': self.main_url+response.css('li#t-permalink > a::attr(href)').get(),
            }

            for key in item:
                item[key] = re.sub(' +', ' ', item[key]).strip()

            yield item

        except Exception:
            logger.error(f"Error {item}", exc_info=True)

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
#   For a library or package, you might want to ignore these files since the code is
#   intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# poetry
#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
#   This is especially recommended for binary packages to ensure reproducibility, and is more
#   commonly ignored for libraries.
#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
#   in version control.
#   https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
#  and can be added to the global gitignore or merged into this file. For a more nuclear
#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
*.csv
--------------------------------------------------------------------------------
/isna.py:
--------------------------------------------------------------------------------
import scrapy
import re
from logger import logger


class IsnaSpider(scrapy.Spider):

    name = "Isna"
    start_urls = []
    custom_settings = {'AUTOTHROTTLE_ENABLED': True,
                       #'HTTPCACHE_ENABLED': True,
                       # enabling the http cache quickly filled up the storage!
                       #'CONCURRENT_REQUESTS': 1000,
                       #'CONCURRENT_REQUESTS_PER_DOMAIN': 1000,
                       }


    main_url = "https://www.isna.ir"

    def __init__(self, from_year=1378, to_year=1401):
        # Set the range of values for the mn (month), dy (day), and yr (year) parameters
        mn_range = range(1, 13)   # months 1-12
        dy_range = range(1, 32)   # days 1-31
        yr_range = range(int(from_year), int(to_year)+1)
        pi_range = range(1, 101)
        # Create an empty list to store the start URLs
        self.start_urls = []

        # Loop through the possible values for each parameter
        for mn in mn_range:
            for dy in dy_range:
                for yr in yr_range:
                    for pi in pi_range:
                        # Construct the URL using string formatting
                        url = f"https://www.isna.ir/archive?pi={pi}&ms=0&dy={dy}&mn={mn}&yr={yr}"
                        # Add the URL to the start_urls list
                        self.start_urls.append(url)

        # Log that the start URLs are ready
        logger.info('urls are appended')

    def handle_failure(self, failure):
        logger.warning("Error: %s", failure.request.url)
        yield scrapy.Request(
            url=failure.request.url,
            dont_filter=True,
            callback=self.parse,
            errback=self.handle_failure)


    def parse(self, response):
        try:
            # ISNA's archive has an unresolved quirk: a deep page (page 50, for example) may also
            # show news from previous days, so those entries should really be filtered out.
            for news in response.css("div.items a::attr(href)").getall():
                if len(news.strip()) > 0:
                    yield scrapy.Request(
                        self.main_url+news,
                        callback=self.parse_news,
                    )
                    logger.info('added ' + self.main_url+news)
        except Exception:
            logger.error("Parsing Error: ", exc_info=True)

    def parse_news(self, response):
        try:
            # I used pip install scrapy and scrapy shell to help me generate this content.
            # scrapy shell
            # fetch(url)
            # then use response.css
            # also Copy CSS Selector was useful
            item = {
                'title': response.css('article#item h1::text').get(),
                'shortlink': response.css('input#short-url::attr(value)').get(),
                'time': response.css('article#item li:nth-child(1) > span.text-meta::text').get(),
                'service': response.css('article#item li:nth-child(2) > span.text-meta::text').get(),
                'news_id': response.css('article#item li:nth-child(3) > span.text-meta::text').get(),
                'reporter': response.css('article#item li:nth-child(1) > strong::text').get(),
                'managers': response.css('article#item li:nth-child(2) > strong::text').get(),
                'body': ' '.join(response.css('article#item div.item-body *::text').getall()),
            }

            # strip() would fail on None values, so check each one before cleaning it.
            for key in item:
                if item[key]:
                    item[key] = re.sub(' +', ' ', item[key]).strip()

            yield item

        except Exception:
            logger.error("Error", exc_info=True)

--------------------------------------------------------------------------------
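
Before launching the full 1378-1401 ISNA crawl from `run_isna.sh`, it can help to smoke-test the spider on a single year using the same `-a` arguments; a minimal sketch (the output path here is only an example):

```bash
# Crawl only year 1400 to verify that the CSS selectors still match the site.
scrapy runspider --set FEED_EXPORT_ENCODING=utf-8 isna.py \
    -o /tmp/isna-sample.csv -a from_year=1400 -a to_year=1400

# Rough sanity check of how many rows came back.
wc -l /tmp/isna-sample.csv
```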