.
├── paper_spiders
│   ├── __init__.py
│   ├── utils
│   │   ├── __init__.py
│   │   └── paperlist.py
│   ├── spiders
│   │   ├── __init__.py
│   │   └── paper_spider.py
│   ├── items.py
│   ├── settings.py
│   ├── pipelines.py
│   └── middlewares.py
├── scrapy.cfg
└── .gitignore

/paper_spiders/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/paper_spiders/utils/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/paper_spiders/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = paper_spiders.settings

[deploy]
#url = http://localhost:6800/
project = paper_spiders
--------------------------------------------------------------------------------
/paper_spiders/items.py:
--------------------------------------------------------------------------------
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class PaperSpidersItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
--------------------------------------------------------------------------------
/paper_spiders/spiders/paper_spider.py:
--------------------------------------------------------------------------------
import scrapy
from scrapy.http import Response, Request
from ..utils.paperlist import paper_list


class PaperSpider(scrapy.Spider):
    name = "paper_spider"

    def start_requests(self):
        # One request per conference track page; the whole entry (conf + url)
        # is forwarded to parse() via cb_kwargs.
        for p in paper_list:
            yield Request(url=p["url"], callback=self.parse, cb_kwargs=p)

    def parse(self, response: Response, **kwargs):
        # The track pages list accepted papers in the #event-overview table;
        # the second cell of each row holds the title link and the authors
        # inside a div with class "performers".
        table = response.xpath('//*[@id="event-overview"]/table')
        papers = table.xpath("tr/td[2]")
        for paper in papers:
            title = paper.xpath("a[1]/text()").get()
            author_list = paper.xpath('.//div[@class="performers"]/a')
            author = ", ".join([a.xpath("text()").get() for a in author_list])
            yield {"conf": kwargs["conf"], "title": title, "author": author}
--------------------------------------------------------------------------------
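
Note: the spider is presumably run with the Scrapy CLI (e.g. `scrapy crawl paper_spider`).
For completeness, here is a minimal sketch of driving it programmatically instead. It
assumes the snippet is executed from the repository root (next to scrapy.cfg) with Scrapy
installed; nothing below is part of the project itself.

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    # Load the project settings (resolved via scrapy.cfg) and run the spider by name.
    process = CrawlerProcess(get_project_settings())
    process.crawl("paper_spider")
    process.start()  # blocks until the crawl is finished
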
19 | "url": "https://conf.researchr.org/track/icse-2021/icse-2021-papers", 20 | }, 21 | {"conf": "FSE 2021", "url": "https://2021.esec-fse.org/track/fse-2021-papers"}, 22 | { 23 | "conf": "ASE 2021", 24 | "url": "https://conf.researchr.org/track/ase-2021/ase-2021-papers", 25 | }, 26 | { 27 | "conf": "ISSTA 2021", 28 | "url": "https://conf.researchr.org/track/issta-2021/issta-2021-technical-papers", 29 | }, 30 | # 2022 31 | { 32 | "conf": "ICSE 2022", 33 | "url": "https://conf.researchr.org/track/icse-2022/icse-2022-papers", 34 | }, 35 | { 36 | "conf": "FSE 2022", 37 | "url": "https://2022.esec-fse.org/track/fse-2022-research-papers", 38 | }, 39 | { 40 | "conf": "ASE 2022", 41 | "url": "https://conf.researchr.org/track/ase-2022/ase-2022-research-papers", 42 | }, 43 | { 44 | "conf": "ISSTA 2022", 45 | "url": "https://conf.researchr.org/track/issta-2022/issta-2022-technical-papers", 46 | }, 47 | # 2023 48 | { 49 | "conf": "ICSE 2023", 50 | "url": "https://conf.researchr.org/track/icse-2023/icse-2023-technical-track", 51 | }, 52 | { 53 | "conf": "FSE 2023", 54 | "url": "https://2023.esec-fse.org/track/fse-2023-research-papers", 55 | }, 56 | { 57 | "conf": "ASE 2023", 58 | "url": "https://conf.researchr.org/track/ase-2023/ase-2023-papers", 59 | }, 60 | { 61 | "conf": "ISSTA 2023", 62 | "url": "https://2023.issta.org/track/issta-2023-technical-papers", 63 | }, 64 | # 2024 65 | { 66 | "conf": "ICSE 2024", 67 | "url": "https://conf.researchr.org/track/icse-2024/icse-2024-research-track", 68 | }, 69 | { 70 | "conf": "FSE 2024", 71 | "url": "https://2024.esec-fse.org/track/fse-2024-research-papers", 72 | }, 73 | {"conf": "ISSTA 2024", "url": "https://2024.issta.org/track/issta-2024-papers"}, 74 | { 75 | "conf": "ASE 2024", 76 | "url": "https://conf.researchr.org/track/ase-2024/ase-2024-research", 77 | }, 78 | # 2025 79 | { 80 | "conf": "ICSE 2025", 81 | "url": "https://conf.researchr.org/track/icse-2025/icse-2025-research-track", 82 | }, 83 | { 84 | "conf": "FSE 2025", 85 | "url": "https://conf.researchr.org/track/fse-2025/fse-2025-research-papers", 86 | }, 87 | { 88 | "conf": "ISSTA 2025", 89 | "url": "https://conf.researchr.org/track/issta-2025/issta-2025-papers", 90 | }, 91 | { 92 | "conf": "ASE 2025", 93 | "url": "https://conf.researchr.org/track/ase-2025/ase-2025-papers", 94 | }, 95 | # 2026 96 | { 97 | "conf": "ICSE 2026", 98 | "url": "https://conf.researchr.org/track/icse-2026/icse-2026-research-track", 99 | }, 100 | ] 101 | 102 | paper_url_list = [p["url"] for p in paper_list] 103 | -------------------------------------------------------------------------------- /paper_spiders/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for paper_spiders project 2 | # 3 | # For simplicity, this file contains only settings considered important or 4 | # commonly used. 
/paper_spiders/settings.py:
--------------------------------------------------------------------------------
# Scrapy settings for paper_spiders project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'paper_spiders'

SPIDER_MODULES = ['paper_spiders.spiders']
NEWSPIDER_MODULE = 'paper_spiders.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'paper_spiders (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'paper_spiders.middlewares.PaperSpidersSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'paper_spiders.middlewares.PaperSpidersDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'paper_spiders.pipelines.PaperToMarkdownPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = '2.7'
TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'
--------------------------------------------------------------------------------
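
Note: ITEM_PIPELINES above routes every scraped item through PaperToMarkdownPipeline.
For ad-hoc runs where only the raw records are wanted, Scrapy's built-in feed exports
could be used as an alternative; the setting below is only a sketch, and the output path
is illustrative rather than something this project defines.

    # Optional alternative to the custom pipeline: write items straight to JSON Lines.
    FEEDS = {
        "papers_raw.jsonl": {"format": "jsonlines", "overwrite": True},
    }
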
/paper_spiders/pipelines.py:
--------------------------------------------------------------------------------
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import scrapy
from itemadapter import ItemAdapter
from typing import Dict, Tuple, List
from .utils.paperlist import paper_list
import os
import orjson
from functools import reduce


def jsonline2md(jsonline: list[dict], header: List[str]) -> str:
    # Render a list of dicts as a Markdown table whose columns follow `header`.
    md = ""
    for h in header:
        md += f"| {h} "
    md += "|\n"
    for h in header:
        md += "| --- "
    md += "|\n"
    for j in jsonline:
        for h in header:
            md += f"| {j[h]} "
        md += "|\n"
    return md


class PaperToMarkdownPipeline:
    def __init__(self):
        self.content = []
        self.jsonl_path = "./paper_spiders/papers.jsonl"
        self.md_path = "./paper_spiders/papers.md"

    def _update_and_sort(self):
        # Merge the items scraped in this run with any previously saved ones.
        if os.path.exists(self.jsonl_path):
            with open(self.jsonl_path, "r") as f:
                old_content = f.readlines()
                old_content = [orjson.loads(x) for x in old_content]
                self.content = self.content + old_content

        # some filter rules: strip "[Remote] " markers, drop duplicates by title,
        # and skip incomplete or Q&A placeholder entries.
        _content = []
        title_set = set()
        for item in self.content:
            item["title"] = item["title"].replace("[Remote] ", "")
            if item["title"] in title_set:
                continue
            title_set.add(item["title"])

            if item["conf"] == "" or item["title"] == "" or item["author"] == "" or item["title"].startswith("Q&A ("):
                continue

            _content.append(
                {
                    "conf": item["conf"],
                    "title": item["title"],
                    "author": item["author"],
                }
            )

        # Sort by year (newest first), then by conference series, then by title.
        self.content = sorted(
            _content,
            key=lambda x: (
                -1 * int(x["conf"].split(" ")[-1]),  # year
                ["ICSE", "FSE", "ASE", "ISSTA"].index(x["conf"].split(" ")[0]),  # series
                x["title"],  # title
            ),
        )

        with open(self.jsonl_path, "w") as f:
            for c in self.content:
                f.write(orjson.dumps(c).decode("utf-8") + "\n")

    def process_item(self, item: Dict, spider):
        conf, title, author = item["conf"], item["title"], item["author"]
        self.content.append({"conf": conf, "title": title, "author": author})
        return item

    def open_spider(self, spider: scrapy.Spider):
        spider.log("spider open")
        pass

    def close_spider(self, spider):
        spider.log("spider close")
        self._update_and_sort()
        md = jsonline2md(self.content, ["conf", "title", "author"])
        with open(self.md_path, "w") as f:
            f.write(md)

        # update the README.md
        with open("README.md", "r+") as f:
            readme = f.read()
            start_idx = readme.find("### Papers\n")
            end_idx = readme.find("\n### Acknowledgments\n")

            # Rename only the generated table's header cells; a global replace over
            # the whole README would also mangle titles that contain "conf",
            # "title", or "author" as substrings.
            md = md.replace("| conf ", "| Conference ", 1).replace("| title ", "| Title ", 1).replace("| author ", "| Authors ", 1)
            readme = readme[: start_idx + 11] + md + readme[end_idx:]
            f.seek(0)
            f.write(readme)
            f.truncate()
--------------------------------------------------------------------------------
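
Note: a quick, standalone illustration of the Markdown that jsonline2md produces,
assuming the project's dependencies (scrapy, itemadapter, orjson) are installed so the
module imports cleanly; the record below is made up purely for demonstration.

    from paper_spiders.pipelines import jsonline2md

    # One fabricated record, rendered with the same column order the pipeline uses.
    rows = [{"conf": "ICSE 2024", "title": "Example Paper", "author": "Ada Lovelace"}]
    print(jsonline2md(rows, ["conf", "title", "author"]))
    # | conf | title | author |
    # | --- | --- | --- |
    # | ICSE 2024 | Example Paper | Ada Lovelace |
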
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
**/__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
--------------------------------------------------------------------------------
/paper_spiders/middlewares.py:
--------------------------------------------------------------------------------
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class PaperSpidersSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn’t have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)


class PaperSpidersDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
--------------------------------------------------------------------------------