├── .gitignore ├── Makefile ├── README.md ├── author_cliques.py ├── check_author.py ├── docs └── index.html ├── parse_dblp.py ├── pubs.py ├── templates ├── cliques.html ├── top-authors.html └── top-index.html └── top_authors.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Downloaded files 2 | dblp.xml.gz 3 | dblp.dtd 4 | dblp-aliases.csv 5 | csrankings.csv 6 | 7 | # Created files 8 | pickle/* 9 | www/* 10 | __pycache__/* 11 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: process topauthors cliques fresh all deploy 2 | 3 | all: fresh process 4 | # update data and process new data 5 | 6 | process: topauthors cliques 7 | 8 | topauthors: 9 | python3 top_authors.py 10 | 11 | cliques: 12 | python3 author_cliques.py 13 | 14 | fresh: 15 | # freshen raw data files (checking timestamps) 16 | wget -N https://dblp.uni-trier.de/xml/dblp.xml.gz 17 | wget -N https://dblp.uni-trier.de/xml/dblp.dtd 18 | #wget -N https://raw.githubusercontent.com/emeryberger/CSrankings/gh-pages/dblp-aliases.csv 19 | wget -N https://raw.githubusercontent.com/emeryberger/CSrankings/gh-pages/csrankings.csv 20 | mkdir -p pickle 21 | mkdir -p www 22 | # get the pickling started 23 | python3 parse_dblp.py 24 | 25 | deploy: 26 | for i in www/*.html; do \ 27 | echo $$i ; \ 28 | gzip -f -9 -k $$i ; \ 29 | done 30 | unison -prefer=newer -batch www/ ssh://ghul.albtraum.org/pubstats/ 31 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Publication statistics 2 | 3 | This repository computes simple publication statistics for a set of top conferences. 4 | 5 | Using the DBLP data set, we extract publications at the top conferences and then aggregate them 6 | on a per-author basis. Based on different subgroups (e.g., security, embedded 7 | systems, or OS) we then calculate per-author statistics and present them in an overview. 8 | 9 | Processing happens in two stages: 10 | 11 | * `parse_dblp.py` extracts all publications and dumps them into pickle files 12 | based on the per-area aggregation (this is slow as DBLP is a 3GB XML file). 13 | To be able to process such a large XML file, we use a streaming parser that 14 | simply collects interesting publications into `Pub` objects (see `pubs.py`). A short example of loading these pickle files is shown below. 15 | * `top_authors.py` leverages the pickle files to process per-area statistics 16 | and aggregate statistics. 17 | * `author_cliques.py` leverages the pickle files to calculate per-area author 18 | cliques. 19 | 20 | 21 | ## Using/Howto 22 | 23 | * Easy mode: check out the [homepage](https://hexhive.epfl.ch/pubstats/) 24 | * `make all` to download DBLP data, pickle it, and create the HTML data 25 | * `make fresh` to update DBLP data and pickle it 26 | * `make topauthors` to create the top author pages 27 | * `make cliques` to create the cliques 28 | 29 | 30 | ## Contributing 31 | 32 | Ideas, comments, or improvements are welcome! Please reach out to 33 | [Mathias Payer](mailto:mathias.payer@nebelwelt.net) to discuss. You can also 34 | reach out to [@gannimo on Twitter](https://www.twitter.com/gannimo).
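## Inspecting the pickled data

The per-area pickle files written by `parse_dblp.py` are plain lists of `Pub` objects, so they can be inspected directly from Python. The snippet below is a minimal sketch (it assumes `make fresh` has already been run from the repository root so that `pickle/pubs-<area>.pickle` exists):

```python
#!/usr/bin/python3
# Minimal example: load one per-area pickle and summarize it.
import pickle
from collections import Counter

from pubs import Pub  # Pub must be importable so pickle can reconstruct the objects

with open('pickle/pubs-sys_sec.pickle', 'rb') as f:
    pubs = pickle.load(f)

print('{} security publications'.format(len(pubs)))
# the five most frequent venues in the security area
print(Counter(pub.venue for pub in pubs).most_common(5))
```

For a single author, `check_author.py` prints all of their selected publications, e.g., `python3 check_author.py "Mathias Payer"`.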
35 | 36 | 37 | ## Changelog 38 | 39 | * 2023-08-21 random bugfixes and conference updates 40 | * 2023-02-06 adjusted SE/DB conferences based on feedback 41 | * 2021-02-09 fixed VLDB conference and added ICDE and PODS for the database 42 | community; added ASE and ISSTA for the software engineering community 43 | * 2021-01-11 added HPCA for architecture and adjusted paper length calculation for DAC 44 | * 2021-01-09 remove tutorials and short papers (by parsing pages data) 45 | * 2021-01-05 figures for overview page 46 | * 2021-01-04 new overview table across areas 47 | * 2021-01-02 added author cliques 48 | * 2020-12-30 first version with author statistics 49 | 50 | 51 | ## Acknowledgements 52 | 53 | This code and page was developed by [Mathias Payer](https://nebelwelt.net), 54 | initially over the 2020 holiday break. The site includes feedback and 55 | suggestions from too many to list, thank you for that! 56 | 57 | We use information from [DBLP](https://dblp.org/xml/) and 58 | [CSRankings](https://raw.githubusercontent.com/emeryberger/CSrankings/gh-pages/dblp-aliases.csv) 59 | for anti-aliasing of authors. The idea for the statistics was inspired by 60 | [Davide's Software Security Circus](http://s3.eurecom.fr/~balzarot/notes/top4_2019/). 61 | 62 | 63 | ## License 64 | 65 | All data in this repository is licensed under 66 | [CC BY-NC-ND 4.0](https://creativecommons.org/licenses/by-nc-nd/4.0/). 67 | -------------------------------------------------------------------------------- /author_cliques.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | import pickle 4 | import networkx as nx 5 | import matplotlib.pyplot as plt 6 | from math import ceil, sqrt 7 | from datetime import date 8 | 9 | from pubs import Pub, Author, CONFERENCES, AREA_TITLES 10 | 11 | def parse_author_cliques(pubs): 12 | authors = {} 13 | for pub in pubs: 14 | # create all author connections 15 | # iterate through each paper and count author1 -> author2 edges 16 | # we store the collected data in a hashmap of author1 -> author2 -> count 17 | for first_name in range(len(pub.authors)): 18 | for second_name in range(first_name + 1, len(pub.authors)): 19 | first_idx = first_name 20 | second_idx = second_name 21 | if pub.authors[first_name] > pub.authors[second_name]: 22 | second_idx = first_name 23 | first_idx = second_name 24 | if not pub.authors[first_idx] in authors: 25 | authors[pub.authors[first_idx]] = {} 26 | if not pub.authors[second_idx] in authors[pub.authors[first_idx]]: 27 | authors[pub.authors[first_idx]][pub.authors[second_idx]] = 0 28 | authors[pub.authors[first_idx]][pub.authors[second_idx]] += 1 29 | return authors 30 | 31 | def parse_graph(authors, num_edges = 10, fname = ''): 32 | G = nx.Graph() 33 | #plt.figure(num=None, figsize=(20, 20), dpi=100) 34 | #plt.axis('off') 35 | #fig = plt.figure(1) 36 | 37 | author_set = [] 38 | # count all authors with at least num_edges edges: 39 | for author1 in authors: 40 | for author2 in authors[author1]: 41 | if authors[author1][author2] >= num_edges: 42 | if author1 not in author_set: 43 | author_set.append(author1) 44 | G.add_node(author1) 45 | if author2 not in author_set: 46 | author_set.append(author2) 47 | G.add_node(author2) 48 | G.add_edge(author1, author2, weight=authors[author1][author2]) 49 | # break large graph into subgraphs (for disconnected parts) 50 | sub_graphs = list(G.subgraph(c) for c in nx.connected_components(G)) 51 | # if, insted of sub graphs, we want one graph for all cliques, use 
the following code: 52 | ##pos=nx.spring_layout(G, k=5, scale=9, iterations=500) 53 | ##nx.draw_networkx_nodes(G, pos) 54 | ##nx.draw_networkx_labels(G, pos) 55 | ##nx.draw_networkx_edges(G, pos) 56 | nr_cliques = len(sub_graphs) 57 | print('Found {} cliques'.format(nr_cliques)) 58 | x_cliques = ceil(sqrt(nr_cliques)) 59 | y_cliques = ceil(nr_cliques / x_cliques) 60 | fig, axes = plt.subplots(nrows=x_cliques, ncols=y_cliques, figsize=(20,20)) 61 | #fig.suptitle(fname) 62 | if (len(sub_graphs) == 1): 63 | ax = [axes] 64 | else: 65 | ax = axes.flatten() 66 | for i in range(len(sub_graphs)): 67 | # create a graph layout based on feedback for each sub graph 68 | pos=nx.spring_layout(sub_graphs[i], k=0.1, scale=0.6, iterations=80) 69 | nx.draw_networkx_nodes(sub_graphs[i], pos, ax=ax[i]) 70 | nx.draw_networkx_labels(sub_graphs[i], pos, ax=ax[i]) 71 | nx.draw_networkx_edges(sub_graphs[i], pos, ax=ax[i]) 72 | # adjust borders for sub graph (because we have long lables that spill) 73 | xmin, xmax, ymin, ymax = ax[i].axis() 74 | ax[i].set(xlim = (xmin-0.5, xmax+0.5), ylim=(ymin, ymax)) 75 | ax[i].set_axis_off() 76 | # disable axes for blank subplots (if we have remaining space) 77 | for i in range(len(sub_graphs), x_cliques*y_cliques): 78 | ax[i].set_axis_off() 79 | fig.tight_layout() 80 | 81 | # adjust borders for single graph case 82 | #l,r = plt.xlim() 83 | #plt.xlim(l-1, r+1) 84 | #t, b = plt.ylim() 85 | #plt.ylim(t-1, b+1) 86 | plt.savefig('www/'+fname, bbox_inches="tight") 87 | #plt.show() 88 | del fig 89 | 90 | if __name__ == '__main__': 91 | all_pubs = [] 92 | AREAPUBS = 10 93 | ALLPUBS = 20 94 | for area in CONFERENCES: 95 | # Load pickeled data 96 | with open('pickle/pubs-{}.pickle'.format(area), 'rb') as f: 97 | pubs = pickle.load(f) 98 | f.close() 99 | all_pubs += pubs 100 | 101 | # Prepare per-author information 102 | authors = parse_author_cliques(pubs) 103 | print('Analyzed a total of {} authors'.format(len(authors))) 104 | 105 | # Create and draw graph 106 | parse_graph(authors, num_edges=AREAPUBS, fname=area) 107 | 108 | # Prepare per-author information 109 | authors = parse_author_cliques(all_pubs) 110 | print('Analyzed a total of {} authors'.format(len(authors))) 111 | 112 | # Create and draw graph 113 | parse_graph(authors, num_edges=ALLPUBS, fname='all') 114 | 115 | content = '
' 116 | for area in CONFERENCES: 117 | content = content + 'Cliques in '+AREA_TITLES[area]+'' 118 | content = content + 'All cliques
' 119 | 120 | template = open('templates/cliques.html', 'r').read() 121 | template = template.replace('XXXTITLEXXX', 'Author cliques') 122 | template = template.replace('XXXCONTENTXXX', content) 123 | template = template.replace('XXXSHAREDXXX', str(AREAPUBS)) 124 | template = template.replace('XXXSHARED2XXX', str(ALLPUBS)) 125 | template = template.replace('XXXDATEXXX', date.today().strftime("%Y-%m-%d")) 126 | fout = open('www/cliques.html', 'w') 127 | fout.write(template) 128 | -------------------------------------------------------------------------------- /check_author.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | import pickle 4 | from statistics import median, mean 5 | from datetime import date 6 | import csv 7 | import matplotlib.pyplot as plt 8 | import sys 9 | 10 | from pubs import Pub, Author, CONFERENCES, CONFERENCES_SHORT, AREA_TITLES 11 | from top_authors import parse_authors 12 | 13 | if __name__ == '__main__': 14 | all_pubs = [] 15 | top_values = {} 16 | if len(sys.argv) != 2: 17 | print('Print all publications of an author. Call this script with {} "NAME"'.format(sys.argv[0])) 18 | exit(1) 19 | for area in CONFERENCES: 20 | # Load pickeled data 21 | with open('pickle/pubs-{}.pickle'.format(area), 'rb') as f: 22 | pubs = pickle.load(f) 23 | f.close() 24 | all_pubs += pubs 25 | 26 | print('# {}\'s publications in {}'.format(sys.argv[1], area)) 27 | 28 | auth_pubs, _, _ = parse_authors(pubs) 29 | if sys.argv[1] not in auth_pubs: 30 | continue 31 | 32 | author = auth_pubs[sys.argv[1]] 33 | for year in sorted(author.years): 34 | for pub in author.pubs[year]: 35 | print('{}, {}, {}'.format(pub.title, pub.venue, year)) 36 | -------------------------------------------------------------------------------- /docs/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Pubstats 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /parse_dblp.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | import lxml.etree as ET 4 | from gzip import GzipFile 5 | import pickle 6 | import csv 7 | import re 8 | 9 | from pubs import Pub, Author, CONFERENCES, CONFERENCES_NUMBER 10 | 11 | MIN_PAPER_PAGES = 6 12 | 13 | def get_nr_pages(pages, title, venue, year): 14 | start = '' 15 | end = '' 16 | addon = 0 17 | # we don't know, so assume it's a paper 18 | if pages == '': 19 | # special casing 20 | if venue == 'USENIX Security Symposium': 21 | return MIN_PAPER_PAGES 22 | if venue == 'USENIX Annual Technical Conference' and (year==1998 or year==2007 or year==2009 or year==2010 or year==2011 or year==2016 or year==2017 or year==2019): 23 | return MIN_PAPER_PAGES 24 | if venue == 'USENIX Annual Technical Conference, General Track' and (year==2006): 25 | return MIN_PAPER_PAGES 26 | if venue == 'USENIX ATC' and (year==2011): 27 | return MIN_PAPER_PAGES 28 | if venue == 'FAST' and (year==2003 or year==2005 or year==2007): 29 | return MIN_PAPER_PAGES 30 | if venue == 'DAC' and (year<=1980): 31 | return MIN_PAPER_PAGES 32 | if venue == 'OSDI' and (year==2002): 33 | return MIN_PAPER_PAGES 34 | if venue == 'ICCAD' and (year==2001): 35 | return MIN_PAPER_PAGES 36 | if venue == 'MobiSys' and (year==2003 or year==2004): 37 | return MIN_PAPER_PAGES 38 | if venue == 'NDSS': 39 | # TODO this includes NDSS keynotes as papers. 
40 | # The lack of an tag in the same inproceedings entry may indicate that it's a keynote (checked for 01) 41 | return MIN_PAPER_PAGES 42 | if venue == 'NSDI' and (year==2005 or year==2006 or year==2007 or year==2011 or year==2024): 43 | return MIN_PAPER_PAGES 44 | if venue == 'SC' and (year==2009): 45 | return MIN_PAPER_PAGES 46 | if venue == 'VLDB' and (year==2001 or year==2002): 47 | return MIN_PAPER_PAGES 48 | if title.startswith('Front Matter') or title.startswith('Letter from') or title.startswith('Message from') or title.startswith('Session details') or title.startswith('Welcome Message'): 49 | return 0 50 | print('No pages: "{}" ({}, {})'.format(title, venue, year)) 51 | return 0 52 | # find from/to delimeter (or assume it's just one page) 53 | if pages.find('-') != -1: 54 | start = pages[0:pages.find('-')] 55 | end = pages[pages.find('-')+1:] 56 | # special casing 57 | if venue == 'HPDC' and (year==2001 or year==2002) and end=='': 58 | return MIN_PAPER_PAGES 59 | if venue == 'ICCAD' and (year==2001) and end=='': 60 | return MIN_PAPER_PAGES 61 | if venue == 'IEEE Symposium on Security and Privacy' and (year==2004 or year==2003) and end=='': 62 | return MIN_PAPER_PAGES 63 | if venue == 'ISCA' and (year==2002) and end=='': 64 | return MIN_PAPER_PAGES 65 | else: 66 | return 1 67 | if pages.startswith('i-'): 68 | return 1 69 | # check for format 90:1-90:28 (e.g., used in journals) 70 | if start.find(':') != -1: 71 | start = start[start.find(':')+1:] 72 | if end.find(':') != -1: 73 | end = end[end.find(':')+1:] 74 | # if we have two ranges, recurse 75 | if start.find(',') != -1: 76 | addon = get_nr_pages(start[start.find(',')+1:].strip(), title, venue, year) 77 | start = start[0:start.find(',')] 78 | if end.find(',') != -1: 79 | addon = get_nr_pages(end[end.find(',')+1:].strip(), title, venue, year) 80 | end = end[0:end.find(',')] 81 | if not start.isnumeric() or not end.isnumeric(): 82 | print('Non-numeric characters: "{}" {} ({}, {})'.format(pages, title, venue, year)) 83 | start = re.sub('[^0-9]','', start) 84 | end = re.sub('[^0-9]','', end) 85 | # double check that none of the ranges are empty 86 | if start=='' or end=='': 87 | print('Single page: "{}" {} ({}, {})'.format(pages, title, venue, year)) 88 | return 1 89 | return int(end) - int(start) + addon + 1 90 | 91 | def parse_dblp(dblp_file = './dblp.xml.gz'): 92 | pubs = {} 93 | for area in CONFERENCES: 94 | pubs[area] = [] 95 | in_pub = False # flag marking if we're parsing a publication 96 | total_pub = 0 97 | selected_pub = 0 98 | authors = [] 99 | title = '' 100 | venue = '' 101 | number = '' 102 | pages = '' 103 | year = 1900 104 | unhandled_venues = set() 105 | 106 | # author affiliations 107 | affiliations = {} 108 | all_authors = set() # authors of our selected conferences 109 | author_homepage = '' 110 | author_affiliation = '' 111 | total_affiliations = 0 112 | in_www = False # flag marking if we're parsing affiliation information 113 | 114 | # author aliases 115 | aliases = {} 116 | 117 | dblp_stream = GzipFile(filename=dblp_file) 118 | # Writing streaming XML parsers is fun... 
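    # iterparse streams ('start', elem) / ('end', elem) events instead of building the
    # whole ~3GB XML tree in memory; load_dtd=True pulls in dblp.dtd so DBLP's character
    # entities resolve. On 'start' we only note whether we are inside a publication or a
    # www (person/affiliation) record, the actual fields are collected on the matching
    # 'end' tags, and elem.clear() at the bottom of the loop releases each processed
    # element so memory stays bounded.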
119 | for event, elem in ET.iterparse(dblp_stream, events = ('start', 'end',), load_dtd = True): 120 | # mark header tags 121 | if event == 'start': 122 | if elem.tag == 'inproceedings' or elem.tag == 'article': 123 | in_pub = True 124 | if elem.tag == 'www': 125 | in_www = True 126 | # process individual closing tags 127 | if event == 'end': 128 | if in_pub and elem.tag == 'title': 129 | title = elem.text 130 | elif in_pub and (elem.tag == 'booktitle' or elem.tag == 'journal'): 131 | venue = elem.text 132 | elif in_pub and elem.tag == 'number': 133 | number = elem.text 134 | elif in_pub and elem.tag == 'pages': 135 | pages = elem.text 136 | elif in_pub and elem.tag == 'year': 137 | year = int(elem.text) 138 | # author is needed both for affiliations and pubs 139 | elif (in_pub or in_www) and elem.tag == 'author': 140 | authors.append(elem.text) 141 | elif in_www and elem.tag=='url': 142 | if author_homepage == '': 143 | author_homepage = elem.text 144 | elif in_www and elem.tag=='note' and elem.get('type') == 'affiliation': 145 | # note: we only record the first affiliation of an author in the list 146 | if author_affiliation == '' and elem.text != None: 147 | author_affiliation = elem.text 148 | elif elem.tag == 'inproceedings' or elem.tag == 'article': 149 | for area in CONFERENCES: 150 | if venue in CONFERENCES[area] or (venue in CONFERENCES_NUMBER[area] and number in CONFERENCES_NUMBER[area][venue]): 151 | if get_nr_pages(pages, title, venue, year) >= MIN_PAPER_PAGES: 152 | selected_pub += 1 153 | pubs[area].append(Pub(venue, title, authors, year)) 154 | for author in authors: 155 | if not author in all_authors: 156 | all_authors.add(author) 157 | elif venue.find(' (') != -1 and venue[0:venue.find(' (')] in CONFERENCES[area]: 158 | unhandled_venues.add(venue) 159 | total_pub += 1 160 | authors = [] 161 | number = '' 162 | title = '' 163 | pages = '' 164 | year = 0 165 | venue = '' 166 | in_pub = False 167 | elif elem.tag == 'www': 168 | # Process an author affiliation (if available) 169 | if len(authors) >= 1: 170 | # record affiliation 171 | if author_affiliation.find(',') != -1: 172 | author_affiliation = author_affiliation[0:author_affiliation.find(',')].strip() 173 | affiliations[authors[0]] = (author_affiliation, author_homepage, '') # affil, homepage, google scholar 174 | total_affiliations += 1 175 | # does this author have aliases? 
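                    # DBLP person (www) records list the primary name as the first <author>
                    # element and spelling variants afterwards; mapping every extra name to
                    # authors[0] builds the alias table that remove_aliases() later uses to
                    # rewrite publication author lists onto the canonical name.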
176 | if len(authors) > 1: 177 | for i in range(1, len(authors)): 178 | aliases[authors[i]] = authors[0] 179 | # clean for next iteration 180 | author_affiliation = '' 181 | author_homepage = '' 182 | authors = [] 183 | in_www = False 184 | elem.clear() 185 | 186 | # prune authors that have not published at our conferences of interest 187 | kill_list = [] 188 | for author in affiliations: 189 | if author not in all_authors: 190 | kill_list.append(author) 191 | for author in kill_list: 192 | del affiliations[author] 193 | for venue in unhandled_venues: 194 | print("Unhandled partial match for venue: {}".format(venue)) 195 | 196 | return (pubs, affiliations, aliases, total_pub, selected_pub, total_affiliations) 197 | 198 | def remove_aliases(confs, aliases): 199 | # parse aliases from CSrankins 200 | #aliases = {} 201 | #with open('dblp-aliases.csv', 'r') as f: 202 | # csvaliases = csv.reader(f) 203 | # for row in csvaliases: 204 | # if row[0] == 'alias': 205 | # continue 206 | # aliases[row[0]] = row[1] 207 | # update all publications with aliased authors 208 | aliases_replaced = 0 209 | for area in confs: 210 | for pub in confs[area]: 211 | for i in range(len(pub.authors)): 212 | if pub.authors[i] in aliases: 213 | pub.authors[i] = aliases[pub.authors[i]] 214 | aliases_replaced += 1 215 | print('Replaced {} aliases'.format(aliases_replaced)) 216 | 217 | if __name__ == '__main__': 218 | # Parse security conferences 219 | pubs, affiliations, aliases, total_pub, selected_pub, total_affiliations = parse_dblp() 220 | print('Selected a grand total of {} out of {} publications'.format(selected_pub, total_pub)) 221 | print('Selected a grand total of {} out of {} authors (with affiliations)'.format(len(affiliations), total_affiliations)) 222 | 223 | # Remove aliases 224 | remove_aliases(pubs, aliases) 225 | 226 | # Dump publications into pickle file 227 | for area in pubs: 228 | with open('pickle/pubs-{}.pickle'.format(area), 'wb') as f: 229 | pickle.dump(pubs[area], f) 230 | f.close() 231 | # Dump affiliations into pickle file 232 | with open('pickle/affiliations.pickle', 'wb') as f: 233 | pickle.dump(affiliations, f) 234 | f.close() 235 | -------------------------------------------------------------------------------- /pubs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | # Security conference abbreviations 4 | # booktitle: AsiaCCS 5 | # booktitle: CCS 6 | # booktitle: CODASPY 7 | # booktitle: WOOT 8 | # booktitle: USENIX Security Symposium 9 | # booktitle: NDSS 10 | # booktitle: EuroS&P 11 | # booktitle: USENIX Annual Technical Conference 12 | # booktitle: RTSS 13 | # booktitle: DIMVA 14 | # booktitle: OSDI 15 | # booktitle: ESORICS 16 | # booktitle: IEEE Symposium on Security and Privacy 17 | 18 | # check out https://github.com/emeryberger/CSrankings/blob/gh-pages/filter.xq for conference names 19 | 20 | CONFERENCES = { 21 | 'sys_arch': ['ASPLOS', 'ASPLOS (1)', 'ASPLOS (2)', 'ASPLOS (3)', 'ISCA', 'MICRO', 'MICRO (1)', 'MICRO (2)', 'HPCA'], 22 | 'sys_net': ['SIGCOMM', 'NSDI'], 23 | 'sys_sec': ['CCS', 'ACM Conference on Computer and Communications Security', 'USENIX Security', 'USENIX Security Symposium', 'NDSS', 'IEEE Symposium on Security and Privacy', 'SP', 'S&P'], # note: SP added temporarily because Oakland'23 booktitle 24 | #'sys_db': ['SIGMOD Conference', 'VLDB', 'PVLDB', 'Proc. VLDB Endow.', 'ICDE', 'PODS'], 25 | 'sys_db': ['SIGMOD Conference', 'VLDB', 'PVLDB', 'Proc. 
VLDB Endow.'], 26 | 'sys_design': ['DAC', 'ICCAD'], 27 | 'sys_embed': ['EMSOFT', 'RTAS', 'RTSS'], 28 | 'sys_hpc': ['HPDC', 'ICS', 'SC'], 29 | 'sys_mob': ['MobiSys', 'MobiCom', 'MOBICOM', 'SenSys'], 30 | 'sys_mes': ['IMC', 'Internet Measurement Conference', 'Proc. ACM Meas. Anal. Comput. Syst.'], 31 | 'sys_os': ['SOSP', 'OSDI', 'EuroSys', 'USENIX Annual Technical Conference', 'USENIX Annual Technical Conference, General Track', 'USENIX ATC', 'USENIX ATC, General Track', 'FAST'], 32 | 'sys_pl': ['PLDI', 'POPL', 'ICFP', 'OOPSLA', 'OOPSLA/ECOOP'], 33 | #'sys_se': ['SIGSOFT FSE', 'ESEC/SIGSOFT FSE', 'ICSE', 'ICSE (1)', 'ICSE (2)', 'ASE', 'ISSTA'], 34 | 'sys_se': ['SIGSOFT FSE', 'ESEC/SIGSOFT FSE', 'ICSE', 'ICSE (1)', 'ICSE (2)'], 35 | } 36 | 37 | CONFERENCES_NUMBER = { 38 | 'sys_arch': {}, 39 | 'sys_net': {}, 40 | 'sys_sec': {}, 41 | 'sys_db': {}, 42 | 'sys_design': {}, 43 | 'sys_embed': {}, 44 | 'sys_hpc': {}, 45 | 'sys_mob': {}, 46 | 'sys_mes': {}, 47 | 'sys_os': {}, 48 | 'sys_pl': {'Proc. ACM Program. Lang.' : ['POPL', 'OOPSLA', 'OOPSLA1', 'OOPSLA2', 'ICFP']}, 49 | 'sys_se': {} 50 | } 51 | 52 | CONFERENCES_SHORT = { 53 | 'sys_arch': ['ASPLOS', 'ISCA', 'MICRO', 'HPCA'], 54 | 'sys_net': ['SIGCOMM', 'NSDI'], 55 | 'sys_sec': ['ACM CCS', 'USENIX Security', 'NDSS', 'IEEE SSP (Oakland)'], 56 | #'sys_db': ['SIGMOD', 'VLDB', 'ICDE', 'PODS'], 57 | 'sys_db': ['SIGMOD', 'VLDB'], 58 | 'sys_design': ['DAC', 'ICCAD'], 59 | 'sys_embed': ['EMSOFT', 'RTAS', 'RTSS'], 60 | 'sys_hpc': ['HPDC', 'ICS', 'SC'], 61 | 'sys_mob': ['MobiSys', 'MobiCom', 'SenSys'], 62 | 'sys_mes': ['IMC', 'SIGMETRICS'], 63 | 'sys_os': ['SOSP', 'OSDI', 'EuroSys', 'USENIX ATC', 'FAST'], 64 | 'sys_pl': ['PLDI', 'POPL', 'ICFP', 'OOPSLA'], 65 | #'sys_se': ['FSE', 'ICSE', 'ASE', 'ISSTA'], 66 | 'sys_se': ['FSE', 'ICSE'], 67 | } 68 | 69 | AREA_TITLES = { 70 | 'sys_arch': 'Systems: Architecture', 71 | 'sys_net': 'Systems: Networks', 72 | 'sys_sec': 'Systems: Security', 73 | 'sys_db': 'Systems: Databases', 74 | 'sys_design': 'Systems: Design', 75 | 'sys_embed': 'Embedded Systems', 76 | 'sys_hpc': 'Systems: HPC', 77 | 'sys_mob': 'Mobile Systems', 78 | 'sys_mes': 'Systems: Measurements', 79 | 'sys_os': 'Systems: OS', 80 | 'sys_pl': 'Systems: Programming Languages', 81 | 'sys_se': 'Systems: Software Engineering', 82 | 'sys': 'All Areas' 83 | } 84 | 85 | class Pub(): 86 | def __init__(self, venue, title, authors, year): 87 | self.venue = venue 88 | self.title = title 89 | self.authors = authors 90 | self.year = year 91 | #print('{} {} {} {}\n'.format(authors, year, venue, title)) 92 | 93 | class Author(): 94 | def __init__(self, name, aux_data): 95 | self.name = name 96 | self.years = {} 97 | self.nr_authors_year = {} 98 | self.venues = [] 99 | self.normalized_pubs = {} 100 | self.pubs = {} 101 | self.affiliation, self.homepage, self.scholar = aux_data 102 | 103 | def add_norm_area(self, year, fraction): 104 | if not year in self.normalized_pubs: 105 | self.normalized_pubs[year] = 0 106 | self.normalized_pubs[year] += fraction 107 | 108 | def add_publication(self, venue, year, title, authors): 109 | if not year in self.years: 110 | self.years[year] = 0 111 | self.nr_authors_year[year] = [] 112 | self.pubs[year] = [] 113 | self.years[year] += 1 114 | self.nr_authors_year[year].append(len(authors)) 115 | self.pubs[year].append(Pub(venue, title, authors, year)) 116 | 117 | if not venue in self.venues: 118 | self.venues.append(venue) 119 | 120 | def get_total(self): 121 | return sum(self.years.values()) 122 | 123 | if __name__ == '__main__': 124 | 
print('Nothing to see here, move along...') 125 | -------------------------------------------------------------------------------- /templates/cliques.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | XXXTITLEXXX 10 | 11 | 12 | 13 | 14 | 15 | 52 | 53 | 54 |
XXXTITLEXXX
  • This list contains author cliques with at least XXXSHAREDXXX shared publications.
  • For the combined set, authors co-authored at least XXXSHARED2XXX publications.
  • Data was processed on XXXDATEXXX.
XXXCONTENTXXX
67 | -------------------------------------------------------------------------------- /templates/top-authors.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | XXXTITLEXXX 10 | 11 | 12 | 13 | 14 | 15 | 52 | 53 | 54 |
XXXTITLEXXX
  • This list contains only authors with at least 3 publications.
  • The top conferences are: XXXTOPCONSXXX
  • (A) shows the median number of authors across all publications.
  • (A5) shows the median number of authors over the last 5 years.
  • (Rel) shows the overall relative number of papers an author wrote, e.g., if you wrote 2 papers with 3 authors each you would get 2 times 1/3, i.e., 2/3.
  • (Rel5) shows the relative number of papers an author wrote over the last 5 years.
  • Click on a column name to sort.
  • Back to the main pubstat page.
  • Data was processed on XXXDATEXXX.
XXXCONTENTXXX
73 | -------------------------------------------------------------------------------- /templates/top-index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Top Authors, the Systems Circus 10 | 11 | 12 | 13 | 14 | 55 | 56 | 57 |
An overview of the Systems Circus
  • This page highlights the system circus, including the system security circus.
  • The focus of these statistics is on authors and top-tier publications.
  • All code is available on the HexHive GitHub.
  • Do you have ideas, comments, or improvements? Reach out to Mathias Payer.
  • Data was processed on XXXDATEXXX.

The key motivation for this page is to provide an overview of the systems community, its publications, its authors, and its cliques.

The data is sourced from DBLP (for publication data) and CSRankings (for affiliation data). The scripts process the data and create the necessary files for the web page, allowing a fast turnaround whenever new data is added to DBLP.

Area statistics
  • Per-area statistics for the major systems conferences.
  • We single out total papers, the maximum and average number of papers per author (max/a and avg/a), the average for the top 50 authors (avg/a50), and the number of active authors (#a).
  • The total number of publications increases steadily for most areas, with security and mobile having the largest increases.
  • The maximum number of publications of a single author in a year is slowly increasing, somewhat in line with the increase in the total number of authors per paper.
  • For a top author, the number of papers across all areas of systems has increased from around 4-5 to 7-8 papers per year.
  • The total number of active authors in a year is steadily increasing, with security and design taking the lead.
XXXAREAIMGSXXX
XXXAREASTATSXXX

Individual per-author statistics for different areas
XXXCONTENTXXX

Author/collaboration cliques for different areas
96 | 97 | -------------------------------------------------------------------------------- /top_authors.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | import pickle 4 | from statistics import median, mean 5 | from datetime import date 6 | import csv 7 | import matplotlib.pyplot as plt 8 | 9 | from pubs import Pub, Author, CONFERENCES, CONFERENCES_SHORT, AREA_TITLES 10 | 11 | # TODO normalize data on per-area basis 12 | # Tried for different sub areas but the areas are similar enough so that 13 | # normalization does not change much (and normalization opens up questions 14 | # about interpretation). 15 | 16 | # break publications (on venue basis) into per-author statistics 17 | def parse_authors(pubs): 18 | authors = {} 19 | # Load aux data from cs rankings first 20 | aux_data = {} 21 | max_year = 0 22 | total_pubs = {} 23 | with open('csrankings.csv', 'r') as f: 24 | csvaliases = csv.reader(f) 25 | for row in csvaliases: 26 | if row[0] == 'alias': 27 | continue 28 | if row[0].find('[') != -1: 29 | name = row[0][0:row[0].find('[')-1] 30 | else: 31 | name = row[0] 32 | aux_data[name] = (row[1], row[2], row[3]) 33 | # Load aux data as parsed from DBLP (as fallback) 34 | with open('pickle/affiliations.pickle', 'rb') as f: 35 | aux_data2 = pickle.load(f) 36 | f.close() 37 | # parse pubs and split into authors 38 | for pub in pubs: 39 | # basic statistics 40 | if not pub.year in total_pubs: 41 | total_pubs[pub.year] = 0 42 | total_pubs[pub.year] += 1 43 | # break up into authors 44 | for name in pub.authors: 45 | if name not in authors: 46 | if name in aux_data: 47 | authors[name] = Author(name, aux_data[name]) 48 | elif name in aux_data2: 49 | authors[name] = Author(name, aux_data2[name]) 50 | else: 51 | authors[name] = Author(name, ('', '', '')) 52 | authors[name].add_publication(pub.venue, pub.year, pub.title, pub.authors) 53 | if pub.year > max_year: 54 | max_year = pub.year 55 | # now aggreate author data 56 | per_year_authors = {} 57 | for name in authors: 58 | for year in authors[name].years: 59 | if not year in per_year_authors: 60 | per_year_authors[year] = [] 61 | if not name in per_year_authors[year]: 62 | per_year_authors[year].append(name) 63 | per_author_pubs_years = {} 64 | for name in authors: 65 | for year in authors[name].years: 66 | if not year in per_author_pubs_years: 67 | per_author_pubs_years[year] = [] 68 | per_author_pubs_years[year].append(authors[name].years[year]) 69 | # aggregate top N values and return yearly medians 70 | top_values = {} 71 | for year in per_author_pubs_years: 72 | # year = (total, max, median, average) 73 | top100mean = round(mean(sorted(per_author_pubs_years[year], reverse=True)[0:50])*100)/100 74 | top_values[year] = (total_pubs[year], max(per_author_pubs_years[year]), round(mean(per_author_pubs_years[year])*100)/100, top100mean, len(per_year_authors[year])) 75 | 76 | return (authors, max_year, top_values) 77 | 78 | 79 | def top_authors(authors, cons='', title='Top Authors', tname='templates/top-authors.html', fname='www/top-authors.html', nr_years=20): 80 | ranked = {} 81 | current_year = 0 # max year we have data of 82 | 83 | # walk through all authors and sort by class/ranking 84 | for name in authors: 85 | total = authors[name].get_total() 86 | if total > 2: 87 | if total not in ranked: 88 | ranked[total] = [] 89 | ranked[total].append(authors[name]) 90 | if max(authors[name].years.keys()) > current_year: 91 | current_year = max(authors[name].years.keys()) 92 | 93 | 
author_entry = ''' 94 | {} 95 | {} 96 | {} 97 | {} 98 | {} 99 | {} 100 | {} 101 | {} 102 | {}''' 103 | author_head = ''' 104 | 105 | Rank 106 | Name 107 | Affiliation 108 | Total 109 | (A) 110 | (Rel)''' 111 | author_head = author_head + '' + str(current_year-2004) + '-' + str(current_year-2000) + '' 112 | author_head = author_head + '(A5)(Rel5)' 113 | 114 | for year in range(current_year, current_year-nr_years, -1): 115 | author_head = author_head + '' + str(year-2000) + '' 116 | author_entry += '{}' 117 | author_head += '<='+str(current_year-2000-nr_years)+'' 118 | author_entry += '{}' 119 | 120 | author_head += '' 121 | author_entry += '' 122 | 123 | content = author_head 124 | rank = 1 125 | prev_rank = 1 126 | alt_class = '' 127 | for number in sorted(ranked.keys(), reverse = True): 128 | for author in ranked[number]: 129 | if prev_rank != rank: 130 | prev_rank = rank 131 | if alt_class == '': 132 | alt_class = ' class="alt"' 133 | else: 134 | alt_class = '' 135 | values = [alt_class, rank, author.name, author.affiliation, number] 136 | 137 | # Calculate median 138 | median_data = [] 139 | median_data5 = [] 140 | rel = 0.0 141 | rel5 = 0.0 142 | for year in author.nr_authors_year: 143 | median_data = median_data + author.nr_authors_year[year] 144 | for nr in author.nr_authors_year[year]: 145 | rel += 1/nr 146 | if year > current_year-5: 147 | rel5 += 1/nr 148 | if year > current_year-5: 149 | median_data5 = median_data5 + author.nr_authors_year[year] 150 | med = median(median_data) 151 | 152 | values.append(round(med)) 153 | values.append('{:.2f}'.format(rel)) 154 | 155 | # summary of last 5 years 156 | recent = 0 157 | for year in author.years.keys(): 158 | if year > current_year-5: 159 | recent += author.years[year] 160 | if recent == 0: 161 | values.append('') 162 | else: 163 | values.append(recent) 164 | 165 | if len(median_data5) != 0: 166 | med5 = round(median(median_data5)) 167 | else: 168 | med5 = '' 169 | values.append(med5) 170 | 171 | if recent == 0: 172 | values.append('') 173 | else: 174 | values.append('{:.2f}'.format(rel5)) 175 | 176 | # last 20 years individually 177 | for year in range(current_year, current_year-nr_years, -1): 178 | if year not in author.years: 179 | values.append('') 180 | else: 181 | values.append(author.years[year]) 182 | 183 | # add ancient years 184 | ancient = 0 185 | for year in author.years.keys(): 186 | if year <= current_year-nr_years: 187 | ancient += author.years[year] 188 | if ancient == 0: 189 | values.append('') 190 | else: 191 | values.append(ancient) 192 | content += author_entry.format(*values) 193 | rank += len(ranked[number]) 194 | 195 | template = open(tname, 'r').read() 196 | template = template.replace('XXXTITLEXXX', title) 197 | template = template.replace('XXXCONTENTXXX', content) 198 | template = template.replace('XXXDATEXXX', date.today().strftime("%Y-%m-%d")) 199 | template = template.replace('XXXTOPCONSXXX', cons) 200 | fout = open(fname, 'w') 201 | fout.write(template) 202 | 203 | def stat_table(top_values, max_year, nr_years=20): 204 | table_head = 'AreaTotal' 205 | table_entry = '{}{}' 206 | for year in range(max_year, max_year-nr_years, -1): 207 | table_head += ''+str(year-2000)+'' 208 | table_entry += '{}' 209 | table_head += '<'+str(max_year-2000-nr_years)+'' 210 | table_head += '' 211 | table_entry += '{}' 212 | 213 | content = table_head 214 | 215 | areas = list(CONFERENCES.keys()) 216 | areas.append('sys') 217 | 218 | fig_tot = {} 219 | fig_max = {} 220 | fig_avg50 = {} 221 | fig_auth = {} 222 | 223 | for area in 
areas: 224 | ancient_total = 0 225 | fresh_total = 0 226 | for year in top_values[area]: 227 | if year < max_year-nr_years: 228 | ancient_total += top_values[area][year][0] 229 | else: 230 | fresh_total += top_values[area][year][0] 231 | row_tot = ['', AREA_TITLES[area], fresh_total+ancient_total] 232 | row_max = [' class="light"', '', 'max/a'] 233 | row_avg = [' class="light"', '', 'avg/a'] 234 | row_avg50a = [' class="light"', '', 'avg/a50'] 235 | row_auth = [' class="light"', '', '#a'] 236 | for year in range(max_year, max_year-nr_years, -1): 237 | if not year in top_values[area]: 238 | top_values[area][year] = ('', '', '', '', '') 239 | row_tot.append(top_values[area][year][0]) 240 | row_max.append(top_values[area][year][1]) 241 | row_avg.append(top_values[area][year][2]) 242 | row_avg50a.append(top_values[area][year][3]) 243 | row_auth.append(top_values[area][year][4]) 244 | row_tot.append(ancient_total) 245 | row_max.append('') 246 | row_avg.append('') 247 | row_avg50a.append('') 248 | row_auth.append('') 249 | content += table_entry.format(*row_tot) 250 | content += table_entry.format(*row_max) 251 | content += table_entry.format(*row_avg) 252 | content += table_entry.format(*row_avg50a) 253 | content += table_entry.format(*row_auth) 254 | for i in range(len(row_tot)): 255 | if row_tot[i] == '': 256 | row_tot[i] = 0 257 | if row_max[i] == '': 258 | row_max[i] = 0 259 | if row_avg50a[i] == '': 260 | row_avg50a[i] = 0 261 | if row_auth[i] == '': 262 | row_auth[i] = 0 263 | fig_tot[area] = row_tot[3:-1] 264 | fig_max[area] = row_max[3:-1] 265 | fig_avg50[area] = row_avg50a[3:-1] 266 | fig_auth[area] = row_auth[3:-1] 267 | stat_figure(fig_tot, 'Total number of publications per year', max_year, nr_years, fname='stat-tot.png') 268 | stat_figure(fig_max, 'Maximum number of publications of an author per year', max_year, nr_years, average=False, fname='stat-max.png') 269 | stat_figure(fig_avg50, 'Average number of publications per year for the top 50 authors', max_year, nr_years, average=False, fname='stat-avg50.png') 270 | stat_figure(fig_auth, 'Average number of active authors per year', max_year, nr_years, fname='stat-auth.png') 271 | 272 | stat_table = '''


273 | 274 | 275 |
276 | ''' 277 | return (content, stat_table) 278 | 279 | def stat_figure(fig_data, title, max_year, nr_years, average=True, fname=''): 280 | xaxis = [] 281 | for year in range(max_year, max_year-nr_years, -1): 282 | xaxis.append(year) 283 | plt.figure(figsize=(12, 5)) 284 | plt.title(title) 285 | plt.xticks(xaxis, xaxis) 286 | plt.xlabel('Year') 287 | for area in fig_data: 288 | lbl = area 289 | lwdt = 1.5 290 | if area == 'sys': 291 | if average: 292 | for i in range(len(fig_data[area])): 293 | fig_data[area][i] = fig_data[area][i]/(len(fig_data)-1) 294 | lbl = 'avg(sys)' 295 | else: 296 | lbl = 'all(sys)' 297 | lwdt = 4 298 | plt.plot(xaxis, fig_data[area], label=lbl, linewidth=lwdt) 299 | plt.legend() 300 | if fname == '': 301 | plt.show() 302 | else: 303 | plt.savefig('www/'+fname, bbox_inches="tight") 304 | plt.clf() 305 | 306 | 307 | if __name__ == '__main__': 308 | all_pubs = [] 309 | top_values = {} 310 | for area in CONFERENCES: 311 | # Load pickeled data 312 | with open('pickle/pubs-{}.pickle'.format(area), 'rb') as f: 313 | pubs = pickle.load(f) 314 | f.close() 315 | all_pubs += pubs 316 | 317 | # Prepare per-author information 318 | authors, _, top_values[area] = parse_authors(pubs) 319 | print('Analyzed a total of {} authors for {}'.format(len(authors), area)) 320 | 321 | # Pretty print HTML 322 | top_authors(authors, cons = ', '.join(CONFERENCES_SHORT[area]), title = AREA_TITLES[area], fname = 'www/top-authors-{}.html'.format(area)) 323 | 324 | # Prepare per-author information 325 | authors, max_year, top_values['sys'] = parse_authors(all_pubs) 326 | print('Analyzed a total of {} authors'.format(len(authors))) 327 | 328 | # Pretty print HTML 329 | allcons = [] 330 | for area in CONFERENCES: 331 | allcons = allcons + CONFERENCES_SHORT[area] 332 | 333 | # No researchers from Geneva, Basel, St. Gallen, or Fribourg 334 | affils = ['ETH Zurich', 'ETH Zürich', 'EPFL', 'Swiss Federal Institute of Technology in Lausanne', 'École Polytechnique Fédérale de Lausanne', 'Università della Svizzera italiana', 'University of Zurich', 'University of Bern'] 335 | filtered_authors = {} 336 | for author in authors: 337 | if authors[author].affiliation in affils: 338 | filtered_authors[author] = authors[author] 339 | 340 | top_authors(authors, cons = ', '.join(allcons), title = 'Systems (All Top Conferences)', fname = 'www/top-authors-sys.html') 341 | top_authors(filtered_authors, cons = ', '.join(allcons), title = 'Systems (All Top Conferences, CH)', fname = 'www/top-authors-sys-ch.html') 342 | 343 | content = '' 344 | for area in AREA_TITLES: 345 | content = content + '
  • ' + AREA_TITLES[area] + '
  • \n' 346 | 347 | template = open('templates/top-index.html', 'r').read() 348 | template = template.replace('XXXCONTENTXXX', content) 349 | stat_table, stat_img = stat_table(top_values, max_year) 350 | template = template.replace('XXXAREASTATSXXX', stat_table) 351 | template = template.replace('XXXAREAIMGSXXX', stat_img) 352 | template = template.replace('XXXDATEXXX', date.today().strftime("%Y-%m-%d")) 353 | fout = open('www/index.html', 'w') 354 | fout.write(template) --------------------------------------------------------------------------------