├── .gitignore
├── Makefile
├── README.md
├── author_cliques.py
├── check_author.py
├── docs
│   └── index.html
├── parse_dblp.py
├── pubs.py
├── templates
│   ├── cliques.html
│   ├── top-authors.html
│   └── top-index.html
└── top_authors.py

/.gitignore:
--------------------------------------------------------------------------------
# Downloaded files
dblp.xml.gz
dblp.dtd
dblp-aliases.csv
csrankings.csv

# Created files
pickle/*
www/*
__pycache__/*

--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
.PHONY: process topauthors cliques fresh all deploy

all: fresh process
	# update data and process the new data

process: topauthors cliques

topauthors:
	python3 top_authors.py

cliques:
	python3 author_cliques.py

fresh:
	# freshen raw data files (checking timestamps)
	wget -N https://dblp.uni-trier.de/xml/dblp.xml.gz
	wget -N https://dblp.uni-trier.de/xml/dblp.dtd
	#wget -N https://raw.githubusercontent.com/emeryberger/CSrankings/gh-pages/dblp-aliases.csv
	wget -N https://raw.githubusercontent.com/emeryberger/CSrankings/gh-pages/csrankings.csv
	mkdir -p pickle
	mkdir -p www
	# get the pickling started
	python3 parse_dblp.py

deploy:
	for i in www/*.html; do \
		echo $$i ; \
		gzip -f -9 -k $$i ; \
	done
	unison -prefer=newer -batch www/ ssh://ghul.albtraum.org/pubstats/

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Publication statistics

This repository computes simple publication statistics for a set of conferences.

Using the DBLP data set, we extract the top conferences and then aggregate them
on a per-author basis. Based on different subgroups (e.g., security, embedded
systems, or OS) we then calculate per-author statistics in a concise overview.

Processing happens in two stages, parsing and aggregation:

* `parse_dblp.py` extracts all publications and dumps them into per-area pickle
  files (this is slow, as DBLP is a 3GB XML file). To be able to process such a
  large XML file, we use a stream processor that simply dumps interesting
  publications into `Pub` objects (see `pubs.py`).
* `top_authors.py` leverages the pickle files to compute per-area statistics
  and aggregate statistics.
* `author_cliques.py` leverages the pickle files to calculate per-area author
  cliques.

A minimal sketch for loading the pickled data directly is shown further below.


## Using/Howto

* Easy mode: check out the [homepage](https://hexhive.epfl.ch/pubstats/)
* `make all` to download the DBLP data, pickle it, and create the HTML pages
* `make fresh` to update the DBLP data and pickle it
* `make topauthors` to create the top author pages
* `make cliques` to create the cliques


## Contributing

Ideas, comments, or improvements are welcome! Please reach out to
[Mathias Payer](mailto:mathias.payer@nebelwelt.net) to discuss. You can also
reach out to [@gannimo on Twitter](https://www.twitter.com/gannimo).
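
## Working with the pickled data

The per-area pickle files written by `parse_dblp.py` can also be explored
directly. The following is a minimal sketch, not a script shipped in this
repository; it assumes that `make fresh` has already populated `pickle/` and
that `pubs.py` is importable so the `Pub` objects can be unpickled.

```python
#!/usr/bin/python3
# Minimal sketch: count publications and distinct author names per area.
# Assumes pickle/pubs-<area>.pickle exists (created by `make fresh`).
import pickle

from pubs import Pub, CONFERENCES  # pubs must be importable for unpickling

for area in CONFERENCES:
    with open('pickle/pubs-{}.pickle'.format(area), 'rb') as f:
        pubs = pickle.load(f)
    authors = set()
    for pub in pubs:
        authors.update(pub.authors)
    print('{}: {} publications, {} distinct author names'.format(
        area, len(pubs), len(authors)))
```
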

## Changelog

* 2023-08-21 random bugfixes and conference updates
* 2023-02-06 adjusted SE/DB conferences based on feedback
* 2021-02-09 fixed the VLDB conference and added ICDE and PODS for the database
  community; added ASE and ISSTA for the software engineering community
* 2021-01-11 added HPCA for architecture and adjusted the paper length calculation for DAC
* 2021-01-09 removed tutorials and short papers (by parsing the pages data)
* 2021-01-05 figures for the overview page
* 2021-01-04 new overview table across areas
* 2021-01-02 added author cliques
* 2020-12-30 first version with author statistics


## Acknowledgements

This code and page were developed by [Mathias Payer](https://nebelwelt.net),
initially over the 2020 holiday break. The site includes feedback and
suggestions from too many people to list; thank you for that!

We use information from [DBLP](https://dblp.org/xml/) and
[CSRankings](https://raw.githubusercontent.com/emeryberger/CSrankings/gh-pages/dblp-aliases.csv)
to resolve author aliases. The idea for the statistics was inspired by
[Davide's Software Security Circus](http://s3.eurecom.fr/~balzarot/notes/top4_2019/).


## License

All data in this repository is licensed under
[CC BY-NC-ND 4.0](https://creativecommons.org/licenses/by-nc-nd/4.0/).

--------------------------------------------------------------------------------
/author_cliques.py:
--------------------------------------------------------------------------------
#!/usr/bin/python3

import pickle
import networkx as nx
import matplotlib.pyplot as plt
from math import ceil, sqrt
from datetime import date

from pubs import Pub, Author, CONFERENCES, AREA_TITLES

def parse_author_cliques(pubs):
    authors = {}
    for pub in pubs:
        # create all author connections
        # iterate through each paper and count author1 -> author2 edges
        # we store the collected data in a hashmap of author1 -> author2 -> count
        for first_name in range(len(pub.authors)):
            for second_name in range(first_name + 1, len(pub.authors)):
                first_idx = first_name
                second_idx = second_name
                # order each pair so the lexicographically smaller name is the outer key
                if pub.authors[first_name] > pub.authors[second_name]:
                    second_idx = first_name
                    first_idx = second_name
                if not pub.authors[first_idx] in authors:
                    authors[pub.authors[first_idx]] = {}
                if not pub.authors[second_idx] in authors[pub.authors[first_idx]]:
                    authors[pub.authors[first_idx]][pub.authors[second_idx]] = 0
                authors[pub.authors[first_idx]][pub.authors[second_idx]] += 1
    return authors

def parse_graph(authors, num_edges=10, fname=''):
    G = nx.Graph()
    #plt.figure(num=None, figsize=(20, 20), dpi=100)
    #plt.axis('off')
    #fig = plt.figure(1)

    author_set = []
    # collect all authors with at least num_edges edges:
    for author1 in authors:
        for author2 in authors[author1]:
            if authors[author1][author2] >= num_edges:
                if author1 not in author_set:
                    author_set.append(author1)
                    G.add_node(author1)
                if author2 not in author_set:
                    author_set.append(author2)
                    G.add_node(author2)
                G.add_edge(author1, author2, weight=authors[author1][author2])
    # break large graph into subgraphs (for disconnected parts)
    sub_graphs = list(G.subgraph(c) for c in nx.connected_components(G))
    # if, instead of subgraphs, we want one graph for all cliques, use the following code:
    ##pos=nx.spring_layout(G, k=5, scale=9, iterations=500)
    ##nx.draw_networkx_nodes(G, pos)
    ##nx.draw_networkx_labels(G, pos)
    ##nx.draw_networkx_edges(G, pos)
    nr_cliques = len(sub_graphs)
    print('Found {} cliques'.format(nr_cliques))
    # lay the cliques out on a roughly square grid of subplots
    x_cliques = ceil(sqrt(nr_cliques))
    y_cliques = ceil(nr_cliques / x_cliques)
    fig, axes = plt.subplots(nrows=x_cliques, ncols=y_cliques, figsize=(20, 20))
    #fig.suptitle(fname)
    if len(sub_graphs) == 1:
        ax = [axes]
    else:
        ax = axes.flatten()
    for i in range(len(sub_graphs)):
        # create a spring (force-directed) layout for each subgraph
        pos = nx.spring_layout(sub_graphs[i], k=0.1, scale=0.6, iterations=80)
        nx.draw_networkx_nodes(sub_graphs[i], pos, ax=ax[i])
        nx.draw_networkx_labels(sub_graphs[i], pos, ax=ax[i])
        nx.draw_networkx_edges(sub_graphs[i], pos, ax=ax[i])
        # adjust borders for the subgraph (because we have long labels that spill)
        xmin, xmax, ymin, ymax = ax[i].axis()
        ax[i].set(xlim=(xmin - 0.5, xmax + 0.5), ylim=(ymin, ymax))
        ax[i].set_axis_off()
    # disable axes for blank subplots (if we have remaining space)
    for i in range(len(sub_graphs), x_cliques * y_cliques):
        ax[i].set_axis_off()
    fig.tight_layout()

    # adjust borders for single graph case
    #l,r = plt.xlim()
    #plt.xlim(l-1, r+1)
    #t, b = plt.ylim()
    #plt.ylim(t-1, b+1)
    plt.savefig('www/' + fname, bbox_inches="tight")
    #plt.show()
    del fig

if __name__ == '__main__':
    all_pubs = []
    AREAPUBS = 10
    ALLPUBS = 20
    for area in CONFERENCES:
        # Load pickled data
        with open('pickle/pubs-{}.pickle'.format(area), 'rb') as f:
            pubs = pickle.load(f)
        f.close()
        all_pubs += pubs

        # Prepare per-author information for this area
        authors = parse_author_cliques(pubs)
        print('Analyzed a total of {} authors'.format(len(authors)))

        # Create and draw the per-area graph
        parse_graph(authors, num_edges=AREAPUBS, fname=area)

    # Prepare per-author information across all areas
    authors = parse_author_cliques(all_pubs)
    print('Analyzed a total of {} authors'.format(len(authors)))

    # Create and draw the combined graph
    parse_graph(authors, num_edges=ALLPUBS, fname='all')

    content = '
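
# --------------------------------------------------------------------------
# Illustrative sketch (appended for exposition; not part of author_cliques.py):
# a tiny sanity check of parse_author_cliques() on hand-built toy data.
# SimplePub is a hypothetical stand-in for pubs.Pub; only the .authors
# attribute is relied on by parse_author_cliques().
from collections import namedtuple

from author_cliques import parse_author_cliques

SimplePub = namedtuple('SimplePub', ['authors'])

toy_pubs = [
    SimplePub(authors=['Alice', 'Bob', 'Carol']),
    SimplePub(authors=['Alice', 'Bob']),
]

# Edge counts are keyed with the lexicographically smaller name first:
# {'Alice': {'Bob': 2, 'Carol': 1}, 'Bob': {'Carol': 1}}
print(parse_author_cliques(toy_pubs))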