├── extracted └── .keep ├── .gitignore ├── import.sh ├── utils.py ├── README.md └── to_csv.py /extracted/.keep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | extracted/*/ 2 | *.pyc 3 | csvs/ 4 | stackexchange/ 5 | -------------------------------------------------------------------------------- /import.sh: -------------------------------------------------------------------------------- 1 | rm -rf ../neo/data/graph.db 2 | ../neo/bin/neo4j-import \ 3 | --into ../neo/data/graph.db \ 4 | --id-type string \ 5 | --nodes:Post csvs/posts.csv \ 6 | --nodes:User csvs/users.csv \ 7 | --nodes:Tag csvs/tags.csv \ 8 | --relationships:PARENT_OF csvs/posts_rel.csv \ 9 | --relationships:HAS_TAG csvs/tags_posts_rel.csv \ 10 | --relationships:POSTED csvs/users_posts_rel.csv 11 | ../neo/bin/neo4j restart -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | REMOVE_ALL = """ 2 | MATCH (n) 3 | OPTIONAL MATCH (n)-[r]-() 4 | DELETE n,r 5 | """ 6 | 7 | class Unbuffered(object): 8 | def __init__(self, stream): 9 | self.stream = stream 10 | def write(self, data): 11 | self.stream.write(data) 12 | self.stream.flush() 13 | def __getattr__(self, attr): 14 | return getattr(self.stream, attr) 15 | 16 | import sys 17 | sys.stdout = Unbuffered(sys.stdout) 18 | 19 | def replace_keys(row): 20 | new = {} 21 | for key,val in row.items(): 22 | new[key.lower().replace('@','')] = val 23 | return new -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Import stackexchange in neo4j 2 | 3 | steps: 4 | 5 | - Download the dump from archive.org: https://archive.org/details/stackexchange 6 | - extract the community you want in `extracted//` with `Posts.xml` & co. in the dir 7 | - you can `dtrx` on linux 8 | - you need to `sudo pip3 install xmltodict` 9 | - `python3 to_csv.py extracted/` to get the csvs in `csvs/` 10 | - `sh import.sh` to import the csvs in neo4j 11 | - assuming that neo4j is in the `../neo/` directory 12 | - **the script assume that you want to remove you old database** (at the end) 13 | 14 | Look at the scripts before using them to understand what they do :) 15 | 16 | *Have fun!* 17 | -------------------------------------------------------------------------------- /to_csv.py: -------------------------------------------------------------------------------- 1 | import json, sys, os, xmltodict, csv 2 | from os.path import join 3 | from utils import * 4 | import shutil 5 | 6 | PATH = sys.argv[1] 7 | DIR = PATH.replace('extracted/','') 8 | 9 | print("importing",DIR) 10 | 11 | file = join(PATH,'Posts.xml') 12 | 13 | def clean(x): 14 | #neo4j-import doesn't support: multiline (coming soon), quotes next to each other and escape quotes with '\""' 15 | return x.replace('\n','').replace('\r','').replace('\\','').replace('"','') 16 | 17 | def open_csv(name): 18 | return csv.writer(open('csvs/{}.csv'.format(name), 'w'), doublequote=False, escapechar='\\') 19 | 20 | try: 21 | shutil.rmtree('csvs/') 22 | except: 23 | pass 24 | os.mkdir('csvs') 25 | 26 | posts = open_csv('posts') 27 | posts_rel = open_csv('posts_rel') 28 | users = open_csv('users') 29 | users_posts_rel = open_csv('users_posts_rel') 30 | tags = open_csv('tags') 31 | tags_posts_rel = open_csv('tags_posts_rel') 32 | 33 | posts.writerow(['postId:ID(Post)', 'title', 'body','score','views','comments']) 34 | posts_rel.writerow([':START_ID(Post)', ':END_ID(Post)']) 35 | 36 | users_things = ['displayname', 'reputation', 'aboutme', \ 37 | 'websiteurl', 'location', 'profileimageurl', 'views', 'upvotes', 'downvotes'] 38 | users.writerow(['userId:ID(User)'] + users_things) 39 | users_posts_rel.writerow([':START_ID(User)', ':END_ID(Post)']) 40 | 41 | tags.writerow(['tagId:ID(Tag)']) 42 | tags_posts_rel.writerow([':START_ID(Post)', ':END_ID(Tag)']) 43 | 44 | for i, line in enumerate(open(file)): 45 | line = line.strip() 46 | try: 47 | if line.startswith("')] 64 | for tag in [x for x in eltags if x]: 65 | tags_posts_rel.writerow([el['id'],tag]) 66 | except Exception as e: 67 | print('x',e) 68 | if i and i % 5000 == 0: 69 | print('.',end='') 70 | if i and i % 1000000 == 0: 71 | print(i) 72 | 73 | print(i,'posts ok') 74 | 75 | file = join(PATH,'Users.xml') 76 | 77 | for i, line in enumerate(open(file)): 78 | line = line.strip() 79 | try: 80 | if line.startswith("