├── .gitignore ├── Makefile ├── README.md ├── cli.py ├── images ├── douban_movie_1.png ├── douban_movie_2.png └── douban_movie_3.png ├── movie ├── Country.csv ├── Movie.csv ├── Person.csv ├── actor.csv ├── composer.csv ├── director.csv ├── district.csv └── metadata.json ├── requirements.in ├── requirements.txt └── setup.cfg /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | .pytest_cache/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # Mac automatic generation 8 | .idea 9 | .DS_Store 10 | 11 | # C extensions 12 | *.so 13 | 14 | # Distribution / packaging 15 | .Python 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
34 | *.manifest 35 | *.spec 36 | 37 | # Django stuff: 38 | *.log 39 | local_settings.py 40 | 41 | # Flask stuff: 42 | instance/ 43 | .webassets-cache 44 | 45 | # Scrapy stuff: 46 | .scrapy 47 | 48 | # Sphinx documentation 49 | docs/_build/ 50 | 51 | # PyBuilder 52 | target/ 53 | 54 | # IPython Notebook 55 | .ipynb_checkpoints 56 | 57 | # pyenv 58 | .python-version 59 | 60 | # dotenv 61 | .env 62 | 63 | # virtualenv 64 | env/ 65 | venv/ 66 | ENV/ 67 | 68 | # Spyder project settings 69 | .spyderproject 70 | 71 | 72 | # unittesting 73 | cobertura.xml 74 | testresult.xml 75 | .coverage 76 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | venv: 2 | - virtualenv --python=$(shell which python3) --prompt '' venv 3 | 4 | lock-requirements: 5 | - pip install pip-tools 6 | - pip-compile --output-file requirements.txt requirements.in 7 | 8 | deps: lock-requirements 9 | - pip install -U pip setuptools --quiet 10 | - pip install -r requirements.txt --quiet 11 | 12 | clean: 13 | - find . -iname "*__pycache__" | xargs rm -rf 14 | - find . 
-iname "*.pyc" | xargs rm -rf 15 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Knowledge Graph Examples 2 | ======================== 3 | 4 | ## Douban Movie Graph 5 | 6 | ### Neo4j 7 | 8 | Build graph 9 | 10 | ```shell 11 | python cli.py import-to-neo4j --url bolt://localhost:7687/ \ 12 | --auth "neo4j:myneo4j" \ 13 | --data-dir movie/ \ 14 | --batch-size 1000 \ 15 | --drop-all 16 | ``` 17 | 18 | 19 | - Example 1: 20 | 21 | ```cypher 22 | match (c:Country)-[]-(m:Movie)-[]-(p:Person) return * limit 100 23 | ``` 24 | 25 | Output: 26 | 27 | ![douban_movie_example1](./images/douban_movie_1.png) 28 | 29 | - Example 2: 30 | 31 | ```cypher 32 | MATCH (m:Movie)-[r:actor]->(p:Person) WHERE p.name="黄渤" RETURN * 33 | ``` 34 | 35 | Output 36 | 37 | ![douban_movie_example2](./images/douban_movie_2.png) 38 | 39 | - Example 3: 40 | 41 | ```cypher 42 | MATCH (a:Person), (b:Person), p=shortestpath((a)-[:actor*]-(b)) 43 | WHERE a.name="黄渤" and b.name="汤姆·克鲁斯" 44 | RETURN p 45 | ``` 46 | Output 47 | 48 | ![douban_movie_example3](./images/douban_movie_3.png) 49 | -------------------------------------------------------------------------------- /cli.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import sys 4 | import json 5 | import logging 6 | from glob import glob 7 | from logging.config import dictConfig 8 | 9 | import click 10 | import neo4j 11 | import requests 12 | 13 | 14 | dictConfig({ 15 | 'version': 1, 16 | 'formatters': { 17 | 'simple': { 18 | 'format': '%(asctime)s - %(filename)s:%(lineno)s: %(message)s', 19 | } 20 | }, 21 | 'handlers': { 22 | 'default': { 23 | 'level': 'INFO', 24 | 'class': 'logging.StreamHandler', 25 | 'formatter': 'simple', 26 | "stream": "ext://sys.stdout", 27 | }, 28 | }, 29 | 'loggers': { 30 | '': { 31 | 'handlers': ['default'], 32 | 'level': 'INFO', 
logger = logging.getLogger('cli')


def create_entity_index(neo4j_client, entity_type, property_name):
    """Create a Neo4j index on one property of a node label.

    Args:
        neo4j_client: Object whose ``session()`` returns a context manager
            yielding something with a ``run(query)`` method (the caller in
            this file passes a ``neo4j`` driver).
        entity_type: Node label to index, interpolated into the Cypher text.
        property_name: Property of that label to index.
    """
    with neo4j_client.session() as session:
        session.run(f"CREATE INDEX ON :{entity_type}({property_name})")

    # The format string names the property first and the entity type second,
    # so the arguments must be passed in that order.
    logger.info(
        "created index on property '%s' of entity type `%s`",
        property_name, entity_type
    )


@click.group(context_settings=dict(help_option_names=['-h', '--help']))
def main():
    """Command-line tools for the knowledge graph examples."""


def _convert_csv_row(csv_row):
    """Map ``name:TYPE`` style CSV headers to bare property names.

    Columns whose name part is empty (e.g. ``:LABEL``) are dropped.
    """
    row = {}
    for header, value in csv_row.items():
        key = header.split(':', 1)[0]
        if key:
            row[key] = value

    return row


def _iter_batches(items, batch_size):
    """Yield lists of up to ``batch_size`` items; the final list may be shorter."""
    batch = []
    for item in items:
        batch.append(item)
        if len(batch) == batch_size:
            yield batch
            batch = []

    if batch:
        yield batch


def _write_batches(client, query, rows, batch_size, noun):
    """Run ``query`` once per batch of ``rows``, each batch in its own session."""
    for batch in _iter_batches(rows, batch_size):
        with client.session() as session:
            session.run(query, {'values': batch})

        logger.info("wrote %d %s in Neo4j server", len(batch), noun)


def _import_entities(client, data_dir, entity_type, entity_file, batch_size):
    """Index the ``id`` property of ``entity_type`` and create one node per CSV row."""
    create_entity_index(client, entity_type, "id")
    # newline='' is what the csv module asks for; the encoding is explicit
    # because the data is not ASCII and the platform default may not be UTF-8.
    with open(os.path.join(data_dir, entity_file), encoding='utf-8', newline='') as f:
        reader = csv.DictReader(f)
        if not reader.fieldnames:
            logger.warning("entity file '%s' is empty, skipped", entity_file)
            return

        # Every row shares the header, so the property list (and therefore
        # the query text) is derived once from the header.
        props = _convert_csv_row(dict.fromkeys(reader.fieldnames))
        query = 'UNWIND $values as data create (:%s {%s})' % (
            entity_type,
            ','.join(f'{prop}:data.{prop}' for prop in props),
        )
        rows = (_convert_csv_row(row) for row in reader)
        _write_batches(client, query, rows, batch_size, "entities")


def _import_relations(client, data_dir, relation_type, relation_file, batch_size):
    """Create one relationship per CSV row for a ``Start|relation|End`` metadata key."""
    start_type, relation, end_type = relation_type.split('|')
    query = (
        'UNWIND $values as data '
        'MATCH (a:%s {id:data.start_id}) '
        'MATCH (b:%s {id:data.end_id}) '
        'CREATE (a)-[:`%s`]->(b)'
    ) % (start_type, end_type, relation)
    with open(os.path.join(data_dir, relation_file), encoding='utf-8', newline='') as f:
        rows = (
            {"start_id": row[":START_ID"], "end_id": row[":END_ID"]}
            for row in csv.DictReader(f)
        )
        _write_batches(client, query, rows, batch_size, "relations")


# NOTE: click renders the command docstring as its --help text, so parameter
# documentation lives in the options' ``help=`` strings instead.
@main.command("import-to-neo4j")
@click.option("--url", default="bolt://localhost:7687/", help="Bolt URL of the Neo4j server.")
@click.option("--auth", default="neo4j:myneo4j", help="Credentials as 'user:password'.")
@click.option("-d", "--data-dir", required=True,
              help="Directory containing metadata.json and the CSV files it lists.")
@click.option("-b", "--batch-size", type=int, default=1000,
              help="Number of rows sent to the server per query.")
@click.option("--dropall", "--drop-all", "dropall", is_flag=True,
              help="Delete every node and relationship before importing.")
def import_to_neo4j(url, auth, data_dir, batch_size, dropall):
    """Import the CSV files described by metadata.json into Neo4j."""
    # Split on the first colon only so that passwords may contain ':'.
    user, sep, password = auth.partition(':')
    if not sep:
        raise click.BadParameter("expected 'user:password'", param_hint="--auth")

    # Load the metadata before touching the database, so that a wrong
    # --data-dir cannot wipe the server and then abort.
    metadata_file = os.path.join(data_dir, "metadata.json")
    if not os.path.exists(metadata_file):
        logger.error("Cannot find 'metadata.json' in directory '%s'", data_dir)
        sys.exit(1)

    with open(metadata_file, encoding='utf-8') as f:
        metadata = json.load(f)

    client = neo4j.GraphDatabase.driver(url, auth=(user, password))
    try:
        if dropall:
            with client.session() as session:
                session.run('MATCH (n) DETACH DELETE n')

            logger.info("Dropped all data in Neo4j server")

        # Nodes go first: the relationship queries MATCH them by id.
        for entity_type, entity_file in metadata["entity-data"].items():
            _import_entities(client, data_dir, entity_type, entity_file, batch_size)

        for relation_type, relation_file in metadata.get("relation-data", {}).items():
            _import_relations(client, data_dir, relation_type, relation_file, batch_size)
    finally:
        client.close()


if __name__ == '__main__':
    main()
https://raw.githubusercontent.com/Linusp/kg-example/b3a851ed637e01c7b7023877b99c776843fc7073/images/douban_movie_1.png -------------------------------------------------------------------------------- /images/douban_movie_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Linusp/kg-example/b3a851ed637e01c7b7023877b99c776843fc7073/images/douban_movie_2.png -------------------------------------------------------------------------------- /images/douban_movie_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Linusp/kg-example/b3a851ed637e01c7b7023877b99c776843fc7073/images/douban_movie_3.png -------------------------------------------------------------------------------- /movie/Country.csv: -------------------------------------------------------------------------------- 1 | "name","id:ID",":LABEL" 2 | "China_中国大陆","7fc2b2562e9d98efe2d09a55a27f9f13","Country" 3 | "United States of America_美国","13ad3107c64998a9128531822b7976b7","Country" 4 | "Italy_意大利","1fad328df48d832e0382031f36066fcf","Country" 5 | "Iceland_冰岛","366f5bcb109610eda07e536792ac413a","Country" 6 | "Japan_日本","f4305630b4e7042ea22223d8753b3da9","Country" 7 | "France_法国","408aca7ceebe0dba7fb2d786b5bd9623","Country" 8 | "United Kingdom_英国","91555ed6368341bd97f97a2d57c82583","Country" 9 | "South Korea_韩国","d053bb9a1fda259db265f41f77e33d76","Country" 10 | "Mexico_墨西哥","7fd6a20b14fd1433023e483dcbe0720c","Country" 11 | "China_香港","595a5c8d5ec6a10417245c8fef169601","Country" 12 | "Australia_澳大利亚","4c7075111ad3ed1dbaa0058f37868120","Country" 13 | "Germany_德国","c4eeabd06fbf951284036ebe063722c5","Country" 14 | "Belgium_比利时","cd66146336fa6f25a68e971bf952a963","Country" 15 | "India_印度","c64316b019b7180c96d364b0a06f8557","Country" 16 | "China_台湾","3ad3cc769448c856df0c7f6a57a576a3","Country" 17 | "South Africa_南非","34f523d8b129f0946dcc5935b363f937","Country" 18 | 
"Canada_加拿大","353ed122a4f013e84a170f3c927c7e03","Country" 19 | "New Zealand_新西兰","88cd709b60c818400c5db8822c0d6b3f","Country" 20 | "Denmark_丹麦","634363fba0655a51616c851b18e64b53","Country" 21 | "Malaysia_马来西亚","4af33c50d90badf3c7d254106d08ad1b","Country" 22 | "Russia_俄罗斯","b86adccacadf0662368ffda5e17d2b1e","Country" 23 | "Spain_西班牙","f0b64d75d69b1f5cc313612b81b02833","Country" 24 | "Estonia_爱沙尼亚","214496a51831759d1dd3cc187db9ba0d","Country" 25 | "Poland_波兰","a540f2f518f238afdd92a1fd18f2e560","Country" 26 | "Ukraine_乌克兰","df3179423fb0582637118989a1cdfa87","Country" 27 | "Brazil_巴西","aaa9647b7d3422904a65c332bd0608db","Country" 28 | "Turkey_土耳其","ce114a06c3815927c11ebd704d6cd949","Country" 29 | "Bulgaria_保加利亚","377f6489b166e81528dae518f01b902a","Country" 30 | "Czech Republic_捷克","620b4a502ecfd4d0ae29eb415e1376f2","Country" 31 | "Thailand_泰国","1c5e6a1ec87338f28c86b943dc62d7b0","Country" 32 | "Hungary_匈牙利","9dc307e681dfa70d4904499a68bfeb90","Country" 33 | "Netherlands_荷兰","992b906c919a3045e83885f45b808b44","Country" 34 | "Iran_伊朗","77dd5183456a56014dda906b7975701f","Country" 35 | "Ireland_爱尔兰","22e2494f8ccb36eb804c3124cea6c8fe","Country" 36 | "Austria_奥地利","6bf8c1118d218cf823daa03b070a890b","Country" 37 | "Georgia_格鲁吉亚","a7be27d200b8133935b802b3fb9425ed","Country" 38 | "Kazakhstan_哈萨克斯坦","c4428254efafb170c59e1b3425658236","Country" 39 | "Sweden_瑞典","c392af313929f8ebce85f13204ada9ab","Country" 40 | "Israel_以色列","8b9cd04de1c78d11b670ae5a5fc52fd2","Country" 41 | "Mauritania_毛里塔尼亚","cbce01ff315a31ac4d1beefa6444e7cc","Country" 42 | "Norway_挪威","2937211cb5f223051be211f846b30d30","Country" 43 | "Denmark_丹麦 Denmark","210b67ef04e5b203e879a5d3d35b2a12","Country" 44 | "Switzerland_瑞士","306751770e59b2868e3737a36fdf3631","Country" 45 | "Luxembourg_卢森堡","5bf92e3a3ca7597ed63762432138d668","Country" 46 | "Argentina_阿根廷","52059ed5efa83bfb77ae08f16ccf26a9","Country" 47 | "Lithuania_立陶宛","6c15117286728700bee75631ed4974bc","Country" 48 | 
"Myanmar_缅甸","23cdc190d8494816232409bb28dece11","Country" 49 | "Greece_希腊","0640a02469377e4b0f00e11266265e8a","Country" 50 | "Australia_澳大利亚 Australia","52c44be5712e9092fef3ba0a5436ed2d","Country" 51 | "Tunisia_突尼斯","9cd1c0b80cccfc24ffa57f53d13bd922","Country" 52 | "Botswana_博茨瓦纳","2ed8485a0a27a8d9bdcf754ec85c4cb5","Country" 53 | "Germany_西德","ca0d3ebdb9ce249b4f61593bc466b2af","Country" 54 | "China_中国","512c03bee5b71d5a8bf402a565121a3e","Country" 55 | "Colombia_哥伦比亚","12a7b3a231f6ab4225293d40f72d5754","Country" 56 | "Finland_芬兰","d7126aa5580c8b0456945277dae709c5","Country" 57 | "Latvia_拉脱维亚","80e699030b672bbaf49462228e290c33","Country" 58 | "United Arab Emirates_阿联酋","10277873a792d5cb7d1a9cd2c0b9a8b9","Country" 59 | "Vietnam_越南","6c27eb044f452fc727b752460227e428","Country" 60 | "Morocco_摩洛哥","c9b92a4f7211342749cca170a797680b","Country" 61 | "Russia_苏联","b626d6f734d5ef7168dd8f1bb0a784c2","Country" 62 | "Cuba_古巴","2ebf6a376d3b75a8fdf5e7d29a14071a","Country" 63 | "Chile_智利","3edfec7a296784c88bb7dddcfc922846","Country" 64 | "Germany_原西德","04e0f67ac1e2194015e69754e53b9229","Country" 65 | "Slovakia_捷克斯洛伐克","4224044d1891fdd2a7b85cba2836fd72","Country" 66 | "Portugal_葡萄牙","2a6e0ff94bafacfd2cb8dce95fa76fcb","Country" 67 | "Slovakia_斯洛伐克","fe9eaec3056037975fa7b710dc1ca567","Country" 68 | "Algeria_阿尔及尼亚","08a22df2eb905a50d21937b12f2c9959","Country" 69 | "Tajikistan_塔吉克斯坦","92ec190710aa4e35b41d800536d8e084","Country" 70 | "Uzbekistan_乌兹别克斯坦","12efe81308c79aef4a0315076f89fc18","Country" 71 | "Slovakia_捷克斯洛伐克 Czechoslovakia","35a0be8e14a547774ba83cdf4652ee0a","Country" 72 | "Romania_罗马尼亚","a640cd5f70acdb4e8a6092abb3eec0b5","Country" 73 | "Iceland_冰島 Iceland","351846bfccac87b5eae3b288e54a727f","Country" 74 | "Canada_加拿大 Canada","4857888cf5421e1b01fa1237870b2eb9","Country" 75 | "United States of America_USA","e25a690d9e9c083ecf7f26517d05324c","Country" 76 | "United Kingdom_UK","b4f35124b12f21728bee305719b15e2d","Country" 77 | 
"Philippines_菲律宾","ad5e7dc8cd49f7efa9d1cb06e181c581","Country" 78 | "Czech Republic_捷克 Czech Republic","a39acb6653978379a11791320f3d2b43","Country" 79 | "The Bahamas_巴哈马 Bahamas","767d7db6411a9bb459ffd764228d60c0","Country" 80 | "Indonesia_印度尼西亚","fdf65cd14ed9fbc522853eb1ac01f6a8","Country" 81 | "Egypt_埃及","94cec28ec9221c822fddf518d8f2e925","Country" 82 | "Puerto Rico_波多黎各","4064de8ddef6639e973b75aec418f9b3","Country" 83 | "Indonesia_印尼","8b92039adac6f65f1741700a96ed7681","Country" 84 | "Canada_Canada","976ad072b1e174d173d14e0ac0061b7f","Country" 85 | "Peru_秘鲁","46c9293d35be0c8714f6c3f8379f9255","Country" 86 | -------------------------------------------------------------------------------- /movie/metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "entity-data": { 3 | "Person": "Person.csv", 4 | "Movie": "Movie.csv", 5 | "Country": "Country.csv" 6 | }, 7 | "relation-data": { 8 | "Movie|actor|Person": "actor.csv", 9 | "Movie|composer|Person": "composer.csv", 10 | "Movie|director|Person": "director.csv", 11 | "Movie|district|Country": "district.csv" 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /requirements.in: -------------------------------------------------------------------------------- 1 | click==7.0 2 | neo4j==1.7.2 3 | requests==2.32.0 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile 3 | # To update, run: 4 | # 5 | # pip-compile --output-file requirements.txt requirements.in 6 | # 7 | certifi==2024.7.4 8 | # via requests 9 | charset-normalizer==3.1.0 10 | # via requests 11 | click==7.0 12 | # via -r requirements.in 13 | idna==3.7 14 | # via requests 15 | neo4j==1.7.2 16 | # via -r requirements.in 17 | neobolt==1.7.4 18 | # via neo4j 19 | neotime==1.7.4 20 | # via 
neo4j 21 | pytz==2019.1 22 | # via neotime 23 | requests==2.32.0 24 | # via -r requirements.in 25 | six==1.12.0 26 | # via neotime 27 | urllib3==1.26.18 28 | # via requests 29 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 100 3 | ignore = E201,E202 4 | 5 | [pep8] 6 | max-line-length = 100 7 | ignore = E201,E202 8 | --------------------------------------------------------------------------------