├── .gitignore
├── NOTICE.txt
├── batch.properties
├── changelog.txt
├── generate.sh
├── import-mvn.sh
├── import.bat
├── import.sh
├── import_csv.sh
├── pom.xml
├── readme.md
├── run.sh
├── sample
├── batch.properties
├── import.sh
├── nodes.csv
├── nodes2.csv
└── rels.csv
├── settings.sh
├── sort.sh
└── src
├── main
├── java
│ └── org
│ │ └── neo4j
│ │ └── batchimport
│ │ ├── CSVParser.java
│ │ ├── CSVReader.java
│ │ ├── Importer.java
│ │ ├── IndexInfo.java
│ │ ├── LineData.java
│ │ ├── Report.java
│ │ ├── StdOutReport.java
│ │ ├── Utils.java
│ │ ├── importer
│ │ ├── AbstractLineData.java
│ │ ├── ChunkerLineData.java
│ │ ├── CsvLineData.java
│ │ ├── RelType.java
│ │ ├── RowData.java
│ │ └── Type.java
│ │ ├── index
│ │ ├── LongIterableIndexHits.java
│ │ └── MapDbCachingIndexProvider.java
│ │ └── utils
│ │ ├── Chunker.java
│ │ ├── Config.java
│ │ ├── FileIterator.java
│ │ ├── Params.java
│ │ ├── RelationshipSorter.java
│ │ └── RelationshipSorter2.java
└── resources
│ └── log4j.properties
└── test
└── java
├── DataTest.java
└── org
└── neo4j
└── batchimport
├── ImporterIntegrationTest.java
├── ImporterTest.java
├── IndexInfoTest.java
├── RelationshipMatcher.java
├── TestDataGenerator.java
├── TestImporter.java
├── csv
├── ChunkerPerformanceTest.java
├── ChunkerRowDataTest.java
├── CsvLineDataTest.java
├── OpenCSVPerformanceTest.java
├── OpenCSVTest.java
├── PerformanceTestFile.java
├── RowDataPerformanceTest.java
├── RowDataTest.java
└── StreamTokenizerTest.java
├── importer
└── AbstractLineDataTest.java
└── utils
├── ChunkerTest.java
├── ConfigTest.java
├── FileIteratorTest.java
├── ParamsTest.java
├── RelStartEndComparatorTest.java
└── RelationshipSorterTest.java
/.gitignore:
--------------------------------------------------------------------------------
1 | zip.sh
2 | *.db/
3 | lib/
4 | *.tsv
5 | .project
6 | .shell_history
7 | *.ipr
8 | *.iws
9 | *.iml
10 | .idea
11 | target
12 | *.csv
13 | .DS_Store
14 | .settings
15 | *.gz
16 | gc.log
17 | *.zip
18 | zip19.sh
--------------------------------------------------------------------------------
/NOTICE.txt:
--------------------------------------------------------------------------------
1 | Neo4j
2 | Copyright © 2002-2011 Network Engine for Objects in Lund AB (referred to
3 | in this notice as “Neo Technology”)
4 | [http://neotechnology.com]
5 |
6 | This product includes software ("Software") developed by Neo Technology.
7 |
8 | The copyright in the bundled Neo4j graph database (including the
9 | Software) is owned by Neo Technology. The Software developed and owned
10 | by Neo Technology is licensed under the GNU GENERAL PUBLIC LICENSE
11 | Version 3 (http://www.fsf.org/licensing/licenses/gpl-3.0.html) ("GPL")
12 | to all third parties and that license, as required by the GPL, is
13 | included in the LICENSE.txt file.
14 |
15 | However, if you have executed an End User Software License and Services
16 | Agreement or an OEM Software License and Support Services Agreement, or
17 | another commercial license agreement with Neo Technology or one of its
18 | affiliates (each, a "Commercial Agreement"), the terms of the license in
19 | such Commercial Agreement will supersede the GPL and you may use the
20 | software solely pursuant to the terms of the relevant Commercial
21 | Agreement.
22 |
23 |
24 | Third party libraries
25 | ---------------------
26 |
27 | Full license texts are found in LICENSES.txt.
28 |
29 |
30 | The bundled JAX-RS (JSR311) API is licensed under the GNU General
31 | Public License Version 2 with classpath exception. Alternatively
32 | under the Common Development and Distribution License, version 1.1.
33 |
34 | The bundled Jersey library, containing Jersey Core, Jersey Server, and
35 | Jersey Client, is licensed under the GNU General Public License
36 | Version 2 with classpath exception. Alternatively under the Common
37 | Development and Distribution License, version 1.1.
38 |
39 | The Mime streaming plugin library is licensed under the GNU General
40 | Public License Version 2 with classpath exception. Alternatively
41 | under the Common Development and Distribution License, version 1.0.
42 |
43 | Other bundled libraries are licensed according to the following listing.
44 |
45 | The Apache Software License, Version 2.0:
46 | Apache ServiceMix :: Bundles :: lucene,
47 | Apache Commons:
48 | Commons BeanUtils, Commons BeanUtils Core, Commons Collections, Commons IO,
49 | Commons Configuration, Commons Digester, Commons Lang, Commons Logging,
50 | Apache Log4j,
51 | Apache Felix: Felix FileInstall, Felix Framework, Felix Main,
52 | JSON.simple,
53 | RRD4J,
54 | Geronimo Java Transaction API,
55 | Groovy,
56 | Jackson: Jackson Core, Jackson JAX-RS, Data Mapper for Jackson,
57 | Jansi,
58 | Jetty: Jetty, Jetty Util, Jetty Servlet Specification API,
59 |
60 | MIT License:
61 | SLF4J API Module, SLF4J Log4j-12 Binding, SLF4J JDK1.4 Logging Binding,
62 | SLF4J Jakarta Commons Logging Binding,
63 | Base64.js,
64 | jTemplates,
65 | jQuery,
66 | jQuery BBQ,
67 | jQuery hashchange event,
68 | SimpleModal,
69 | jQuery flot including colorhelpers
70 |
71 | BSD licence:
72 | ASM: ASM Core, ASM Tree, ASM Commons, ASM Util, ASM Analysis,
73 | Blueprints: Data Models and their Implementations,
74 | Gremlin: A Graph-Based Programming Language,
75 | Pipes: A Data Flow Framework using Process Graphs,
76 | JLine, Scala library
77 |
78 | provided without support or warranty: JSON (JavaScript Object Notation)
79 |
80 | Public domain:
81 | Doug Lea's util.concurrent package,
82 | ANTLR 2.7.7,
83 | JSON2.js
84 |
--------------------------------------------------------------------------------
/batch.properties:
--------------------------------------------------------------------------------
1 | dump_configuration=false
2 | cache_type=none
3 | use_memory_mapped_buffers=true
4 | neostore.propertystore.db.index.keys.mapped_memory=5M
5 | neostore.propertystore.db.index.mapped_memory=5M
6 | neostore.nodestore.db.mapped_memory=200M
7 | neostore.relationshipstore.db.mapped_memory=500M
8 | neostore.propertystore.db.mapped_memory=200M
9 | neostore.propertystore.db.strings.mapped_memory=200M
10 | batch_array_separator=,
11 | #batch_import.csv.quotes=true
12 | #batch_import.csv.delim=,
13 |
--------------------------------------------------------------------------------
/changelog.txt:
--------------------------------------------------------------------------------
1 | 2013-06-27
2 | ==========
3 | * supports array types
4 |
5 | 2013-06-19
6 | ==========
7 | * import files can now be compressed as .gz or .zip
8 | * supports multiple csv files for nodes, relationships, comma separated
9 | * supports automatic indexing with headers like "name:string:users"
10 | * supports index lookups for relationships for start/end fields "name:string:users" and the literal values
11 | * supports now config file based setup
12 | * supports keeping the database instead of cleaning
13 | * supports opencsv as reader, alternative delimiters, quotes
14 | * supports caching in index lookups using MapDB in front of lucene
15 | * added faster default CSV reader
16 | * added new type LABEL that will also be used for node-labels in the future
--------------------------------------------------------------------------------
/generate.sh:
--------------------------------------------------------------------------------
1 | source ./settings.sh
2 |
3 | mvn clean test-compile exec:java -Dexec.mainClass=org.neo4j.batchimport.TestDataGenerator -Dexec.classpathScope=test \
4 | -Dexec.args="$1 $2 $3 $4" | grep -iv '\[\(INFO\|debug\)\]'
5 |
--------------------------------------------------------------------------------
/import-mvn.sh:
--------------------------------------------------------------------------------
1 | DB=${1-target/graph.db}
2 | shift
3 | NODES=${1-nodes.csv}
4 | shift
5 | RELS=${1-rels.csv}
6 | shift
7 | mvn compile exec:java -Dexec.mainClass="org.neo4j.batchimport.Importer" \
8 | -Dexec.args="batch.properties $DB $NODES $RELS $*" | grep -iv '\[\(INFO\|debug\)\]'
9 |
--------------------------------------------------------------------------------
/import.bat:
--------------------------------------------------------------------------------
1 | @echo off
2 |
3 | set ERROR_CODE=0
4 | set HEAP=4G
5 |
6 | :init
7 | @REM Decide how to startup depending on the version of windows
8 |
9 | @REM -- Win98ME
10 | if NOT "%OS%"=="Windows_NT" goto Win9xArg
11 |
12 | @REM set local scope for the variables with windows NT shell
13 | if "%OS%"=="Windows_NT" @setlocal
14 |
15 | @REM -- 4NT shell
16 | if "%eval[2+2]" == "4" goto 4NTArgs
17 |
18 | @REM -- Regular WinNT shell
19 | set CMD_LINE_ARGS=%*
20 | goto WinNTGetScriptDir
21 |
22 | @REM The 4NT Shell from jp software
23 | :4NTArgs
24 | set CMD_LINE_ARGS=%$
25 | goto WinNTGetScriptDir
26 |
27 | :Win9xArg
28 | @REM Slurp the command line arguments. This loop allows for an unlimited number
29 | @REM of arguments (up to the command line limit, anyway).
30 | set CMD_LINE_ARGS=
31 | :Win9xApp
32 | if %1a==a goto Win9xGetScriptDir
33 | set CMD_LINE_ARGS=%CMD_LINE_ARGS% %1
34 | shift
35 | goto Win9xApp
36 |
37 | :Win9xGetScriptDir
38 | set SAVEDIR=%CD%
39 | %0\
40 | cd %0\..\..
41 | set BASEDIR=%CD%
42 | cd %SAVEDIR%
43 | set SAVE_DIR=
44 | goto repoSetup
45 |
46 | :WinNTGetScriptDir
47 | set BASEDIR=%~dp0\.
48 |
49 | :repoSetup
50 |
51 | if "%JAVACMD%"=="" set JAVACMD=java
52 |
53 | if "%REPO%"=="" set REPO=%BASEDIR%\lib
54 |
55 | rem Setup the classpath
56 | set LIBPATH=""
57 | pushd "%REPO%"
58 | for %%G in (*.jar) do call:APPEND_TO_LIBPATH %%G
59 | popd
60 | goto LIBPATH_END
61 |
62 | : APPEND_TO_LIBPATH
63 | set filename=%~1
64 | set suffix=%filename:~-4%
65 | if %suffix% equ .jar set LIBPATH=%LIBPATH%;"%REPO%\%filename%"
66 | goto :EOF
67 |
68 | :LIBPATH_END
69 |
70 | set CLASSPATH=%LIBPATH%
71 |
72 | set EXTRA_JVM_ARGUMENTS=-Dfile.encoding=UTF-8 -Xmx%HEAP% -Xms%HEAP%
73 | goto endInit
74 |
75 | @REM Reaching here means variables are defined and arguments have been captured
76 | :endInit
77 |
78 | %JAVACMD% %JAVA_OPTS% %EXTRA_JVM_ARGUMENTS% -classpath %CLASSPATH_PREFIX%;%CLASSPATH% -Dapp.name="batch-import" -Dapp.repo="%REPO%" -Dbasedir="%BASEDIR%" org.neo4j.batchimport.Importer %CMD_LINE_ARGS%
79 | if ERRORLEVEL 1 goto error
80 | goto end
81 |
82 | :error
83 | if "%OS%"=="Windows_NT" @endlocal
84 | set ERROR_CODE=1
85 |
86 | :end
87 | @REM set local scope for the variables with windows NT shell
88 | if "%OS%"=="Windows_NT" goto endNT
89 |
90 | @REM For old DOS remove the set variables from ENV - we assume they were not set
91 | @REM before we started - at least we don't leave any baggage around
92 | set CMD_LINE_ARGS=
93 | goto postExec
94 |
95 | :endNT
96 | @endlocal
97 |
98 | :postExec
99 |
100 | if "%FORCE_EXIT_ON_ERROR%" == "on" (
101 | if %ERROR_CODE% NEQ 0 exit %ERROR_CODE%
102 | )
103 |
104 | exit /B %ERROR_CODE%
105 |
--------------------------------------------------------------------------------
/import.sh:
--------------------------------------------------------------------------------
1 | if [ ! -d lib ]; then
2 | echo lib directory of binary download missing. Please download the zip or run import-mvn.sh
3 | exit 1
4 | fi
5 |
6 | HEAP=4G
7 |
8 | # Detect Cygwin
9 | case `uname -s` in
10 | CYGWIN*)
11 | cygwin=1
12 | esac
13 |
14 | DB=${1-target/graph.db}
15 | shift
16 | NODES=${1-nodes.csv}
17 | shift
18 | RELS=${1-rels.csv}
19 | shift
20 | CP=""
21 | base=`dirname "$0"`
22 | if [ \! -z "$cygwin" ]; then
23 | wbase=`cygpath -w "$base"`
24 | fi
25 | curdir=`pwd`
26 | cd "$base"
27 | for i in lib/*.jar; do
28 | if [ -z "$cygwin" ]; then
29 | CP="$CP":"$base/$i"
30 | else
31 | i=`cygpath -w "$i"`
32 | CP="$CP;$wbase/$i"
33 | fi
34 | done
35 | cd "$curdir"
36 | #echo java -classpath $CP -Xmx$HEAP -Xms$HEAP -Dfile.encoding=UTF-8 org.neo4j.batchimport.Importer batch.properties "$DB" "$NODES" "$RELS" "$@"
37 | java -classpath "$CP" -Xmx$HEAP -Xms$HEAP -Dfile.encoding=UTF-8 org.neo4j.batchimport.Importer batch.properties "$DB" "$NODES" "$RELS" "$@"
38 |
--------------------------------------------------------------------------------
/import_csv.sh:
--------------------------------------------------------------------------------
1 | source ./settings.sh
2 |
3 | mvn clean test-compile exec:java -Dexec.mainClass=org.neo4j.batchimport.ParallelImporter -Dexec.classpathScope=test -Dexec.args="/mnt/parallel.db nodes.csv rels.csv 100000000 4 50 100 2 ONE,TWO,THREE,FOUR,FIVE,SIX,SEVEN,EIGHT,NINE,TEN"
4 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
2 | 4.0.0
3 | org.neo4j
4 | batch-import
5 | 3.0.4
6 | Neo4j Batch Importer
7 |
8 | UTF-8
9 | 3.0.4
10 | GPL-3-header.txt
11 |
12 |
13 |
14 | Neo4j Snapshots
15 | http://m2.neo4j.org/content/repositories/snapshots
16 |
17 |
18 |
19 |
20 |
21 | GNU General Public License, Version 3
22 | http://www.gnu.org/licenses/gpl-3.0-standalone.html
23 | The software ("Software") developed and owned by Network Engine for
24 | Objects in Lund AB (referred to in this notice as "Neo Technology") is
25 | licensed under the GNU GENERAL PUBLIC LICENSE Version 3 to all third
26 | parties and that license is included below.
27 |
28 | However, if you have executed an End User Software License and Services
29 | Agreement or an OEM Software License and Support Services Agreement, or
30 | another commercial license agreement with Neo Technology or one of its
31 | affiliates (each, a "Commercial Agreement"), the terms of the license in
32 | such Commercial Agreement will supersede the GNU GENERAL PUBLIC LICENSE
33 | Version 3 and you may use the Software solely pursuant to the terms of
34 | the relevant Commercial Agreement.
35 |
36 |
37 |
38 |
39 |
40 |
41 | net.sf.opencsv
42 | opencsv
43 | 2.3
44 |
45 |
46 | org.mapdb
47 | mapdb
48 | 0.9.3
49 |
50 |
51 | junit
52 | junit
53 | 4.8.1
54 | test
55 |
56 |
57 | log4j
58 | log4j
59 | 1.2.17
60 |
61 |
62 | org.mockito
63 | mockito-core
64 | 1.8.5
65 | test
66 |
67 |
68 | org.neo4j
69 | neo4j-kernel
70 | ${neo4j.version}
71 |
72 |
73 | org.neo4j
74 | neo4j-enterprise
75 | ${neo4j.version}
76 |
77 |
78 | org.neo4j
79 | neo4j-lucene-index
80 | ${neo4j.version}
81 |
82 |
83 |
84 |
85 |
86 | org.apache.maven.plugins
87 | maven-compiler-plugin
88 | 2.1
89 |
90 | 1.7
91 | 1.7
92 |
93 |
94 |
95 | maven-assembly-plugin
96 |
97 |
98 | batch-import
99 |
100 |
101 |
102 | org.neo4j.batchimport.Importer
103 |
104 |
105 |
106 | jar-with-dependencies
107 |
108 |
109 |
110 |
111 |
112 |
113 |
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | # Neo4j (CSV) Batch Importer
2 |
3 | ## Neo4j 2.2+ neo4j-import tool
4 |
5 | 此工具的具体使用步骤可参见本人博客:https://my.oschina.net/u/2538940/blog/883829
6 |
7 | 直接下载已编译好的工具:
8 | https://github.com/mo9527/batch-import-tool
9 |
10 | 与原始版本比较,本版本主要做了以下修改:\
11 | 1、修复了导入.gz关系压缩文件时,win环境会出现关系无法导入的情况。\
12 | 2、增加程序对csv文件的容错性,即使csv文件的某一行出现编码或断行问题,也不会影响接下来数据的\
13 | 导入,前提是每一行数据的字节长度不超过5000。如果超过5000,可自行修改org.neo4j.batchimport.CSVParser\
14 | 文件的第171行。
15 |
16 |
17 | Since version 2.2.0 Neo4j comes with an **high performance import tool** out of the box that takes many ideas of this one, but is way more scalable across CPUs and has little memory requirements.
18 |
19 | The only cases that are not covered are repeated imports in existing stores and population of manual indexes. Please consider the built-in and officially supported tool first, before falling back onto this one.
20 |
21 | The simplest invocation is `/path/to/neo4j/bin/neo4j-import --into graph.db --nodes nodes.csv --relationships rels.csv` with the header format being similar to this one. For a quick intro check the [developer pages](http://neo4j.com/developer/guide-import-csv/#_super_fast_batch_importer_for_huge_datasets).
22 | There is much more to it, please see the [Neo4j reference manual](http://neo4j.com/docs/stable/import-tool.html).
23 |
24 | ## Licensing
25 |
26 | This software is licensed under the [GPLv3](http://www.gnu.org/licenses/gpl-3.0.en.html) for now.
27 | You can ask [Neo Technology](http://neotechnology.com) about a different licensing agreement.
28 |
29 | __Works with Neo4j 2.x__
30 |
31 | ## Binary Download
32 |
33 | To simply use it (no source/git/maven required):
34 | * [download 2.2 zip](https://dl.dropboxusercontent.com/u/14493611/batch_importer_22.zip)
35 | * unzip
36 | * run `import.sh test.db nodes.csv rels.csv` (on Windows: `import.bat`)
37 | * after the import point your `/path/to/neo4j/conf/neo4j-server.properties` to this `test.db` directory,
38 | or copy the data over to your server `cp -r test.db/* /path/to/neo4j/data/graph.db/`
39 |
40 | You provide one **tab separated** csv file for nodes and one for relationships (optionally more for indexes)
41 |
42 | Example data for the files is a small family network
43 |
44 | ## File format
45 |
46 | * **tab separated** csv files
47 | * Property names in first row.
48 | * If only one file is initially imported, the row number corresponds to the node-id (*starting with 0*)
49 | * Property values not listed will not be set on the nodes or relationships.
50 | * Optionally property fields can have a type (defaults to String) indicated with name:type where type is one of
51 | (int, long, float, double, boolean, byte, short, char, string). The string value is then converted to that type.
52 | Conversion failure will result in abort of the import operation.
53 | * There is a separate "label" type, which should be used for relationship types and/or node labels, (`labels:label`)
54 | * Property fields may also be arrays by adding "_array" to the types above and separating the data with commas.
55 | * for non-ascii characters make sure to add `-Dfile.encoding=UTF-8` to the commandline arguments
56 | * Optionally automatic indexing of properties can be configured with a header like `name:string:users` and a configured index in `batch.properties` like `batch_import.node_index=exact`
57 | then the property `name` will be indexed in the `users` index for each row with a value there
58 | * multiple files for nodes and rels, comma separated, without spaces like "node1.csv,node2.csv"
59 | * you can specify concrete, externally provided node-id's with: `i:id`, both in the node and relationship-files
60 | * csv files can be zipped individually as *.gz or *.zip
61 |
62 | ## Examples
63 |
64 | There is also a `sample` directory, please run from the main directory `./import.sh test.db sample/nodes.csv sample/rels.csv`
65 |
66 | ### nodes.csv
67 |
68 | name l:label age works_on
69 | Michael Person,Father 37 neo4j
70 | Selina Person,Child 14
71 | Rana Person,Child 6
72 | Selma Person,Child 4
73 |
74 | ### rels.csv
75 |
76 | Note that the node-id references are numbered from 0 (since Neo4j 2.0)
77 |
78 | start end type since counter:int
79 | 0 1 FATHER_OF 1998-07-10 1
80 | 0 2 FATHER_OF 2007-09-15 2
81 | 0 3 FATHER_OF 2008-05-03 3
82 | 2 3 SISTER_OF 2008-05-03 5
83 | 1 2 SISTER_OF 2007-09-15 7
84 |
85 |
86 | ## Execution
87 |
88 | Just use the provided shell script `import.sh` or `import.bat` on Windows
89 |
90 | import.sh test.db nodes.csv rels.csv
91 |
92 |
93 | ### For Developers
94 |
95 | If you want to work on the code and run the importer after making changes:
96 |
97 | mvn clean compile exec:java -Dexec.mainClass="org.neo4j.batchimport.Importer" -Dexec.args="neo4j/data/graph.db nodes.csv rels.csv"
98 |
99 | or
100 |
101 | java -server -Dfile.encoding=UTF-8 -Xmx4G -jar target/batch-import-jar-with-dependencies.jar neo4j/data/graph.db nodes.csv rels.csv
102 |
103 |
104 | ynagzet:batchimport mh$ rm -rf target/db
105 | ynagzet:batchimport mh$ mvn clean compile assembly:single
106 | [INFO] Scanning for projects...
107 | [INFO] ------------------------------------------------------------------------
108 | [INFO] Building Simple Batch Importer
109 | [INFO] task-segment: [clean, compile, assembly:single]
110 | [INFO] ------------------------------------------------------------------------
111 | ...
112 | [INFO] Building jar: /Users/mh/java/neo/batchimport/target/batch-import-jar-with-dependencies.jar
113 | [INFO] ------------------------------------------------------------------------
114 | [INFO] BUILD SUCCESSFUL
115 | [INFO] ------------------------------------------------------------------------
116 | ynagzet:batchimport mh$ java -server -Xmx4G -jar target/batch-import-jar-with-dependencies.jar target/db nodes.csv rels.csv
117 | Physical mem: 16384MB, Heap size: 3640MB
118 |
119 | Configuration:
120 | use_memory_mapped_buffers=false
121 | neostore.nodestore.db.mapped_memory=200M
122 | neostore.relationshipstore.db.mapped_memory=1000M
123 | neostore.propertystore.db.mapped_memory=1000M
124 | neostore.propertystore.db.strings.mapped_memory=100M
125 | neostore.propertystore.db.arrays.mapped_memory=215M
126 | neo_store=/Users/mh/java/neo/batchimport/test.db
127 | dump_configuration=true
128 | cache_type=none
129 |
130 | ...........................................................................
131 | Importing 7500000 Nodes took 17 seconds
132 | ....................................................................................................35818 ms
133 | ....................................................................................................39343 ms
134 | ....................................................................................................41788 ms
135 | ....................................................................................................48897 ms
136 | ............
137 | Importing 41246740 Relationships took 170 seconds
138 | Total 212 seconds
139 | ynagzet:batchimport mh$ du -sh test.db
140 | 3,2G test.db
141 |
142 | ## Parameters
143 |
144 | *First parameter* MIGHT be the property-file name, if so it has to end with `.properties`, then this file will be used and all other parameters are consumed as usual
145 |
146 | *First parameter* - the graph database directory, a new db will be created in the directory except when `batch_import.keep_db=true` is set in `batch.properties`.
147 |
148 | *Second parameter* - a comma separated list of *node-csv-files*
149 |
150 | *Third parameter* - a comma separated list of *relationship-csv-files*
151 |
152 | It is also possible to specify those two file-lists in the config:
153 |
154 | ````
155 | batch_import.nodes_files=nodes1.csv[,nodes2.csv]
156 | batch_import.rels_files=rels1.csv[,rels2.csv]
157 | ````
158 |
159 | *Fourth parameter* - index configuration each a set of 4 values: `node_index users fulltext nodes_index.csv` or more generally: `node-or-rel-index index-name index-type index-file`
160 |
161 | This parameter set can be repeatedly used, see below. It is also possible to configure this in the config (`batch.properties`)
162 |
163 | ````
164 | batch_import.node_index.users=exact
165 | ````
166 |
167 | ## Schema indexes
168 |
169 | Currently schema indexes are not created by the batch-inserter, you could create them upfront and use `batch_import.keep_db=true` to work with the existing database.
170 | You then have the option of specifying labels for your nodes using a column header like `type:label` and a comma separated list of label values.
171 | Then on shutdown of the import Neo4j will populate the schema indexes with nodes with the appropriate labels and properties automatically.
172 | (As a rough estimate, the index creation will take time proportional to the amount of labeled nodes and indexed properties.)
173 |
174 | ## (Legacy) Indexing
175 |
176 | ### Indexing of inserted properties
177 |
178 | You can automatically index properties of nodes and relationships by adding ":indexName" to the property-column-header.
179 | Just configure the indexes in `batch.properties` like so:
180 |
181 | ````
182 | batch_import.node_index.users=exact
183 | ````
184 |
185 | ````
186 | name:string:users age works_on
187 | Michael 37 neo4j
188 | Selina 14
189 | Rana 6
190 | Selma 4
191 | ````
192 |
193 | **If you use `node_auto_index` as the index name, you can also initially populate Neo4j's automatic node index which is then
194 | later used and updated while working with the database.**
195 |
196 |
197 | In the relationships-file you can optionally specify that the start and end-node should be looked up from the index in the same way
198 |
199 | ````
200 | name:string:users name:string:users type since counter:int
201 | Michael Selina FATHER_OF 1998-07-10 1
202 | Michael Rana FATHER_OF 2007-09-15 2
203 | Michael Selma FATHER_OF 2008-05-03 3
204 | Rana Selma SISTER_OF 2008-05-03 5
205 | Selina Rana SISTER_OF 2007-09-15 7
206 | ````
207 |
208 | ### Explicit Indexing
209 |
210 | Optionally you can add nodes and relationships to indexes.
211 |
212 | Add four arguments per each index to command line:
213 |
214 | To create a full text node index called users using nodes_index.csv:
215 |
216 | ````
217 | node_index users fulltext nodes_index.csv
218 | ````
219 |
220 | To create an exact relationship index called worked using rels_index.csv:
221 |
222 | ````
223 | rel_index worked exact rels_index.csv
224 | ````
225 |
226 | Example command line:
227 |
228 | ````
229 | ./import.sh test.db nodes.csv rels.csv node_index users fulltext nodes_index.csv rel_index worked exact rels_index.csv
230 | ````
231 |
232 | ### Using Neo4j's Automatic Indexing
233 |
234 | The auto-indexing elsewhere in this file pertains to the *batch inserter's* ability to automatically index. If you want to
235 | use this cool feature from the batch inserter, there's a little gotcha. You still need to enable the batch inserter's feature
236 | with `batch_import.node_index` but then instead of specifying the name of a regular index, specify the auto index's name like so:
237 |
238 | ````
239 | batch_import.node_index.node_auto_index=exact
240 | ````
241 |
242 | And you have to make sure to also enable automatic indexing in your regular Neo4j database's (`conf/neo4j.properties`) and
243 | specify the correct node properties to be indexed.
244 |
245 | ## Examples
246 |
247 | ### nodes_index.csv
248 |
249 | ````
250 | id name language
251 | 0 Victor Richards West Frisian
252 | 1 Virginia Shaw Korean
253 | 2 Lois Simpson Belarusian
254 | 3 Randy Bishop Hiri Motu
255 | 4 Lori Mendoza Tok Pisin
256 | ````
257 |
258 | ### rels_index.csv
259 |
260 | ````
261 | id property1 property2
262 | 0 cwqbnxrv rpyqdwhk
263 | 1 qthnrret tzjmmhta
264 | 2 dtztaqpy pbmcdqyc
265 | ````
266 |
267 | ## Configuration
268 |
269 | The Importer uses a supplied `batch.properties` file to be configured:
270 |
271 | #### Memory Mapping I/O Config
272 |
273 | Most important is the memory config, you should try to have enough RAM to map as much of your store-files to memory as possible.
274 |
275 | At least the node-store and large parts of the relationship-store should be mapped. The property- and string-stores are mostly
276 | append only so don't need that much RAM. Below is an example for about 6GB RAM, to leave room for the heap and also OS and OS caches.
277 |
278 | ````
279 | cache_type=none
280 | use_memory_mapped_buffers=true
281 | # 14 bytes per node
282 | neostore.nodestore.db.mapped_memory=200M
283 | # 33 bytes per relationships
284 | neostore.relationshipstore.db.mapped_memory=3G
285 | # 38 bytes per property
286 | neostore.propertystore.db.mapped_memory=500M
287 | # 60 bytes per long-string block
288 | neostore.propertystore.db.strings.mapped_memory=500M
289 | neostore.propertystore.db.index.keys.mapped_memory=5M
290 | neostore.propertystore.db.index.mapped_memory=5M
291 | ````
292 |
293 | #### Indexes (experimental)
294 |
295 | ````
296 | batch_import.node_index.users=exact
297 | batch_import.node_index.articles=fulltext
298 | batch_import.relationship_index.friends=exact
299 | ````
300 |
301 | #### CSV (experimental)
302 |
303 | ````
304 | batch_import.csv.quotes=true // default, set to false for faster, experimental csv-reader
305 | batch_import.csv.delim=,
306 | ````
307 |
308 | ##### Index-Cache (experimental)
309 |
310 | ````
311 | batch_import.mapdb_cache.disable=true
312 | ````
313 |
314 | ##### Keep Database (experimental)
315 |
316 | ````
317 | batch_import.keep_db=true
318 | ````
319 |
320 | ## Utilities
321 |
322 | ### TestDataGenerator
323 |
324 | It is a dumb random test data generator (`org.neo4j.batchimport.TestDataGenerator`) that you can run with
325 |
326 | ./generate.sh #nodes #max-rels-per-node REL1,REL2,REL3 LABEL1,LABEL2,LABEL3
327 |
328 | Will generate nodes.csv and rels.csv for those numbers
329 |
330 |
331 | ### Relationship-Sorter
332 |
333 | Sorts a given relationship-CSV file by min(start,end) as required for the parallel sorter. Uses the data-pump sorter from mapdb
334 | for the actual sorting with a custom Comparator.
335 |
336 | `org.neo4j.batchimport.utils.RelationshipSorter` rels-input.csv rels-output.csv
337 |
338 |
339 |
--------------------------------------------------------------------------------
/run.sh:
--------------------------------------------------------------------------------
1 | . ./settings.sh
2 |
3 | mvn clean test-compile exec:java -Dexec.mainClass=org.neo4j.batchimport.DisruptorTest -Dexec.classpathScope=test
--------------------------------------------------------------------------------
/sample/batch.properties:
--------------------------------------------------------------------------------
1 | dump_configuration=false
2 | cache_type=none
3 | use_memory_mapped_buffers=true
4 | neostore.propertystore.db.index.keys.mapped_memory=5M
5 | neostore.propertystore.db.index.mapped_memory=5M
6 | neostore.nodestore.db.mapped_memory=200M
7 | neostore.relationshipstore.db.mapped_memory=500M
8 | neostore.propertystore.db.mapped_memory=200M
9 | neostore.propertystore.db.strings.mapped_memory=200M
10 |
11 | batch_import.node_index.users=exact
--------------------------------------------------------------------------------
/sample/import.sh:
--------------------------------------------------------------------------------
1 | echo "Run in main directory sh sample/import.sh"
2 | mvn test-compile exec:java -Dexec.mainClass="org.neo4j.batchimport.Importer" \
3 | -Dexec.args="sample/batch.properties target/graph.db sample/nodes.csv,sample/nodes2.csv sample/rels.csv"
--------------------------------------------------------------------------------
/sample/nodes.csv:
--------------------------------------------------------------------------------
1 | name:string:users age works_on
2 | Michael 37 neo4j
3 | Selina 14
--------------------------------------------------------------------------------
/sample/nodes2.csv:
--------------------------------------------------------------------------------
1 | name:string:users age works_on
2 | Rana 6
3 | Selma 4
--------------------------------------------------------------------------------
/sample/rels.csv:
--------------------------------------------------------------------------------
1 | name:string:users name:string:users type since counter:int
2 | Michael Selina FATHER_OF 1998-07-10 1
3 | Michael Rana FATHER_OF 2007-09-15 2
4 | Michael Selma FATHER_OF 2008-05-03 3
5 | Rana Selma SISTER_OF 2008-05-03 5
6 | Selina Rana SISTER_OF 2007-09-15 7
--------------------------------------------------------------------------------
/settings.sh:
--------------------------------------------------------------------------------
1 | MEMORY_OPTS="-Xmx50G -Xms50G -server -d64 -Xmn3g -XX:SurvivorRatio=2"
2 | GC_OPTS="-XX:+UseConcMarkSweepGC -XX:+UseParNewGC -XX:ParallelCMSThreads=4 -XX:+CMSParallelRemarkEnabled -XX:+CMSIncrementalMode -XX:+CMSIncrementalPacing -XX:CMSIncrementalDutyCycle=10 -XX:CMSFullGCsBeforeCompaction=1 "
3 |
4 | PRINT_GC_OPTS="-XX:+PrintTenuringDistribution -XX:+PrintGCDetails -XX:+PrintGCDateStamps -Xloggc:gc.log"
5 |
6 | # PROFILE_OPTS="-agentpath:/root/yourkit/bin/linux-x86-64/libyjpagent.so=port=10001"
7 |
8 | #-XX:+PrintGCApplicationStoppedTime -XX:+PrintGCApplicationConcurrentTime -XX:+PrintHeapAtGC -XX:+PrintGCTaskTimeStamps
9 |
10 | export MAVEN_OPTS="$PROFILE_OPTS $MEMORY_OPTS $GC_OPTS $PRINT_GC_OPTS"
--------------------------------------------------------------------------------
/sort.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Sorts a relationship CSV by start/end node using RelationshipSorter.
# Usage: sort.sh [input.csv] [output.csv]  (defaults: rels.csv / rels-sorted.csv)
HEAP=4G
IN=${1-rels.csv}
# Only shift when an argument was actually supplied; a bare `shift` with no
# positional parameters is an error in POSIX sh.
if [ $# -gt 0 ]; then shift; fi
OUT=${1-rels-sorted.csv}
# Build the classpath from every jar in lib/.
CP=""
for i in lib/*.jar; do CP="$CP:$i"; done

# Echo the command first so the exact invocation appears in the log.
echo java -classpath "$CP" -Xmx$HEAP -Xms$HEAP -Dfile.encoding=UTF-8 org.neo4j.batchimport.utils.RelationshipSorter "$IN" "$OUT"
java -classpath "$CP" -Xmx$HEAP -Xms$HEAP -Dfile.encoding=UTF-8 org.neo4j.batchimport.utils.RelationshipSorter "$IN" "$OUT"
--------------------------------------------------------------------------------
/src/main/java/org/neo4j/batchimport/CSVParser.java:
--------------------------------------------------------------------------------
1 | package org.neo4j.batchimport;
2 |
3 | /**
4 | Copyright 2005 Bytecode Pty Ltd.
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | import java.io.IOException;
20 | import java.util.ArrayList;
21 | import java.util.List;
22 |
23 | /**
24 | * A very simple CSV parser released under a commercial-friendly license.
25 | * This just implements splitting a single line into fields.
26 | *
27 | * @author Glen Smith
28 | * @author Rainer Pruy
29 | */
/**
 * A very simple CSV parser released under a commercial-friendly license.
 * This just implements splitting a single line into fields.
 *
 * @author Glen Smith
 * @author Rainer Pruy
 */
public class CSVParser {

    private final char separator;

    private final char quotechar;

    private final char escape;

    private final boolean strictQuotes;

    // Holds an unterminated quoted field carried over between parseLineMulti() calls.
    private String pending;
    // True while we are inside a field; used to decide whether a quote/escape is escapable.
    private boolean inField = false;

    private final boolean ignoreLeadingWhiteSpace;

    /**
     * The default separator to use if none is supplied to the constructor.
     */
    public static final char DEFAULT_SEPARATOR = ',';

    public static final int INITIAL_READ_SIZE = 128;

    /**
     * The default quote character to use if none is supplied to the
     * constructor.
     */
    public static final char DEFAULT_QUOTE_CHARACTER = '"';

    /**
     * The default escape character to use if none is supplied to the
     * constructor.
     */
    public static final char DEFAULT_ESCAPE_CHARACTER = '\\';

    /**
     * The default strict quote behavior to use if none is supplied to the
     * constructor.
     */
    public static final boolean DEFAULT_STRICT_QUOTES = false;

    /**
     * The default leading whitespace behavior to use if none is supplied to the
     * constructor.
     */
    public static final boolean DEFAULT_IGNORE_LEADING_WHITESPACE = true;

    /**
     * This is the "null" character - if a value is set to this then it is ignored.
     * I.E. if the quote character is set to null then there is no quote character.
     */
    public static final char NULL_CHARACTER = '\0';

    /**
     * Safety valve for malformed CSV: if a pending (unterminated) quoted field grows
     * beyond this many characters, the accumulated fragment is discarded so a broken
     * line break cannot cause unbounded memory growth or an apparent hang.
     * NOTE(review): discarding the fragment silently drops data - confirm this
     * trade-off is acceptable for the import use case.
     */
    public static final int MAX_PENDING_LENGTH = 5000;

    /**
     * Constructs CSVParser using a comma for the separator.
     */
    public CSVParser() {
        this(DEFAULT_SEPARATOR, DEFAULT_QUOTE_CHARACTER, DEFAULT_ESCAPE_CHARACTER);
    }

    /**
     * Constructs CSVParser with supplied separator.
     *
     * @param separator the delimiter to use for separating entries.
     */
    public CSVParser(char separator) {
        this(separator, DEFAULT_QUOTE_CHARACTER, DEFAULT_ESCAPE_CHARACTER);
    }

    /**
     * Constructs CSVParser with supplied separator and quote char.
     *
     * @param separator the delimiter to use for separating entries
     * @param quotechar the character to use for quoted elements
     */
    public CSVParser(char separator, char quotechar) {
        this(separator, quotechar, DEFAULT_ESCAPE_CHARACTER);
    }

    /**
     * Constructs CSVParser with supplied separator, quote and escape char.
     *
     * @param separator the delimiter to use for separating entries
     * @param quotechar the character to use for quoted elements
     * @param escape    the character to use for escaping a separator or quote
     */
    public CSVParser(char separator, char quotechar, char escape) {
        this(separator, quotechar, escape, DEFAULT_STRICT_QUOTES);
    }

    /**
     * Constructs CSVParser with supplied separator, quote and escape char.
     * Allows setting the "strict quotes" flag.
     *
     * @param separator    the delimiter to use for separating entries
     * @param quotechar    the character to use for quoted elements
     * @param escape       the character to use for escaping a separator or quote
     * @param strictQuotes if true, characters outside the quotes are ignored
     */
    public CSVParser(char separator, char quotechar, char escape, boolean strictQuotes) {
        this(separator, quotechar, escape, strictQuotes, DEFAULT_IGNORE_LEADING_WHITESPACE);
    }

    /**
     * Constructs CSVParser with supplied separator, quote and escape char.
     * Allows setting the "strict quotes" and "ignore leading whitespace" flags.
     *
     * @param separator               the delimiter to use for separating entries
     * @param quotechar               the character to use for quoted elements
     * @param escape                  the character to use for escaping a separator or quote
     * @param strictQuotes            if true, characters outside the quotes are ignored
     * @param ignoreLeadingWhiteSpace if true, white space in front of a quote in a field is ignored
     * @throws UnsupportedOperationException if the control characters clash or the separator is undefined
     */
    public CSVParser(char separator, char quotechar, char escape, boolean strictQuotes, boolean ignoreLeadingWhiteSpace) {
        if (anyCharactersAreTheSame(separator, quotechar, escape)) {
            throw new UnsupportedOperationException("The separator, quote, and escape characters must be different!");
        }
        if (separator == NULL_CHARACTER) {
            throw new UnsupportedOperationException("The separator character must be defined!");
        }
        this.separator = separator;
        this.quotechar = quotechar;
        this.escape = escape;
        this.strictQuotes = strictQuotes;
        this.ignoreLeadingWhiteSpace = ignoreLeadingWhiteSpace;
    }

    private boolean anyCharactersAreTheSame(char separator, char quotechar, char escape) {
        return isSameCharacter(separator, quotechar) || isSameCharacter(separator, escape) || isSameCharacter(quotechar, escape);
    }

    private boolean isSameCharacter(char c1, char c2) {
        // NULL_CHARACTER means "not configured", so it never clashes.
        return c1 != NULL_CHARACTER && c1 == c2;
    }

    /**
     * @return true if something was left over from last call(s)
     */
    public boolean isPending() {
        // Guard against broken CSV line breaks: once the carried-over fragment exceeds
        // MAX_PENDING_LENGTH, assume the quoting is corrupt and drop it to avoid
        // unbounded memory use / an apparent hang (see MAX_PENDING_LENGTH javadoc).
        if (pending != null && pending.length() > MAX_PENDING_LENGTH) {
            pending = null;
            return false;
        }
        return pending != null;
    }

    /**
     * Parses a line allowing quoted fields to span multiple physical lines;
     * unterminated quotes are kept in {@link #isPending() pending} state.
     */
    public String[] parseLineMulti(String nextLine) throws IOException {
        return parseLine(nextLine, true);
    }

    /**
     * Parses a single physical line; an unterminated quote raises an IOException.
     */
    public String[] parseLine(String nextLine) throws IOException {
        return parseLine(nextLine, false);
    }

    /**
     * Parses an incoming String and returns an array of elements.
     *
     * @param nextLine the string to parse
     * @param multi    if true, an unterminated quoted field is carried over to the next call
     * @return the comma-tokenized list of elements, or null if nextLine is null
     * @throws IOException if an unterminated quoted field ends the line in single-line mode
     */
    private String[] parseLine(String nextLine, boolean multi) throws IOException {

        // A single-line parse never continues a previous multi-line fragment.
        if (!multi && pending != null) {
            pending = null;
        }

        if (nextLine == null) {
            if (pending != null) {
                String s = pending;
                pending = null;
                return new String[]{s};
            } else {
                return null;
            }
        }

        List<String> tokensOnThisLine = new ArrayList<>();
        StringBuilder sb = new StringBuilder(INITIAL_READ_SIZE);
        boolean inQuotes = false;
        if (pending != null) {
            // Resume the quoted field left open by the previous call.
            sb.append(pending);
            pending = null;
            inQuotes = true;
        }
        for (int i = 0; i < nextLine.length(); i++) {

            char c = nextLine.charAt(i);
            if (c == this.escape) {
                if (isNextCharacterEscapable(nextLine, inQuotes || inField, i)) {
                    sb.append(nextLine.charAt(i + 1));
                    i++; // consume the escaped character
                }
            } else if (c == quotechar) {
                if (isNextCharacterEscapedQuote(nextLine, inQuotes || inField, i)) {
                    sb.append(nextLine.charAt(i + 1));
                    i++; // "" inside quotes becomes a literal quote
                } else {
                    // the tricky case of an embedded quote in the middle: a,bc"d"ef,g
                    if (!strictQuotes) {
                        if (i > 2 //not on the beginning of the line
                                && nextLine.charAt(i - 1) != this.separator //not at the beginning of an escape sequence
                                && nextLine.length() > (i + 1) &&
                                nextLine.charAt(i + 1) != this.separator //not at the end of an escape sequence
                                ) {

                            if (ignoreLeadingWhiteSpace && sb.length() > 0 && isAllWhiteSpace(sb)) {
                                sb.setLength(0); //discard white space leading up to quote
                            } else {
                                sb.append(c); // keep the embedded quote literally
                            }

                        }
                    }

                    inQuotes = !inQuotes;
                }
                inField = !inField;
            } else if (c == separator && !inQuotes) {
                tokensOnThisLine.add(sb.toString());
                sb.setLength(0); // start work on next token
                inField = false;
            } else {
                if (!strictQuotes || inQuotes) {
                    sb.append(c);
                    inField = true;
                }
            }
        }
        // line is done - check status
        if (inQuotes) {
            if (multi) {
                // continuing a quoted section, re-append newline
                sb.append("\n");
                pending = sb.toString();
                sb = null; // this partial content is not to be added to field list yet
            } else {
                throw new IOException("Un-terminated quoted field at end of CSV line");
            }
        }
        if (sb != null) {
            tokensOnThisLine.add(sb.toString());
        }
        return tokensOnThisLine.toArray(new String[tokensOnThisLine.size()]);

    }

    /**
     * precondition: the current character is a quote or an escape
     *
     * @param nextLine the current line
     * @param inQuotes true if the current context is quoted
     * @param i        current index in line
     * @return true if the following character is a quote
     */
    private boolean isNextCharacterEscapedQuote(String nextLine, boolean inQuotes, int i) {
        return inQuotes // we are in quotes, therefore there can be escaped quotes in here.
                && nextLine.length() > (i + 1) // there is indeed another character to check.
                && nextLine.charAt(i + 1) == quotechar;
    }

    /**
     * precondition: the current character is an escape
     *
     * @param nextLine the current line
     * @param inQuotes true if the current context is quoted
     * @param i        current index in line
     * @return true if the following character is a quote or another escape
     */
    protected boolean isNextCharacterEscapable(String nextLine, boolean inQuotes, int i) {
        return inQuotes // we are in quotes, therefore there can be escaped quotes in here.
                && nextLine.length() > (i + 1) // there is indeed another character to check.
                && (nextLine.charAt(i + 1) == quotechar || nextLine.charAt(i + 1) == this.escape);
    }

    /**
     * precondition: sb.length() > 0
     *
     * @param sb A sequence of characters to examine
     * @return true if every character in the sequence is whitespace
     */
    protected boolean isAllWhiteSpace(CharSequence sb) {
        for (int i = 0; i < sb.length(); i++) {
            if (!Character.isWhitespace(sb.charAt(i))) {
                return false;
            }
        }
        return true;
    }
}
329 |
--------------------------------------------------------------------------------
/src/main/java/org/neo4j/batchimport/CSVReader.java:
--------------------------------------------------------------------------------
1 | package org.neo4j.batchimport;
2 |
3 | /**
4 | Copyright 2005 Bytecode Pty Ltd.
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | import java.io.BufferedReader;
20 | import java.io.Closeable;
21 | import java.io.IOException;
22 | import java.io.Reader;
23 | import java.util.ArrayList;
24 | import java.util.List;
25 |
26 | /**
27 | * A very simple CSV reader released under a commercial-friendly license.
28 | *
29 | * @author Glen Smith
30 | *
31 | */
32 | public class CSVReader implements Closeable {
33 |
34 | private BufferedReader br;
35 |
36 | private boolean hasNext = true;
37 |
38 | private CSVParser parser;
39 |
40 | private int skipLines;
41 |
42 | private boolean linesSkiped;
43 |
44 | /**
45 | * The default line to start reading.
46 | */
47 | public static final int DEFAULT_SKIP_LINES = 0;
48 |
49 | /**
50 | * Constructs CSVReader using a comma for the separator.
51 | *
52 | * @param reader
53 | * the reader to an underlying CSV source.
54 | */
55 | public CSVReader(Reader reader) {
56 | this(reader, CSVParser.DEFAULT_SEPARATOR, CSVParser.DEFAULT_QUOTE_CHARACTER, CSVParser.DEFAULT_ESCAPE_CHARACTER);
57 | }
58 |
59 | /**
60 | * Constructs CSVReader with supplied separator.
61 | *
62 | * @param reader
63 | * the reader to an underlying CSV source.
64 | * @param separator
65 | * the delimiter to use for separating entries.
66 | */
67 | public CSVReader(Reader reader, char separator) {
68 | this(reader, separator, CSVParser.DEFAULT_QUOTE_CHARACTER, CSVParser.DEFAULT_ESCAPE_CHARACTER);
69 | }
70 |
71 | /**
72 | * Constructs CSVReader with supplied separator and quote char.
73 | *
74 | * @param reader
75 | * the reader to an underlying CSV source.
76 | * @param separator
77 | * the delimiter to use for separating entries
78 | * @param quotechar
79 | * the character to use for quoted elements
80 | */
81 | public CSVReader(Reader reader, char separator, char quotechar) {
82 | this(reader, separator, quotechar, CSVParser.DEFAULT_ESCAPE_CHARACTER, DEFAULT_SKIP_LINES, CSVParser.DEFAULT_STRICT_QUOTES);
83 | }
84 |
85 | /**
86 | * Constructs CSVReader with supplied separator, quote char and quote handling
87 | * behavior.
88 | *
89 | * @param reader
90 | * the reader to an underlying CSV source.
91 | * @param separator
92 | * the delimiter to use for separating entries
93 | * @param quotechar
94 | * the character to use for quoted elements
95 | * @param strictQuotes
96 | * sets if characters outside the quotes are ignored
97 | */
98 | public CSVReader(Reader reader, char separator, char quotechar, boolean strictQuotes) {
99 | this(reader, separator, quotechar, CSVParser.DEFAULT_ESCAPE_CHARACTER, DEFAULT_SKIP_LINES, strictQuotes);
100 | }
101 |
102 | /**
103 | * Constructs CSVReader with supplied separator and quote char.
104 | *
105 | * @param reader
106 | * the reader to an underlying CSV source.
107 | * @param separator
108 | * the delimiter to use for separating entries
109 | * @param quotechar
110 | * the character to use for quoted elements
111 | * @param escape
112 | * the character to use for escaping a separator or quote
113 | */
114 |
115 | public CSVReader(Reader reader, char separator,
116 | char quotechar, char escape) {
117 | this(reader, separator, quotechar, escape, DEFAULT_SKIP_LINES, CSVParser.DEFAULT_STRICT_QUOTES);
118 | }
119 |
120 | /**
121 | * Constructs CSVReader with supplied separator and quote char.
122 | *
123 | * @param reader
124 | * the reader to an underlying CSV source.
125 | * @param separator
126 | * the delimiter to use for separating entries
127 | * @param quotechar
128 | * the character to use for quoted elements
129 | * @param line
130 | * the line number to skip for start reading
131 | */
132 | public CSVReader(Reader reader, char separator, char quotechar, int line) {
133 | this(reader, separator, quotechar, CSVParser.DEFAULT_ESCAPE_CHARACTER, line, CSVParser.DEFAULT_STRICT_QUOTES);
134 | }
135 |
136 | /**
137 | * Constructs CSVReader with supplied separator and quote char.
138 | *
139 | * @param reader
140 | * the reader to an underlying CSV source.
141 | * @param separator
142 | * the delimiter to use for separating entries
143 | * @param quotechar
144 | * the character to use for quoted elements
145 | * @param escape
146 | * the character to use for escaping a separator or quote
147 | * @param line
148 | * the line number to skip for start reading
149 | */
150 | public CSVReader(Reader reader, char separator, char quotechar, char escape, int line) {
151 | this(reader, separator, quotechar, escape, line, CSVParser.DEFAULT_STRICT_QUOTES);
152 | }
153 |
154 | /**
155 | * Constructs CSVReader with supplied separator and quote char.
156 | *
157 | * @param reader
158 | * the reader to an underlying CSV source.
159 | * @param separator
160 | * the delimiter to use for separating entries
161 | * @param quotechar
162 | * the character to use for quoted elements
163 | * @param escape
164 | * the character to use for escaping a separator or quote
165 | * @param line
166 | * the line number to skip for start reading
167 | * @param strictQuotes
168 | * sets if characters outside the quotes are ignored
169 | */
170 | public CSVReader(Reader reader, char separator, char quotechar, char escape, int line, boolean strictQuotes) {
171 | this(reader, separator, quotechar, escape, line, strictQuotes, CSVParser.DEFAULT_IGNORE_LEADING_WHITESPACE);
172 | }
173 |
174 | /**
175 | * Constructs CSVReader with supplied separator and quote char.
176 | *
177 | * @param reader
178 | * the reader to an underlying CSV source.
179 | * @param separator
180 | * the delimiter to use for separating entries
181 | * @param quotechar
182 | * the character to use for quoted elements
183 | * @param escape
184 | * the character to use for escaping a separator or quote
185 | * @param line
186 | * the line number to skip for start reading
187 | * @param strictQuotes
188 | * sets if characters outside the quotes are ignored
189 | * @param ignoreLeadingWhiteSpace
190 | * it true, parser should ignore white space before a quote in a field
191 | */
192 | public CSVReader(Reader reader, char separator, char quotechar, char escape, int line, boolean strictQuotes, boolean ignoreLeadingWhiteSpace) {
193 | this.br = new BufferedReader(reader);
194 | this.parser = new CSVParser(separator, quotechar, escape, strictQuotes, ignoreLeadingWhiteSpace);
195 | this.skipLines = line;
196 | }
197 |
198 | /**
199 | * Reads the entire file into a List with each element being a String[] of
200 | * tokens.
201 | *
202 | * @return a List of String[], with each String[] representing a line of the
203 | * file.
204 | *
205 | * @throws IOException
206 | * if bad things happen during the read
207 | */
208 | public List readAll() throws IOException {
209 |
210 | List allElements = new ArrayList();
211 | while (hasNext) {
212 | String[] nextLineAsTokens = readNext();
213 | if (nextLineAsTokens != null)
214 | allElements.add(nextLineAsTokens);
215 | }
216 | return allElements;
217 |
218 | }
219 |
220 | /**
221 | * Reads the next line from the buffer and converts to a string array.
222 | *
223 | * @return a string array with each comma-separated element as a separate
224 | * entry.
225 | *
226 | * @throws IOException
227 | * if bad things happen during the read
228 | */
229 | public String[] readNext() throws IOException {
230 |
231 | String[] result = null;
232 | do {
233 | String nextLine = getNextLine();
234 | if (!hasNext) {
235 | return result; // should throw if still pending?
236 | }
237 | String[] r = parser.parseLineMulti(nextLine);
238 | if (r.length > 0) {
239 | if (result == null) {
240 | result = r;
241 | } else {
242 | String[] t = new String[result.length+r.length];
243 | System.arraycopy(result, 0, t, 0, result.length);
244 | System.arraycopy(r, 0, t, result.length, r.length);
245 | result = t;
246 | }
247 | }
248 | } while (parser.isPending());
249 | return result;
250 | }
251 |
252 | /**
253 | * Reads the next line from the file.
254 | *
255 | * @return the next line from the file without trailing newline
256 | * @throws IOException
257 | * if bad things happen during the read
258 | */
259 | private String getNextLine() throws IOException {
260 | if (!this.linesSkiped) {
261 | for (int i = 0; i < skipLines; i++) {
262 | br.readLine();
263 | }
264 | this.linesSkiped = true;
265 | }
266 | String nextLine = br.readLine();
267 | if (nextLine == null) {
268 | hasNext = false;
269 | }
270 | return hasNext ? nextLine : null;
271 | }
272 |
273 | /**
274 | * Closes the underlying reader.
275 | *
276 | * @throws IOException if the close fails
277 | */
278 | public void close() throws IOException{
279 | br.close();
280 | }
281 |
282 | }
283 |
--------------------------------------------------------------------------------
/src/main/java/org/neo4j/batchimport/Importer.java:
--------------------------------------------------------------------------------
1 | package org.neo4j.batchimport;
2 |
3 | import org.neo4j.batchimport.importer.ChunkerLineData;
4 | import org.neo4j.batchimport.importer.CsvLineData;
5 | import org.neo4j.batchimport.importer.RelType;
6 | import org.neo4j.batchimport.importer.Type;
7 | import org.neo4j.batchimport.index.MapDbCachingIndexProvider;
8 | import org.neo4j.batchimport.utils.Config;
9 | import org.neo4j.graphdb.DynamicLabel;
10 | import org.neo4j.graphdb.Label;
11 | import org.neo4j.graphdb.index.IndexManager;
12 | import org.neo4j.index.lucene.unsafe.batchinsert.LuceneBatchInserterIndexProvider;
13 | import org.neo4j.io.fs.FileUtils;
14 | import org.neo4j.unsafe.batchinsert.BatchInserter;
15 | import org.neo4j.unsafe.batchinsert.BatchInserters;
16 | import org.neo4j.unsafe.batchinsert.BatchInserterIndexProvider;
17 | import org.neo4j.unsafe.batchinsert.BatchInserterIndex;
18 |
19 | import java.io.*;
20 | import java.util.*;
21 | import java.util.zip.GZIPInputStream;
22 |
23 | import static org.neo4j.batchimport.Utils.join;
24 | import static org.neo4j.index.impl.lucene.legacy.LuceneIndexImplementation.EXACT_CONFIG;
25 | import static org.neo4j.index.impl.lucene.legacy.LuceneIndexImplementation.FULLTEXT_CONFIG;
26 |
27 | public class Importer {
28 | private static final Map SPATIAL_CONFIG = Collections.singletonMap(IndexManager.PROVIDER,"spatial");
29 | private static final Label[] NO_LABELS = new Label[0];
30 | public static final int BATCH = 10 * 1000 * 1000;
31 | private static Report report;
32 | private final Config config;
33 | private BatchInserter db;
34 | private BatchInserterIndexProvider indexProvider;
35 | Map indexes=new HashMap();
36 | private Label[] labelsArray = NO_LABELS;
37 |
38 | public Importer(File graphDb, final Config config) throws IOException {
39 | this.config = config;
40 | db = createBatchInserter(graphDb, config);
41 |
42 | final boolean luceneOnlyIndex = config.isCachedIndexDisabled();
43 | indexProvider = createIndexProvider(luceneOnlyIndex);
44 | Collection indexInfos = config.getIndexInfos();
45 | if (indexInfos!=null) {
46 | for (IndexInfo indexInfo : indexInfos) {
47 | BatchInserterIndex index = indexInfo.isNodeIndex() ? nodeIndexFor(indexInfo.indexName, indexInfo.indexType) : relationshipIndexFor(indexInfo.indexName, indexInfo.indexType);
48 | indexes.put(indexInfo.indexName, index);
49 | }
50 | }
51 |
52 | report = createReport();
53 | }
54 |
55 | protected StdOutReport createReport() {
56 | return new StdOutReport(BATCH, 100);
57 | }
58 |
59 | protected BatchInserterIndexProvider createIndexProvider(boolean luceneOnlyIndex) {
60 | return luceneOnlyIndex ? new LuceneBatchInserterIndexProvider(db) : new MapDbCachingIndexProvider(db);
61 | }
62 |
63 | protected BatchInserter createBatchInserter(File graphDb, Config config) throws IOException {
64 | return BatchInserters.inserter(new File(graphDb.getAbsolutePath()), config.getConfigData());
65 | }
66 |
67 | // todo multiple nodes and rels files
68 | // todo nodes and rels-files in config
69 | // todo graphdb in config
70 | public static void main(String... args) throws IOException {
71 | System.err.println("Usage: Importer data/dir nodes.csv relationships.csv [node_index node-index-name fulltext|exact nodes_index.csv rel_index rel-index-name fulltext|exact rels_index.csv ....]");
72 | System.err.println("Using: Importer "+join(args," "));
73 | System.err.println();
74 |
75 | final Config config = Config.convertArgumentsToConfig(args);
76 |
77 | File graphDb = new File(config.getGraphDbDirectory());
78 | if (graphDb.exists() && !config.keepDatabase()) {
79 | FileUtils.deleteRecursively(graphDb);
80 | }
81 |
82 | Importer importer = new Importer(graphDb, config);
83 | importer.doImport();
84 | }
85 |
86 | void finish() {
87 | indexProvider.shutdown();
88 | db.shutdown();
89 | report.finish();
90 | }
91 |
92 | void importNodes(Reader reader) throws IOException {
93 | final LineData data = createLineData(reader, 0);
94 | report.reset();
95 | boolean hasId = data.hasId();
96 | //重复的id不会再次建立节点 含泪删掉,因为速度太慢了
97 | // List allIds = new LinkedList();
98 | // allIds.clear();
99 | while (data.processLine(null)) {
100 | Map properties = data.getProperties();
101 | // if (properties.get("id") == null || properties.get("id") == "" || allIds.contains(properties.get("id"))){
102 | // continue;
103 | // }
104 | // allIds.add(properties.get("id"));
105 |
106 | String[] labels = data.getTypeLabels();
107 | long id;
108 | if (hasId) {
109 | id = data.getId();
110 | db.createNode(id, properties, labelsFor(labels));
111 | } else {
112 | id = db.createNode(properties, labelsFor(labels));
113 | }
114 | for (Map.Entry> entry : data.getIndexData().entrySet()) {
115 | final BatchInserterIndex index = indexFor(entry.getKey());
116 | if (index==null)
117 | throw new IllegalStateException("Index "+entry.getKey()+" not configured.");
118 | index.add(id, entry.getValue());
119 | }
120 | report.dots();
121 | if (report.getCount() % BATCH == 0) flushIndexes();
122 | }
123 | flushIndexes();
124 | report.finishImport("Nodes");
125 | }
126 |
127 | private Map trimDataValue(LineData data) {
128 | Map properties = data.getProperties();
129 | for (Map.Entry entry : properties.entrySet()){
130 | String key = entry.getKey();
131 | Object value = entry.getValue();
132 | if (value != null ){
133 | properties.put(key, value.toString().trim());
134 | }
135 | }
136 | return properties;
137 | }
138 |
139 | private Label[] labelsFor(String[] labels) {
140 | if (labels == null || labels.length == 0) return NO_LABELS;
141 | if (labels.length != labelsArray.length) labelsArray = new Label[labels.length];
142 | for (int i = labels.length - 1; i >= 0; i--) {
143 | if (labelsArray[i] == null || !labelsArray[i].name().equals(labels[i]))
144 | labelsArray[i] = DynamicLabel.label(labels[i]);
145 | }
146 | return labelsArray;
147 | }
148 |
149 | private long lookup(String index,String property,Object value) {
150 | Long id = null;
151 | try{
152 | id = indexFor(index).get(property, value).getSingle();
153 | }catch (Exception e){
154 | e.printStackTrace();
155 | id = null;
156 | }
157 |
158 | return id==null ? -1 : id;
159 | }
160 |
161 | private BatchInserterIndex indexFor(String index) {
162 | return indexes.get(index);
163 | }
164 |
165 | void importRelationships(Reader reader) throws IOException {
166 | final int offset = 3;
167 | final LineData data = createLineData(reader, offset);
168 | final RelType relType = new RelType();
169 | long skipped=0;
170 | report.reset();
171 |
172 | while (data.processLine(null)) {
173 | final Map properties = data.getProperties();
174 | final long start = id(data, 0);
175 | final long end = id(data, 1);
176 | if (start==-1 || end==-1) {
177 | skipped++;
178 | continue;
179 | }
180 | RelType type = null;
181 | try {
182 | type = relType.update(data.getRelationshipTypeLabel());
183 | }catch (Exception e){
184 | skipped++;
185 | continue;
186 | }
187 |
188 | final long id = db.createRelationship(start, end, type, properties);
189 | for (Map.Entry> entry : data.getIndexData().entrySet()) {
190 | indexFor(entry.getKey()).add(id, entry.getValue());
191 | }
192 | report.dots();
193 | }
194 | String msg = "Relationships";
195 | if (skipped > 0) msg += " skipped (" + skipped + ")";
196 | report.finishImport(msg);
197 | }
198 |
199 | private void flushIndexes() {
200 | for (BatchInserterIndex index : indexes.values()) {
201 | index.flush();
202 | }
203 | }
204 |
205 | private LineData createLineData(Reader reader, int offset) {
206 | final boolean useQuotes = config.quotesEnabled();
207 | if (useQuotes) return new CsvLineData(reader, config.getDelimChar(this),offset);
208 | return new ChunkerLineData(reader, config.getDelimChar(this), offset);
209 | }
210 |
211 | private long id(LineData data, int column) {
212 | final LineData.Header header = data.getHeader()[column];
213 | final Object value = data.getValue(column);
214 | if (header.indexName == null || header.type == Type.ID) {
215 | return id(value);
216 | }
217 | // System.out.println("indexName: " + header.indexName);
218 | // System.out.println("name: " + header.name);
219 | // System.out.println("value: " + value);
220 | return lookup(header.indexName, header.name, value);
221 | }
222 |
223 | void importIndex(String indexName, BatchInserterIndex index, Reader reader) throws IOException {
224 | final LineData data = createLineData(reader, 1);
225 | report.reset();
226 | while (data.processLine(null)) {
227 | final Map properties = data.getProperties();
228 | index.add(id(data.getValue(0)), properties);
229 | report.dots();
230 | }
231 |
232 | report.finishImport("Done inserting into " + indexName + " Index");
233 | }
234 |
235 | private BatchInserterIndex nodeIndexFor(String indexName, String indexType) {
236 | return indexProvider.nodeIndex(indexName, configFor(indexType));
237 | }
238 |
239 | private BatchInserterIndex relationshipIndexFor(String indexName, String indexType) {
240 | return indexProvider.relationshipIndex(indexName, configFor(indexType));
241 | }
242 |
243 | private Map configFor(String indexType) {
244 | if (indexType.equalsIgnoreCase("fulltext")) return FULLTEXT_CONFIG;
245 | if (indexType.equalsIgnoreCase("spatial")) return SPATIAL_CONFIG;
246 | return EXACT_CONFIG;
247 | }
248 |
249 | private long id(Object id) {
250 | return Long.parseLong(id.toString());
251 | }
252 |
253 | private void importIndex(IndexInfo indexInfo) throws IOException {
254 | File indexFile = new File(indexInfo.indexFileName);
255 | if (!indexFile.exists()) {
256 | System.err.println("Index file "+indexFile+" does not exist");
257 | return;
258 | }
259 | importIndex(indexInfo.indexName, indexes.get(indexInfo.indexName), createFileReader(indexFile));
260 | }
261 |
262 | private void doImport() throws IOException {
263 | try {
264 | for (File file : config.getNodesFiles()) {
265 | System.out.println("importing node file name : " + file.getName());
266 | importNodes(createFileReader(file));
267 | }
268 |
269 | for (File file : config.getRelsFiles()) {
270 | System.out.println("importing rel file name : " + file.getName());
271 | importRelationships(createFileReader(file));
272 | }
273 |
274 | for (IndexInfo indexInfo : config.getIndexInfos()) {
275 | if (indexInfo.shouldImportFile()) importIndex(indexInfo);
276 | }
277 | } finally {
278 | finish();
279 | }
280 | }
281 |
// 2 MB buffer (4096*512) shared by the plain and compressed reader paths.
final static int BUFFERED_READER_BUFFER = 4096*512;

/**
 * Opens a buffered character reader over an import file; names ending in
 * ".gz" or ".zip" are decompressed with GZIPInputStream.
 * NOTE(review): a real ".zip" archive is NOT gzip format and GZIPInputStream
 * will fail on it - confirm whether ".zip" inputs are actually gzip streams.
 * NOTE(review): both paths decode with the platform default charset; verify
 * UTF-8 input is handled correctly on non-UTF-8 platforms.
 */
private Reader createFileReader(File file) {
    try {
        final String fileName = file.getName();
        if (fileName.endsWith(".gz") || fileName.endsWith(".zip")) {
            return new InputStreamReader(new GZIPInputStream(new BufferedInputStream(new FileInputStream(file)),BUFFERED_READER_BUFFER));
        }
        final FileReader fileReader = new FileReader(file);
        return new BufferedReader(fileReader,BUFFERED_READER_BUFFER);
    } catch(Exception e) {
        throw new IllegalArgumentException("Error reading file "+file+" "+e.getMessage(),e);
    }
}
296 |
297 | }
298 |
--------------------------------------------------------------------------------
/src/main/java/org/neo4j/batchimport/IndexInfo.java:
--------------------------------------------------------------------------------
1 | package org.neo4j.batchimport;
2 |
3 | import java.io.File;
4 | import java.util.Map;
5 |
6 | /**
7 | * @author mh
8 | * @since 11.06.13
9 | */
/**
 * Describes one index to create/populate: element type (node_index or
 * relationship_index), index name, index type (exact/fulltext/spatial) and an
 * optional backing file to bulk-load from.
 */
public class IndexInfo {
    /**
     * Builds an IndexInfo from four consecutive positional arguments:
     * elementType indexName indexType indexFileName. (No validation, matching
     * the historical behaviour of this constructor.)
     */
    public IndexInfo(String[] args, int offset) {
        this.elementType = args[offset];
        this.indexName = args[offset+1];
        this.indexType = args[offset+2];
        this.indexFileName = args[offset+3];
    }

    public IndexInfo(String elementType, String indexName, String indexType, String indexFileName) {
        if (!(elementType.equals("node_index") || elementType.equals("relationship_index"))) throw new IllegalArgumentException("ElementType has to be node_index or relationship_index, but is "+elementType);
        // Fix: "spatial" is a supported index type (Importer.configFor maps it to
        // SPATIAL_CONFIG) but was rejected here.
        if (!(indexType.equals("exact") || indexType.equals("fulltext") || indexType.equals("spatial"))) throw new IllegalArgumentException("IndexType has to be exact, fulltext or spatial, but is "+indexType);
        this.elementType = elementType;
        this.indexName = indexName;
        this.indexType = indexType;
        this.indexFileName = indexFileName;
    }

    public final String elementType, indexName, indexType, indexFileName;

    /**
     * Parses a config entry of the form
     * batch_import.(node|relationship)_index.NAME = indexType[:fileName].
     * Returns null for keys that are not index definitions.
     */
    public static IndexInfo fromConfigEntry(Map.Entry<String, String> entry) {
        if (!entry.getKey().matches("^batch_import\\.(node|relationship)_index\\..+")) return null;
        final String[] keyParts = entry.getKey().split("\\.", 3);
        final String elementType = keyParts[1];
        final String indexName = keyParts[2];
        // Fix: split with limit 2 so file names that themselves contain ":"
        // (e.g. Windows drive letters) are preserved intact.
        final String[] valueParts = entry.getValue().split(":", 2);
        final String indexType = valueParts[0];
        final String indexFileName = valueParts.length > 1 ? valueParts[1] : null;
        return new IndexInfo(elementType, indexName, indexType, indexFileName);
    }

    public boolean isNodeIndex() {
        return elementType.equals("node_index");
    }

    /** Config key this definition round-trips to, e.g. batch_import.node_index.users. */
    public String getConfigKey() {
        return "batch_import."+elementType+"."+indexName;
    }

    /** Config value: "indexType" or "indexType:fileName" when a file is set. */
    public String getConfigValue() {
        if (indexFileName==null) return indexType;
        return indexType+":"+indexFileName;
    }

    /** Adds this definition to the given config map and returns the map. */
    public Map addToConfig(Map config) {
        config.put(getConfigKey(), getConfigValue());
        return config;
    }

    /** True when a backing file is configured and is a readable regular file. */
    public boolean shouldImportFile() {
        if (indexFileName == null) return false;
        final File file = new File(indexFileName);
        return file.exists() && file.isFile() && file.canRead();
    }
}
64 |
--------------------------------------------------------------------------------
/src/main/java/org/neo4j/batchimport/LineData.java:
--------------------------------------------------------------------------------
1 | package org.neo4j.batchimport;
2 |
3 | import org.neo4j.batchimport.importer.Type;
4 |
5 | import java.util.Map;
6 |
7 | public interface LineData {
8 |
9 | class Header {
10 | public Header(int column, String name, Type type, String indexName) {
11 | this.column = column;
12 | this.name = name;
13 | this.type = type;
14 | this.indexName = indexName;
15 | }
16 |
17 | public final int column;
18 | public final String name;
19 | public final Type type;
20 | public final String indexName; // todo index config in config
21 |
22 | @Override
23 | public String toString() {
24 | return column + ". " + name +
25 | (type!=null ? " type: " + type : "")+
26 | (indexName!=null? " index: " + indexName : "");
27 | }
28 | }
29 | boolean processLine(String line);
30 | Header[] getHeader();
31 | long getId();
32 | Map getProperties();
33 | Map> getIndexData();
34 | String[] getTypeLabels();
35 | String getRelationshipTypeLabel();
36 | Object getValue(int column);
37 | boolean hasId();
38 | }
39 |
--------------------------------------------------------------------------------
/src/main/java/org/neo4j/batchimport/Report.java:
--------------------------------------------------------------------------------
1 | package org.neo4j.batchimport;
2 |
3 | /**
4 | * @author mh
5 | * @since 21.08.12
6 | */
public interface Report {
    // Restarts the record counter and per-phase timers.
    void reset();

    // Prints/records the total import time; called once at the very end.
    void finish();

    // Counts one imported record; implementations may emit progress markers.
    void dots();

    // Summarizes the phase that just completed (nodes, relationships, index).
    void finishImport(String type);

    // Records counted since the last reset().
    long getCount();
}
18 |
--------------------------------------------------------------------------------
/src/main/java/org/neo4j/batchimport/StdOutReport.java:
--------------------------------------------------------------------------------
1 | package org.neo4j.batchimport;
2 |
3 | public class StdOutReport implements Report {
4 | private final long batch;
5 | private final long dots;
6 | private long count;
7 | private long total = System.currentTimeMillis(), time, batchTime;
8 |
9 | public StdOutReport(long batch, int dots) {
10 | this.batch = batch;
11 | this.dots = batch / dots;
12 | }
13 |
14 | @Override
15 | public void reset() {
16 | count = 0;
17 | batchTime = time = System.currentTimeMillis();
18 | }
19 |
20 | @Override
21 | public void finish() {
22 | System.out.println("\nTotal import time: "+ (System.currentTimeMillis() - total) / 1000 + " seconds ");
23 | }
24 |
25 | @Override
26 | public void dots() {
27 | if ((++count % dots) != 0) return;
28 | System.out.print(".");
29 | if ((count % batch) != 0) return;
30 | long now = System.currentTimeMillis();
31 | System.out.println(" "+ (now - batchTime) + " ms for "+batch);
32 | batchTime = now;
33 | }
34 |
35 | public long getCount() {
36 | return count;
37 | }
38 |
39 | @Override
40 | public void finishImport(String type) {
41 | System.out.println("\nImporting " + count + " " + type + " took " + (System.currentTimeMillis() - time) / 1000 + " seconds ");
42 | }
43 | }
44 |
--------------------------------------------------------------------------------
/src/main/java/org/neo4j/batchimport/Utils.java:
--------------------------------------------------------------------------------
1 | package org.neo4j.batchimport;
2 |
3 | import org.apache.log4j.Logger;
4 |
5 | /**
6 | * @author mh
7 | * @since 27.10.12
8 | */
public class Utils {
    // NOTE: the unused log4j Logger field was removed; nothing in this class logged.

    /**
     * Logical size of an id array padded with -1 at the end: index of the last
     * entry != -1, plus one. Returns 0 for null input.
     * Fix: the original fell through to `return ids.length` when EVERY entry
     * was -1, reporting an all-padding array as full; such an array is empty.
     */
    public static int size(int[] ids) {
        if (ids == null) return 0;
        for (int i = ids.length - 1; i >= 0; i--) {
            if (ids[i] != -1) return i + 1;
        }
        return 0;
    }

    /** Same as {@link #size(int[])} for long id arrays. */
    public static int size(long[] ids) {
        if (ids == null) return 0;
        for (int i = ids.length - 1; i >= 0; i--) {
            if (ids[i] != -1) return i + 1;
        }
        return 0;
    }

    /**
     * Joins the strings with the delimiter between them.
     * Fix: the original threw StringIndexOutOfBoundsException for an empty
     * array (substring with a negative end); now returns "".
     */
    static String join(String[] types, String delim) {
        if (types.length == 0) return "";
        StringBuilder sb = new StringBuilder();
        for (String type : types) {
            if (sb.length() > 0) sb.append(delim);
            sb.append(type);
        }
        return sb.toString();
    }
}
38 |
--------------------------------------------------------------------------------
/src/main/java/org/neo4j/batchimport/importer/AbstractLineData.java:
--------------------------------------------------------------------------------
1 | package org.neo4j.batchimport.importer;
2 |
3 | import org.neo4j.batchimport.LineData;
4 |
5 | import java.util.Arrays;
6 | import java.util.Collections;
7 | import java.util.HashMap;
8 | import java.util.Map;
9 |
10 | import static org.neo4j.helpers.collection.MapUtil.map;
11 |
12 | public abstract class AbstractLineData implements LineData {
    protected final int offset;        // number of leading non-property columns
    protected Object[] lineData;       // converted cell values of the current row
    protected int lineSize;            // number of header columns
    protected Header[] headers;
    int labelId = 2;                   // column holding the relationship type; NOTE(review): default 2 presumably assumes a start,end,type layout - confirm
    int explicitLabelId = -1;          // column explicitly typed LABEL, -1 if none
    private Object[] properties;       // flat [name, value, ...] scratch array
    protected int rows;                // rows read so far; doubles as implicit id
    private int propertyCount;         // filled slots in `properties` for the current row
    private boolean hasIndex=false;    // true when any column declares an index
    private boolean done;              // set once the input is exhausted
    private boolean hasId;             // true when the first column is typed ID

    public AbstractLineData(int offset) {
        this.offset = offset;
    }

    // Stores the parsed header row and sizes the per-row value buffer.
    protected void initHeaders(Header[] headers) {
        this.headers = headers;
        lineSize=headers.length;
        lineData = new Object[lineSize];
    }
    // Reads the raw header row as string fields.
    protected abstract String[] readRawRow();

    // Reads the next data row into lineData; false at end of input.
    protected abstract boolean readLine();
38 |
39 | protected Header[] createHeaders(String[] fields) {
40 | if (fields[0].indexOf(".csv") != -1){
41 | String firstLine = fields[0];
42 | firstLine = firstLine.substring(firstLine.length() - 50, firstLine.length()).trim();
43 | fields[0] = firstLine;
44 | }
45 |
46 | Header[] headers = new Header[fields.length];
47 | int i=0;
48 | for (String field : fields) {
49 | String[] parts=field.split(":");
50 | final String name = parts[0];
51 | final String indexName = parts.length > 2 ? parts[2] : null;
52 | Type type = Type.fromString(parts.length > 1 ? parts[1] : null);
53 | if (type==Type.LABEL) { // || name.toLowerCase().matches("^(type|types|label|labels)$")) {
54 | labelId=i;
55 | type=Type.LABEL;
56 | explicitLabelId = i;
57 | }
58 | headers[i]=new Header(i, name, type, indexName);
59 | i++;
60 | hasIndex |= indexName != null;
61 | }
62 | hasId = headers[0].type == Type.ID;
63 | return headers;
64 | }
65 |
66 | protected Object[] createMapData(int lineSize, int offset) {
67 | int dataSize = Math.max(0,lineSize - offset);
68 | properties = new Object[dataSize*2];
69 | for (int i = offset; i < dataSize; i++) {
70 | properties[(i - offset) * 2 ] = headers[i].name;
71 | }
72 | return properties;
73 | }
74 |
    // Advances to the next row; `line` is ignored (subclasses pull from their reader).
    // NOTE(review): generic type parameters below appear stripped in this copy
    // of the file (e.g. "Map>"); the code is kept byte-identical.
    @Override
    public boolean processLine(String line) {
        if (done) return false;
        return parse() > 0;
    }

    @Override
    public Header[] getHeader() {
        return headers;
    }

    // Explicit id from the ID column, otherwise the running row count.
    @Override
    public long getId() {
        if (hasId) return (Long)getValue(0);
        return rows;
    }

    @Override
    public Map getProperties() {
        return properties();
    }

    // Index entries of the current row, keyed by index name; empty when no
    // column declares an index. Null cells are skipped.
    @Override
    public Map> getIndexData() {
        if (!hasIndex) return Collections.EMPTY_MAP;
        Map> indexData = new HashMap>();
        for (int column = offset; column < headers.length; column++) {
            Header header = headers[column];
            if (header.indexName == null) continue;
            Object val = getValue(column);
            if (val == null) continue;

            if (!indexData.containsKey(header.indexName)) {
                indexData.put(header.indexName, new HashMap());
            }
            indexData.get(header.indexName).put(header.name,val);
        }
        return indexData;
    }

    // Labels of the current row, or null when no LABEL column was declared.
    // A single String becomes a one-element array.
    @Override
    public String[] getTypeLabels() {
        if (explicitLabelId==-1) return null;
        Object labels = getValue(explicitLabelId);
        return labels instanceof String ? new String[]{ labels.toString() } : (String[]) labels;
    }

    // Relationship type from the label column; for an array only the first entry is used.
    @Override
    public String getRelationshipTypeLabel() {
        Object labels = getValue(labelId);
        return labels instanceof String[] ? ((String[])labels)[0] : (String)labels;
    }

    // Already-converted value; may be null for empty cells.
    @Override
    public Object getValue(int column) {
        return lineData[column];
    }

    @Override
    public boolean hasId() {
        return hasId;
    }

    private Header getHeader(int column) {
        return headers[column];
    }
141 |
    // Advances one row: clears the previous row's values, reads the next line
    // (setting `done` at end of input) and returns the number of non-null cells.
    private int parse() {
        rows++;
        Arrays.fill(lineData,null);
        done = !readLine();
        return collectNonNullInData();
    }
148 |
149 | private int collectNonNullInData() {
150 | propertyCount=0;
151 | int notnull = 0;
152 | for (int i = 0; i < lineSize; i++) {
153 | if (lineData[i] == null) continue;
154 | notnull++;
155 | if (i updateMap(Object... header) {
165 | processLine(null);
166 |
167 | // todo deprecate
168 | if (header.length > 0) {
169 | System.arraycopy(lineData, 0, header, 0, header.length);
170 | }
171 |
172 | return properties();
173 | }
174 |
    // Builds the property map from the flat [name, value, ...] scratch array,
    // copying to a right-sized array when the current row filled fewer slots.
    private Map properties() {
        if (propertyCount == properties.length) {
            return map(properties);
        }
        Object[] newData=new Object[propertyCount];
        System.arraycopy(properties,0,newData,0, propertyCount);
        return map(newData);
    }

    // Number of name/value pairs collected for the current row.
    public int getColumnCount() {
        return this.propertyCount/2;
    }

    // Converts a raw cell to the column's declared type (STRING passes through)
    // and wraps any failure with row/column context for diagnostics.
    protected Object convert(int column, String value) {
        try {
            return headers[column].type == Type.STRING ? value : headers[column].type.convert(value);
        } catch(Exception e) {
            // todo potentially skip?
            throw new RuntimeException("Error converting value row "+rows+" column "+headers[column]+" value "+value+" error: "+e.getClass().getSimpleName()+": "+e.getMessage(),e);
        }
    }
196 | }
197 |
--------------------------------------------------------------------------------
/src/main/java/org/neo4j/batchimport/importer/ChunkerLineData.java:
--------------------------------------------------------------------------------
1 | package org.neo4j.batchimport.importer;
2 |
3 | import org.neo4j.batchimport.utils.Chunker;
4 |
5 | import java.io.IOException;
6 | import java.io.Reader;
7 | import java.util.*;
8 |
9 | public class ChunkerLineData extends AbstractLineData {
    // Tokenizer that yields one cell (or EOL/EOF marker) per call.
    private final Chunker chunker;

    // Reads the header row immediately and sizes the property scratch array.
    public ChunkerLineData(Reader reader, char delim, int offset) {
        super(offset);
        chunker = new Chunker(reader, delim);
        initHeaders(createHeaders(readRawRow()));
        createMapData(lineSize, offset);
    }
18 |
    // Collects one full line of tokens (used for the header row); empty cells
    // (Chunker.NO_VALUE) and the terminating EOL/EOF markers are dropped.
    protected String[] readRawRow() {
        String value;
        Collection result=new ArrayList();
        do {
            value = nextWord();
            if (Chunker.NO_VALUE != value && !isEndOfLineOrFile(value)) {
                result.add(value);
            }
        } while (!isEndOfLineOrFile(value));
        return result.toArray(new String[result.size()]);
    }

    // Wraps Chunker.nextWord(), converting the checked IOException into an
    // unchecked one (LineData.processLine has no throws clause).
    private String nextWord() {
        try {
            return chunker.nextWord();
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }
38 |
39 | protected boolean readLine() {
40 | String value = null;
41 | int i=0;
42 | do {
43 | value = nextWord();
44 | if (isEndOfLineOrFile(value)) break;
45 | if (i==lineSize) {
46 | do {
47 | value = nextWord();
48 | } while (!isEndOfLineOrFile(value)); // consume until EOL
49 | break;
50 | }
51 | lineData[i] = Chunker.NO_VALUE == value ? null : convert(i, value);
52 | i++;
53 | } while (!isEndOfLineOrFile(value));
54 | if (i 2 ? parts[2] : null;
42 | Type type = Type.fromString(parts.length > 1 ? parts[1] : null);
43 | if (type==Type.LABEL || name.toLowerCase().matches("^(type|types|label|labels)$")) {
44 | labelId=i;
45 | type=Type.LABEL;
46 | explicitLabelId=i;
47 | }
48 | headers[i]=new Header(i, name, type, indexName);
49 | hasIndex |= indexName != null;
50 | }
51 | hasId = headers[0].type == Type.ID;
52 | return headers;
53 | }
54 |
55 | private Object[] createMapData(int lineSize, int offset) {
56 | int dataSize = Math.max(0,lineSize - offset);
57 | properties = new Object[dataSize*2];
58 | for (int i = offset; i < dataSize; i++) {
59 | properties[(i - offset) * 2 ] = headers[i].name;
60 | }
61 | return properties;
62 | }
63 |
    // Parses the passed line eagerly; always reports success (no EOF tracking here).
    // NOTE(review): generic type parameters below appear stripped in this copy
    // of the file (e.g. "Map>"); the code is kept byte-identical.
    @Override
    public boolean processLine(String line) {
        this.propertyCount = parse(line);
        return true;
    }

    @Override
    public Header[] getHeader() {
        return headers;
    }

    // Explicit id from the ID column, otherwise the running row count.
    @Override
    public long getId() {
        if (hasId) return (Long)getValue(0);
        return rows;
    }

    @Override
    public boolean hasId() {
        return hasId;
    }

    @Override
    public Map getProperties() {
        return properties();
    }

    // Index entries of the current row, keyed by index name; empty when no
    // column declares an index.
    @Override
    public Map> getIndexData() {
        if (!hasIndex) return Collections.EMPTY_MAP;
        Map> indexData = new HashMap>();
        for (int column = 0; column < headers.length; column++) {
            Header header = headers[column];
            if (header.indexName == null) continue;

            if (!indexData.containsKey(header.indexName)) {
                indexData.put(header.indexName, new HashMap());
            }
            indexData.get(header.indexName).put(header.name,getValue(column));
        }
        return indexData;
    }

    // Labels of the current row, or null when no LABEL column was declared.
    @Override
    public String[] getTypeLabels() {
        if (explicitLabelId==-1) return null;
        Object labels = getValue(explicitLabelId);
        return labels instanceof String ? new String[]{ labels.toString() } : (String[]) labels;
    }

    // Relationship type from the label column; for an array only the first entry is used.
    @Override
    public String getRelationshipTypeLabel() {
        Object labels = getValue(labelId);
        return labels instanceof String[] ? ((String[])labels)[0] : (String)labels;
    }

    // Unlike AbstractLineData, this converts the raw cell on EVERY access.
    @Override
    public Object getValue(int column) {
        return getHeader(column).type.convert(lineData[column]);
    }

    private Header getHeader(int column) {
        return headers[column];
    }
128 |
    // Splits one delimited line into lineData using StringTokenizer with the
    // delimiter returned as its own token; missing, empty and all-blank cells
    // become null. NOTE(review): no quote handling - delimiters inside quoted
    // values will split the cell anyway.
    private int parse(String line) {
        rows++;
        final StringTokenizer st = new StringTokenizer(line, delim,true);
        for (int i = 0; i < lineSize; i++) {
            String value = st.hasMoreTokens() ? st.nextToken() : delim;
            if (value.equals(delim)) {
                lineData[i] = null;
            } else {
                // consume the delimiter that follows a real value (except after the last column)
                lineData[i] = value.trim().isEmpty() ? null : value;
                if (i< lineSize -1 && st.hasMoreTokens()) st.nextToken();
            }
        }
        return collectNonNullInData();
    }
143 |
    // Copies the non-null property cells (from `offset` on) into the flat
    // [name, value, ...] scratch array; returns the number of slots filled.
    private int collectNonNullInData() {
        int count = 0;
        for (int i = offset; i < lineSize; i++) {
            if (lineData[i] == null) continue;
            final Header header = getHeader(i);
            properties[count++]= header.name;
            properties[count++]= getValue(i);
        }
        return count;
    }
154 |
    // Parses `line` and returns its property map; optionally copies the first
    // raw cells into `header` (legacy calling convention).
    public Map updateMap(String line, Object... header) {
        processLine(line);

        // todo deprecate
        if (header.length > 0) {
            System.arraycopy(lineData, 0, header, 0, header.length);
        }

        return properties();
    }

    // Builds the property map, trimming the scratch array if partially filled.
    private Map properties() {
        if (propertyCount == properties.length) {
            return map(properties);
        }
        Object[] newData=new Object[propertyCount];
        System.arraycopy(properties,0,newData,0, propertyCount);
        return map(newData);
    }

    // Number of name/value pairs collected for the current row.
    public int getColumnCount() {
        return this.propertyCount/2;
    }
178 | }
179 |
--------------------------------------------------------------------------------
/src/main/java/org/neo4j/batchimport/importer/Type.java:
--------------------------------------------------------------------------------
1 | package org.neo4j.batchimport.importer;
2 |
3 | import org.neo4j.batchimport.utils.Config;
4 |
5 | public enum Type {
6 | ID {
7 | @Override
8 | public Object convert(String value) {
9 | return Long.parseLong(value);
10 | }
11 | public boolean isProperty() { return false; }
12 | },
13 | LABEL {
14 | @Override
15 | public Object convert(String value) {
16 | return value.trim().split("\\s*,\\s*");
17 | }
18 | public boolean isProperty() { return false; }
19 | },
20 | BOOLEAN {
21 | @Override
22 | public Object convert(String value) {
23 | return Boolean.valueOf(value);
24 | }
25 | },
26 | INT {
27 | @Override
28 | public Object convert(String value) {
29 | return Integer.valueOf(value);
30 | }
31 | },
32 | LONG {
33 | @Override
34 | public Object convert(String value) {
35 | return Long.valueOf(value);
36 | }
37 | },
38 | DOUBLE {
39 | @Override
40 | public Object convert(String value) {
41 | return Double.valueOf(value);
42 | }
43 | },
44 | FLOAT {
45 | @Override
46 | public Object convert(String value) {
47 | return Float.valueOf(value);
48 | }
49 | },
50 | BYTE {
51 | @Override
52 | public Object convert(String value) {
53 | return Byte.valueOf(value);
54 | }
55 | },
56 | SHORT {
57 | @Override
58 | public Object convert(String value) {
59 | return Short.valueOf(value);
60 | }
61 | },
62 | CHAR {
63 | @Override
64 | public Object convert(String value) {
65 | return value.charAt(0);
66 | }
67 | },
68 | STRING {
69 | @Override
70 | public Object convert(String value) {
71 | return value;
72 | }
73 | },
74 | BOOLEAN_ARRAY {
75 | @Override
76 | public Object convert(String value) {
77 | String[] strArray = value.split(Config.ARRAYS_SEPARATOR);
78 | boolean[] booleanArray = new boolean[strArray.length];
79 | for(int i = 0; i < strArray.length; i++) {
80 | booleanArray[i] = Boolean.valueOf(strArray[i]);
81 | }
82 | return booleanArray;
83 | }
84 | },
85 | INT_ARRAY {
86 | @Override
87 | public Object convert(String value) {
88 | String[] strArray = value.split(Config.ARRAYS_SEPARATOR);
89 | int[] intArray = new int[strArray.length];
90 | for(int i = 0; i < strArray.length; i++) {
91 | intArray[i] = Integer.parseInt(strArray[i]);
92 | }
93 | return intArray;
94 | }
95 | },
96 | LONG_ARRAY {
97 | @Override
98 | public Object convert(String value) {
99 | String[] strArray = value.split(Config.ARRAYS_SEPARATOR);
100 | long[] longArray = new long[strArray.length];
101 | for(int i = 0; i < strArray.length; i++) {
102 | longArray[i] = Long.parseLong(strArray[i]);
103 | }
104 | return longArray;
105 | }
106 | },
107 | DOUBLE_ARRAY {
108 | @Override
109 | public Object convert(String value) {
110 | String[] strArray = value.split(Config.ARRAYS_SEPARATOR);
111 | double[] doubleArray = new double[strArray.length];
112 | for(int i = 0; i < strArray.length; i++) {
113 | doubleArray[i] = Double.parseDouble(strArray[i]);
114 | }
115 | return doubleArray;
116 | }
117 | },
118 | FLOAT_ARRAY {
119 | @Override
120 | public Object convert(String value) {
121 | String[] strArray = value.split(Config.ARRAYS_SEPARATOR);
122 | float[] floatArray = new float[strArray.length];
123 | for(int i = 0; i < strArray.length; i++) {
124 | floatArray[i] = Float.parseFloat(strArray[i]);
125 | }
126 | return floatArray;
127 | }
128 | },
129 | BYTE_ARRAY {
130 | @Override
131 | public Object convert(String value) {
132 | String[] strArray = value.split(Config.ARRAYS_SEPARATOR);
133 | byte[] byteArray = new byte[strArray.length];
134 | for(int i = 0; i < strArray.length; i++) {
135 | byteArray[i] = Byte.parseByte(strArray[i]);
136 | }
137 | return byteArray;
138 | }
139 | },
140 | SHORT_ARRAY {
141 | @Override
142 | public Object convert(String value) {
143 | String[] strArray = value.split(Config.ARRAYS_SEPARATOR);
144 | short[] shortArray = new short[strArray.length];
145 | for(int i = 0; i < strArray.length; i++) {
146 | shortArray[i] = Short.parseShort(strArray[i]);
147 | }
148 | return shortArray;
149 | }
150 | },
151 | CHAR_ARRAY {
152 | @Override
153 | public Object convert(String value) {
154 | String[] strArray = value.split(Config.ARRAYS_SEPARATOR);
155 | char[] charArray = new char[strArray.length];
156 | for(int i = 0; i < strArray.length; i++) {
157 | charArray[i] = strArray[i].charAt(0);
158 | }
159 | return charArray;
160 | }
161 | },
162 | STRING_ARRAY {
163 | @Override
164 | public Object convert(String value) {
165 | String separator = Config.ARRAYS_SEPARATOR;
166 | return value.split(Config.ARRAYS_SEPARATOR);
167 | }
168 | };
169 |
170 | public static Type fromString(String typeString) {
171 | if (typeString==null || typeString.isEmpty()) return Type.STRING;
172 | try {
173 | return valueOf(typeString.toUpperCase());
174 | } catch (Exception e) {
175 | throw new IllegalArgumentException("Unknown Type "+typeString);
176 | }
177 | }
178 |
179 | public abstract Object convert(String value);
180 |
181 | public boolean isProperty() { return true; }
182 | }
183 |
--------------------------------------------------------------------------------
/src/main/java/org/neo4j/batchimport/index/LongIterableIndexHits.java:
--------------------------------------------------------------------------------
1 | package org.neo4j.batchimport.index;
2 |
3 | import org.neo4j.graphdb.ResourceIterator;
4 | import org.neo4j.graphdb.index.IndexHits;
5 | import org.neo4j.helpers.collection.Iterables;
6 |
7 | import java.util.Iterator;
8 |
9 | /**
10 | * @author mh
11 | * @since 11.06.13
12 | */
// Adapts an Iterable of entity ids (from the MapDB cache) to Neo4j's IndexHits.
// NOTE(review): generic parameters appear stripped in this copy (likely
// IndexHits<Long>, ResourceIterator<Long>); code kept byte-identical.
public class LongIterableIndexHits implements IndexHits, ResourceIterator {

    private final Iterable values;   // backing ids; re-iterable
    private Iterator iterator;       // current iteration state, reset by iterator()

    public LongIterableIndexHits(Iterable values) {
        this.values = values;
        // Calls the overridden iterator(), which also assigns the field:
        // this object serves as its own (single, shared) iterator.
        iterator = iterator();
    }

    @Override
    public int size() {
        // Counts by full traversal; O(n) per call.
        return (int) Iterables.count(values);
    }

    @Override
    public void close() {
        if (iterator instanceof ResourceIterator) {
            ((ResourceIterator)iterator).close();
        }
    }

    @Override
    public Long getSingle() {
        // null when empty; fails when more than one element is present.
        return Iterables.singleOrNull(values);
    }

    @Override
    public float currentScore() {
        // Cached lookups carry no relevance score.
        return 0;
    }

    // Restarts iteration and returns this object as the iterator.
    // NOTE(review): not safe for nested/concurrent iteration - all callers
    // share the single `iterator` field.
    @Override
    public ResourceIterator iterator() {
        iterator = values.iterator();
        return this;
    }


    @Override
    public boolean hasNext() {
        return iterator.hasNext();
    }

    @Override
    public Long next() {
        return iterator.next();
    }

    @Override
    public void remove() {
        iterator.remove();
    }
}
67 |
--------------------------------------------------------------------------------
/src/main/java/org/neo4j/batchimport/index/MapDbCachingIndexProvider.java:
--------------------------------------------------------------------------------
1 | package org.neo4j.batchimport.index;
2 |
3 | import org.mapdb.BTreeKeySerializer;
4 | import org.mapdb.Bind;
5 | import org.mapdb.DB;
6 | import org.mapdb.DBMaker;
7 | import org.mapdb.Fun;
8 | import org.neo4j.graphdb.index.IndexHits;
9 | import org.neo4j.index.lucene.unsafe.batchinsert.LuceneBatchInserterIndexProvider;
10 | import org.neo4j.unsafe.batchinsert.BatchInserter;
11 | import org.neo4j.unsafe.batchinsert.BatchInserterIndex;
12 | import org.neo4j.unsafe.batchinsert.BatchInserterIndexProvider;
13 |
14 | import java.util.HashMap;
15 | import java.util.Map;
16 | import java.util.NavigableSet;
17 |
// Index provider that writes through to Lucene but answers get() lookups from
// a throwaway MapDB cache (fast exact lookups during a batch import).
// NOTE(review): generic parameters appear stripped in this copy (e.g.
// "db.>createTreeSet", "Map>>"); code kept byte-identical.
public class MapDbCachingIndexProvider implements BatchInserterIndexProvider {
    LuceneBatchInserterIndexProvider luceneIndex;  // all writes forwarded here
    private DB db;                                 // temp-file MapDB cache

    public MapDbCachingIndexProvider(BatchInserter inserter) {
        this(new LuceneBatchInserterIndexProvider(inserter));
    }

    public MapDbCachingIndexProvider(LuceneBatchInserterIndexProvider luceneIndex) {
        this.luceneIndex = luceneIndex;
        // Durability is deliberately disabled - the cache is disposable.
        db = DBMaker.newTempFileDB().
                asyncFlushDelay(1000).
                cacheSize(1024 * 1024).
                closeOnJvmShutdown().
                deleteFilesAfterClose().
                syncOnCommitDisable().
                writeAheadLogDisable().
                make();
    }

    @Override
    public BatchInserterIndex nodeIndex(String indexName, Map config) {
        return new CachingBatchInserterIndex(db,indexName,luceneIndex.nodeIndex(indexName,config));
    }

    @Override
    public BatchInserterIndex relationshipIndex(String indexName, Map config) {
        return new CachingBatchInserterIndex(db,indexName,luceneIndex.relationshipIndex(indexName, config));
    }

    @Override
    public void shutdown() {
        luceneIndex.shutdown();
        db.close();
    }

    // Decorator: mirrors add() into MapDB tree-sets and serves get() from them.
    private static class CachingBatchInserterIndex implements BatchInserterIndex {
        // Per-property sorted sets of (value, entityId) tuples backing get().
        Map>> caches = new HashMap>>();
        private final DB db;
        private final String indexName;
        private final BatchInserterIndex index;  // wrapped Lucene index

        public CachingBatchInserterIndex(DB db, String indexName, BatchInserterIndex index) {
            this.db = db;
            this.indexName = indexName;
            this.index = index;
        }
        // Lazily creates the MapDB tree-set for one indexed property.
        private NavigableSet> getSet(String property) {
            NavigableSet> set = caches.get(property);
            if (set != null) return set;
            set=db.>createTreeSet(indexName+"."+property,32,false, BTreeKeySerializer.TUPLE2,null);
            caches.put(property,set);
            return set;
        }

        @Override
        public void add(long entityId, Map properties) {
            // Record every property both in the cache and in Lucene.
            for (Map.Entry entry : properties.entrySet()) {
                getSet(entry.getKey()).add(Fun.t2(entry.getValue(), entityId));
            }
            index.add(entityId,properties);
        }

        @Override
        public void updateOrAdd(long entityId, Map properties) {
            throw new UnsupportedOperationException();
        }

        // Served entirely from the MapDB cache, not from Lucene.
        @Override
        public IndexHits get(String key, Object value) {
            final Iterable values = Bind.findSecondaryKeys(getSet(key), value);
            return new LongIterableIndexHits(values);
        }

        @Override
        public IndexHits query(String key, Object queryOrQueryObject) {
            throw new UnsupportedOperationException();
        }

        @Override
        public IndexHits query(Object queryOrQueryObject) {
            throw new UnsupportedOperationException();
        }

        // Flushes only the Lucene side; the MapDB sets are already up to date.
        @Override
        public void flush() {
            index.flush();
        }

        @Override
        public void setCacheCapacity(String key, int size) {
            throw new UnsupportedOperationException();
        }

    }

}
115 |
--------------------------------------------------------------------------------
/src/main/java/org/neo4j/batchimport/utils/Chunker.java:
--------------------------------------------------------------------------------
1 | package org.neo4j.batchimport.utils;
2 |
3 | import java.io.IOException;
4 | import java.io.Reader;
5 |
6 | /**
7 | * @author mh
8 | * @since 13.11.12
9 | */
// Hand-rolled tokenizer: yields one delimited token per nextWord() call,
// with sentinel returns for empty cells, end of line and end of file.
public class Chunker {
    public static final String EOF = null;            // sentinel: end of input
    public static final String EOL = "\n".intern();   // sentinel: end of line
    public static final String NO_VALUE = "".intern();// sentinel: empty cell (callers compare by identity)
    public static final char EOL_CHAR2 = '\r';
    public static final char EOL_CHAR = '\n';
    // NOTE(review): (char)-1 is U+FFFF, a valid (if unusual) char - input that
    // genuinely contains U+FFFF would be misread as EOF.
    public static final char EOF_CHAR = (char)-1;
    public static final int PREV_EOL_CHAR = -2;       // state: previous token ended its line
    private static final int BUFSIZE = 32*1024;
    private final Reader reader;
    private final char delim;
    private final char[] buffer=new char[BUFSIZE];
    private int lastChar = PREV_EOL_CHAR;             // delimiter/EOL/EOF that terminated the last token
    private int pos=BUFSIZE;                          // read position; BUFSIZE forces the initial fill

    public Chunker(Reader reader, char delim) {
        this.reader = reader;
        this.delim = delim;
    }

    /**
     * @return the token, null for EOF, empty string for no value read (just delim) or "\n" for EOL
     * @throws IOException
     */
    public String nextWord() throws IOException {
        int count = 0;
        int ch;
        // Deliver EOF/EOL state carried over from the previous call first.
        if (lastChar == EOF_CHAR) return EOF;
        if (lastChar == EOL_CHAR) {
            lastChar = PREV_EOL_CHAR;
            return EOL;
        }

        // Initial fill, or the buffer was exhausted exactly at a token boundary.
        if (pos == BUFSIZE) {
            int available = reader.read(buffer);
            pos = 0;
            if (available == -1) {
                available = 0;
            }
            if (available < BUFSIZE) {
                buffer[available] = EOF_CHAR;  // mark the logical end of data in the buffer
            }
        }
        int start = pos;
        // Scan forward until delimiter, end of line or the EOF marker.
        while ((ch = buffer[pos++])!=delim && ch!= EOL_CHAR && ch!= EOF_CHAR) {
            count++;
            if (pos == BUFSIZE) {
                // Token straddles the buffer boundary: slide its prefix to the
                // front and refill the remainder of the buffer.
                System.arraycopy(buffer, start, buffer, 0, count);
                int available = reader.read(buffer, count, BUFSIZE - count);
                pos = count;
                start = 0;
                if (available == -1) {
                    available = 0;
                }
                if (available < BUFSIZE - count) {
                    buffer[available + count] = EOF_CHAR;
                }
            }
        }
        if (count == 0) {
            // Empty token: EOF immediately after an EOL means true end of input.
            if (lastChar==PREV_EOL_CHAR && ch== EOF_CHAR) { lastChar=EOF_CHAR;return EOF; }
            lastChar=ch;
            if (ch == EOF_CHAR) return NO_VALUE;
            if (ch == EOL_CHAR) return NO_VALUE;
            return NO_VALUE;
        }
        lastChar=ch;
        if (buffer[start + count-1]==EOL_CHAR2) count--;  // drop the '\r' of CRLF line endings
        return String.valueOf(buffer, start, count);
    }

}
82 |
--------------------------------------------------------------------------------
/src/main/java/org/neo4j/batchimport/utils/Config.java:
--------------------------------------------------------------------------------
1 | package org.neo4j.batchimport.utils;
2 |
3 | import org.neo4j.batchimport.Importer;
4 | import org.neo4j.batchimport.IndexInfo;
5 | import org.neo4j.helpers.collection.MapUtil;
6 |
7 | import java.io.File;
8 | import java.io.FileWriter;
9 | import java.util.ArrayList;
10 | import java.util.Collection;
11 | import java.util.HashMap;
12 | import java.util.Map;
13 | import java.util.Stack;
14 |
15 | public class Config {
16 | public static final String BATCH_IMPORT_RELS_FILES = "batch_import.rels_files";
17 | public static final String BATCH_IMPORT_GRAPH_DB = "batch_import.graph_db";
18 | public static final String BATCH_IMPORT_KEEP_DB = "batch_import.keep_db";
19 | public static final String CONFIG_FILE_NAME = "batch.properties";
20 | public static final String BATCH_IMPORT_NODES_FILES = "batch_import.nodes_files";
21 | public static final String BATCH_IMPORT_MAPDB_CACHE_DISABLE = "batch_import.mapdb_cache.disable";
22 | public static final String BATCH_IMPORT_CSV_QUOTES = "batch_import.csv.quotes";
23 | public static final String BATCH_IMPORT_CSV_DELIM = "batch_import.csv.delim";
24 | public static final String ARRAY_SEPARATOR_CONFIG = "batch_array_separator";
25 | public static String ARRAYS_SEPARATOR = ",";
26 |
27 | private final Map configData;
28 |
29 | public Config(Map configData) {
30 | this.configData = configData;
31 | if (this.configData.containsKey(ARRAY_SEPARATOR_CONFIG)){
32 | Config.ARRAYS_SEPARATOR = configData.get(ARRAY_SEPARATOR_CONFIG);
33 | }
34 | }
35 |
36 | public static Config convertArgumentsToConfig(String[] args) {
37 | final Stack argumentList = toStack(args);
38 |
39 | String configFileName = findConfigFileName(argumentList);
40 |
41 | final Map config = config(configFileName);
42 |
43 | convertParamsToConfig(argumentList, config);
44 |
45 | validateConfig(config);
46 | return new Config(config);
47 | }
48 |
49 | private static Stack toStack(String[] args) {
50 | final Stack argumentList = new Stack();
51 | for (int i = args.length - 1; i >= 0; i--) {
52 | argumentList.push(args[i]);
53 | }
54 | return argumentList;
55 | }
56 |
57 | private static String findConfigFileName(Stack argumentList) {
58 | String firstParam = argumentList.isEmpty() ? "" : argumentList.peek();
59 | String configFileName = CONFIG_FILE_NAME;
60 | if (firstParam.endsWith(".properties")) {
61 | configFileName = firstParam;
62 | popOrNull(argumentList);
63 | }
64 | return configFileName;
65 | }
66 |
67 | // todo more checks ?
68 | private static void validateConfig(Map config) {
69 | if (!config.containsKey(BATCH_IMPORT_GRAPH_DB)) throw new IllegalArgumentException("Missing parameter for graphdb directory");
70 | }
71 |
72 | private static Collection convertParamsToConfig(Stack args, Map config) {
73 | addConfigParamIfArgument(args, config, BATCH_IMPORT_GRAPH_DB);
74 | addConfigParamIfArgument(args, config, BATCH_IMPORT_NODES_FILES);
75 | addConfigParamIfArgument(args, config, BATCH_IMPORT_RELS_FILES);
76 | Collection indexes = createIndexInfos(args);
77 | for (IndexInfo index : indexes) {
78 | index.addToConfig(config);
79 | }
80 | return indexes;
81 | }
82 |
83 | private static void addConfigParamIfArgument(Stack args, Map config, String param) {
84 | final String arg = popOrNull(args);
85 | if (arg==null || arg.trim().isEmpty()) return;
86 | if (!config.containsKey(param)) config.put(param, arg);
87 | }
88 |
89 | private static String popOrNull(Stack args) {
90 | if (args.isEmpty()) return null;
91 | return args.pop();
92 | }
93 |
94 | private static Collection createIndexInfos(Stack args) {
95 | Collection indexes=new ArrayList();
96 | while (!args.isEmpty()) {
97 | indexes.add(new IndexInfo(popOrNull(args), popOrNull(args), popOrNull(args), popOrNull(args)));
98 | }
99 | return indexes;
100 | }
101 |
102 | public static Map config(String fileName) {
103 | Map config = new HashMap();
104 | try {
105 | if (new File(fileName).exists()) {
106 | System.out.println("Using Existing Configuration File");
107 | } else {
108 | System.out.println("Writing Configuration File to batch.properties");
109 | FileWriter fw = new FileWriter(fileName);
110 | fw.append("use_memory_mapped_buffers=true\n"
111 | + "neostore.nodestore.db.mapped_memory=100M\n"
112 | + "neostore.relationshipstore.db.mapped_memory=500M\n"
113 | + "neostore.propertystore.db.mapped_memory=1G\n"
114 | + "neostore.propertystore.db.strings.mapped_memory=200M\n"
115 | + "neostore.propertystore.db.arrays.mapped_memory=0M\n"
116 | + "neostore.propertystore.db.index.keys.mapped_memory=15M\n"
117 | + "neostore.propertystore.db.index.mapped_memory=15M");
118 | fw.close();
119 | }
120 |
121 | config = MapUtil.load(new File(fileName));
122 |
123 | } catch (Exception e) {
124 | System.out.println(e.getMessage());
125 | }
126 | return config;
127 | }
128 |
129 | public static Collection extractIndexInfos(Map config) {
130 | Collection result=new ArrayList();
131 | for (Map.Entry entry : config.entrySet()) {
132 | final IndexInfo info = IndexInfo.fromConfigEntry(entry);
133 | if (info!=null) result.add(info);
134 | }
135 | return result;
136 | }
137 |
138 | public static boolean configOptionEnabled(Config config, String option) {
139 | return "true".equalsIgnoreCase(config.get(option));
140 | }
141 | public static boolean configOptionDisabled(Config config, String option) {
142 | return "false".equalsIgnoreCase(config.get(option));
143 | }
144 |
145 | public static Collection toFiles(String commaSeparatedFileList) {
146 | Collection files=new ArrayList();
147 | if (commaSeparatedFileList==null || commaSeparatedFileList.isEmpty()) return files;
148 | for (String part : commaSeparatedFileList.split(",")) {
149 | final File file = new File(part);
150 | if (file.exists() && file.canRead() && file.isFile()) files.add(file);
151 | else System.err.println("File "+file+" does not exist, can not be read or is not a file.");
152 | }
153 | return files;
154 | }
155 |
156 | public static String NODE_INDEX(String indexName) {
157 | return "batch_import.node_index." + indexName;
158 | }
159 | public static String RELATIONSHIP_INDEX(String indexName) {
160 | return "batch_import.relationship_index." + indexName;
161 | }
162 |
163 | public boolean isCachedIndexDisabled() {
164 | return configOptionEnabled(this, BATCH_IMPORT_MAPDB_CACHE_DISABLE);
165 | }
166 |
167 | public Collection getIndexInfos() {
168 | return extractIndexInfos(configData);
169 | }
170 |
171 | public Collection getRelsFiles() {
172 | return toFiles(get(BATCH_IMPORT_RELS_FILES));
173 | }
174 |
175 | public Collection getNodesFiles() {
176 | return toFiles(get(BATCH_IMPORT_NODES_FILES));
177 | }
178 |
179 | public char getDelimChar(Importer importer) {
180 | final String delim = get(BATCH_IMPORT_CSV_DELIM);
181 | if (delim==null || delim.isEmpty()) return '\t';
182 | return delim.trim().charAt(0);
183 | }
184 |
185 | public boolean quotesEnabled() {
186 | return !configOptionDisabled(this, BATCH_IMPORT_CSV_QUOTES);
187 | }
188 |
189 | public String getGraphDbDirectory() {
190 | return get(BATCH_IMPORT_GRAPH_DB);
191 | }
192 |
193 | String get(String option) {
194 | return configData.get(option);
195 | }
196 |
197 | public boolean keepDatabase() {
198 | return configOptionEnabled(this, BATCH_IMPORT_KEEP_DB);
199 | }
200 |
201 | public Map getConfigData() {
202 | return configData;
203 | }
204 | }
205 |
--------------------------------------------------------------------------------
/src/main/java/org/neo4j/batchimport/utils/FileIterator.java:
--------------------------------------------------------------------------------
1 | package org.neo4j.batchimport.utils;
2 |
3 | import org.mapdb.Serializer;
4 |
5 | import java.io.*;
6 | import java.util.Comparator;
7 | import java.util.Iterator;
8 |
9 | class FileIterator implements Iterator {
10 | public static final char DELIM = '\t';
11 | private final BufferedReader reader;
12 | private final String file;
13 | Line line;
14 | long lineNo;
15 |
16 | public FileIterator(String file) throws FileNotFoundException {
17 | reader = new BufferedReader(new FileReader(file), RelationshipSorter.BUFFER);
18 | this.file = file;
19 | line = readLine();
20 | }
21 |
22 | public void close() throws IOException {
23 | reader.close();
24 | }
25 |
26 | private Line readLine() {
27 | try {
28 | String line = reader.readLine();
29 | if (line==null) return null;
30 | return Line.from(lineNo++, line);
31 | } catch (IOException e) {
32 | throw new RuntimeException("Error reading file "+ file,e);
33 | }
34 | }
35 |
36 |
37 | public boolean hasNext() {
38 | return line != null;
39 | }
40 |
41 | public Line next() {
42 | Line result=line;
43 | line = readLine();
44 | return result;
45 | }
46 |
47 | public void remove() {
48 | }
49 |
50 | public static class LineSerializer implements Serializer {
51 | @Override
52 | public void serialize(DataOutput dataOutput, Line line) throws IOException {
53 | dataOutput.writeLong(line.lineNo);
54 | // dataOutput.writeLong(line.min);
55 | // dataOutput.writeLong(line.max);
56 | dataOutput.writeUTF(line.line);
57 | }
58 |
59 | @Override
60 | public Line deserialize(DataInput dataInput, int i) throws IOException {
61 | // return Line.from(dataInput.readLong(),dataInput.readLong(),dataInput.readLong(),dataInput.readUTF());
62 | return Line.from(dataInput.readLong(),dataInput.readUTF());
63 | }
64 | }
65 | public static class Line {
66 | String line;
67 | long lineNo, min, max;
68 | public static Line from(long lineNo, long min, long max, String line) {
69 | Line result = new Line();
70 | result.lineNo = lineNo;
71 | result.min = min;
72 | result.max = max;
73 | result.line = line;
74 | return result;
75 | }
76 | public static Line from(long lineNo, String line) {
77 | if (lineNo > 0) {
78 | int idx = line.indexOf(DELIM);
79 | long start = Long.parseLong(line.substring(0, idx++));
80 | long end = Long.parseLong(line.substring(idx, line.indexOf(DELIM, idx)));
81 | return from(lineNo,Math.min(start,end), Math.max(start, end),line);
82 | } else {
83 | return from(lineNo,-1, -1,line);
84 | }
85 | }
86 | }
87 |
88 | public static class RelStartEndComparator implements Comparator {
89 |
90 | public int compare(Line line1, Line line2) {
91 | int result = Long.compare(line1.min, line2.min);
92 | if (result == 0) {
93 | result = Long.compare(line1.max, line2.max);
94 | if (result == 0) return Long.compare(line1.lineNo, line2.lineNo);
95 | }
96 | return result;
97 | }
98 | }
99 | }
100 |
--------------------------------------------------------------------------------
/src/main/java/org/neo4j/batchimport/utils/Params.java:
--------------------------------------------------------------------------------
1 | package org.neo4j.batchimport.utils;
2 |
3 | import java.io.File;
4 |
/**
 * Binds a space-separated list of parameter names to a matching array of
 * positional argument values and offers typed, case-insensitive lookup.
 *
 * @author mh
 * @since 02.11.12
 */
public class Params {

    private final String names;
    private final String[] args;
    private final String[] params; // parameter names, split from the names string

    /**
     * @param names space-separated parameter names, e.g. "input output count"
     * @param args  positional values, expected to match names one-to-one
     */
    public Params(String names, String... args) {
        this.names = names;
        this.params = names.split(" +");
        this.args = args;
    }

    /** True when the number of supplied values does not match the declared names. */
    public boolean invalid() {
        return args.length != params.length;
    }

    public int length() {
        return params.length;
    }

    @Override
    public String toString() {
        return names;
    }

    public File file(String name) {
        return new File(string(name));
    }

    /**
     * Returns the value bound to the given name (case-insensitive).
     *
     * @throws IllegalArgumentException when the name is not declared
     */
    public String string(String name) {
        for (int i = 0; i < params.length; i++) {
            if (params[i].equalsIgnoreCase(name)) {
                return args[i];
            }
        }
        throw new IllegalArgumentException("Invalid name " + name + " only know " + names);
    }

    public long longValue(String name) {
        return Long.parseLong(string(name));
    }

    public int intValue(String name) {
        return Integer.parseInt(string(name));
    }
}
55 |
--------------------------------------------------------------------------------
/src/main/java/org/neo4j/batchimport/utils/RelationshipSorter.java:
--------------------------------------------------------------------------------
1 | package org.neo4j.batchimport.utils;
2 |
3 | import org.mapdb.Pump;
4 | import org.neo4j.helpers.collection.IteratorWrapper;
5 |
6 | import java.io.BufferedWriter;
7 | import java.io.FileWriter;
8 | import java.io.IOException;
9 | import java.util.Iterator;
10 |
11 | /**
12 | * @author Michael Hunger @since 04.11.13
13 | */
14 | public class RelationshipSorter {
15 |
16 | public static final int BUFFER = 1024 * 1024;
17 |
18 | public static void main(String... args) throws IOException {
19 | System.err.println("Usage mvn exec:java -Dexec.mainClass=org.neo4j.batchimport.utils.RelationshipSorter -Dexec.args='rels.csv rels_sorted.csv'");
20 | final String file = args[0];
21 | String file2 = args[1];
22 | long time = System.currentTimeMillis();
23 | FileIterator reader0 = new FileIterator(file);
24 | Iterator reader = wrapStatistics(reader0);
25 | FileIterator.Line header = reader.next();
26 | Iterator result = Pump.sort(reader, 1_000_000, new FileIterator.RelStartEndComparator(), new FileIterator.LineSerializer());
27 | BufferedWriter writer = new BufferedWriter(new FileWriter(file2), BUFFER);
28 | writer.write(header.line);
29 | writer.write("\n");
30 | long count = 0;
31 | while (result.hasNext()) {
32 | writer.write(result.next().line);
33 | writer.write('\n');
34 | count++;
35 | }
36 | writer.close();
37 | reader0.close();
38 | System.out.println("sorting " + count + " lines took " + (System.currentTimeMillis()-time)/1000+" seconds");
39 | }
40 |
41 | private static Iterator wrapStatistics(final FileIterator reader0) {
42 | return new IteratorWrapper(reader0) {
43 | long time = System.currentTimeMillis();
44 | @Override
45 | protected FileIterator.Line underlyingObjectToObject(FileIterator.Line line) {
46 | if (line.lineNo % 10000 == 0) {
47 | System.out.print(".");
48 | if (line.lineNo % 1000000 == 0) {
49 | long now = System.currentTimeMillis();
50 | System.out.println(" "+line.lineNo+ " " +(now - time)+" ms");
51 | time = now;
52 | }
53 | }
54 |
55 | return line;
56 | }
57 | };
58 | }
59 |
60 | }
61 |
--------------------------------------------------------------------------------
/src/main/java/org/neo4j/batchimport/utils/RelationshipSorter2.java:
--------------------------------------------------------------------------------
1 | package org.neo4j.batchimport.utils;
2 |
3 | import org.neo4j.helpers.collection.ArrayIterator;
4 | import org.neo4j.helpers.collection.IteratorWrapper;
5 |
6 | import java.io.BufferedWriter;
7 | import java.io.FileWriter;
8 | import java.io.IOException;
9 | import java.util.Arrays;
10 | import java.util.Iterator;
11 |
12 | /**
13 | * @author Michael Hunger @since 04.11.13
14 | */
15 | public class RelationshipSorter2 {
16 |
17 | public static final char DELIM = '\t';
18 | public static final int BUFFER = 1024 * 1024;
19 | public static final FileIterator.RelStartEndComparator COMPARATOR = new FileIterator.RelStartEndComparator();
20 | public static final int ARRAY_BUFFER = 10_000_000;
21 |
22 | public static void main(String... args) throws IOException {
23 | System.err.println("Usage mvn exec:java -Dexec.mainClass=org.neo4j.batchimport.utils.RelationshipSorter2 -Dexec.args='rels.csv rels_sorted.csv'");
24 | final String file = args[0];
25 | String file2 = args[1];
26 | long time = System.currentTimeMillis();
27 | FileIterator reader0 = new FileIterator(file);
28 | Iterator reader = wrapStatistics(reader0);
29 | FileIterator.Line header = reader.next();
30 | FileIterator.Line[] lines = new FileIterator.Line[ARRAY_BUFFER];
31 | int read = readArray(reader, lines);
32 | Arrays.sort(lines, COMPARATOR);
33 | long count = writeFile(file2, lines, read);
34 | // Iterator result = new ArrayIterator<>(lines);
35 | // sort array
36 | // long count = writeFile(file2, header, result);
37 | reader0.close();
38 | System.out.println("sorting " + count + " lines took " + (System.currentTimeMillis()-time)/1000+" seconds");
39 | }
40 |
41 | private static long writeFile(String file, FileIterator.Line header, Iterator lines) throws IOException {
42 | BufferedWriter writer = new BufferedWriter(new FileWriter(file), BUFFER);
43 | if (header!=null) {
44 | writer.write(header.line); writer.write("\n");
45 | }
46 | long count = 0;
47 | while (lines.hasNext()) {
48 | writer.write(lines.next().line); writer.write('\n');
49 | count++;
50 | }
51 | writer.close();
52 | return count;
53 | }
54 |
55 | private static long writeFile(String file, FileIterator.Line[] lines, int count) throws IOException {
56 | BufferedWriter writer = new BufferedWriter(new FileWriter(file), BUFFER);
57 | for (int i = 0; i < count; i++) {
58 | writer.write(lines[i].line); writer.write('\n');
59 | }
60 | writer.close();
61 | return count;
62 | }
63 |
64 | private static int readArray(Iterator reader, FileIterator.Line[] lines) {
65 | int i=0;
66 | int length = lines.length;
67 | while (i < length && reader.hasNext()) {
68 | lines[i++] = reader.next();
69 | }
70 | return i;
71 | }
72 |
73 | private static Iterator wrapStatistics(final FileIterator reader0) {
74 | return new IteratorWrapper(reader0) {
75 | long time = System.currentTimeMillis();
76 | @Override
77 | protected FileIterator.Line underlyingObjectToObject(FileIterator.Line line) {
78 | if (line.lineNo % 10000 == 0) {
79 | System.out.print(".");
80 | if (line.lineNo % 1000000 == 0) {
81 | long now = System.currentTimeMillis();
82 | System.out.println(" "+line.lineNo+ " " +(now - time)+" ms");
83 | time = now;
84 | }
85 | }
86 |
87 | return line;
88 | }
89 | };
90 | }
91 |
92 | }
93 |
--------------------------------------------------------------------------------
/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------
2 | #
3 | # Logging Configuration
4 | #
5 | # ------------------------------------------------------------------------
6 | #
7 | log4j.rootLogger=INFO, Console
8 |
9 | ########################################################################
10 | #
11 | # Logfile definitions
12 | #
13 | ########################################################################
14 | #Console Log
15 | log4j.appender.Console=org.apache.log4j.ConsoleAppender
16 | log4j.appender.Console.Threshold=DEBUG
17 | log4j.appender.Console.layout=org.apache.log4j.PatternLayout
18 | log4j.appender.Console.layout.ConversionPattern=%-5p %C{1} - %m\n
19 | log4j.appender.Console.Target=System.err
20 |
21 | #LOGTXT Log
22 | log4j.appender.LOGTXT=org.apache.log4j.FileAppender
23 | log4j.appender.LOGTXT.File=log.txt
24 | log4j.appender.LOGTXT.Append=false
25 | log4j.appender.LOGTXT.Threshold=DEBUG
26 | log4j.appender.LOGTXT.layout=org.apache.log4j.PatternLayout
27 | log4j.appender.LOGTXT.layout.ConversionPattern=%-5p %C{1} - %m\n
28 |
29 |
--------------------------------------------------------------------------------
/src/test/java/DataTest.java:
--------------------------------------------------------------------------------
1 | import org.junit.Assert;
2 | import org.junit.Test;
3 | import org.neo4j.batchimport.importer.RowData;
4 |
5 | import java.util.Arrays;
6 | import java.util.Map;
7 |
8 | import static org.junit.Assert.assertArrayEquals;
9 | import static org.junit.Assert.assertEquals;
10 |
11 | public class DataTest {
12 | @Test
13 | public void testConvertType() throws Exception {
14 | RowData data = new RowData("a:int\tb:float\tc:float", "\t", 0);
15 | data.processLine("100\t100.0\t1E+10");
16 | Map row = data.getProperties();
17 | assertEquals(100, row.get("a"));
18 | assertEquals(true,row.get("b") instanceof Float);
19 | assertEquals(100.0F, row.get("b"));
20 | assertEquals(true,row.get("b") instanceof Float);
21 | assertEquals(100.0F, row.get("b"));
22 | assertEquals(true,row.get("c") instanceof Float);
23 | assertEquals(1E+10F, row.get("c"));
24 | }
25 |
26 | @Test
27 | public void testRelationship() throws Exception {
28 | RowData data = new RowData("start\tend\ttype\tproperty", "\t", 3);
29 | data.processLine("1\t2\tTYPE\tPROPERTY");
30 | Map row = data.getProperties();
31 | assertEquals("1", data.getValue(0));
32 | assertEquals("2", data.getValue(1));
33 | assertEquals("TYPE", data.getTypeLabels()[0]);
34 | assertEquals("PROPERTY", row.get("property"));
35 | }
36 |
37 | @Test
38 | public void testRelationshipWithNoProperty() throws Exception {
39 | RowData data = new RowData("start\tend\ttype", "\t", 3);
40 | data.processLine("1\t2\tTYPE");
41 | assertEquals("1", data.getValue(0));
42 | assertEquals("2", data.getValue(1));
43 | assertEquals("TYPE", data.getTypeLabels()[0]);
44 | }
45 |
46 | @Test
47 | public void testNodeLabels() throws Exception {
48 | RowData data = new RowData("labels", "\t", 3);
49 | data.processLine("TYPE1,TYPE2");
50 | assertEquals("TYPE1", data.getTypeLabels()[0]);
51 | assertEquals("TYPE2", data.getTypeLabels()[1]);
52 | }
53 | @Test
54 | public void testNodeLabelsWithLabelType() throws Exception {
55 | RowData data = new RowData("foo:label", "\t", 3);
56 | data.processLine("TYPE1,TYPE2");
57 | assertEquals("TYPE1", data.getTypeLabels()[0]);
58 | assertEquals("TYPE2", data.getTypeLabels()[1]);
59 | }
60 | @Test
61 | public void testArrayType() throws Exception {
62 | RowData data = new RowData("a:int\tb:float\tc:string_array", "\t", 0);
63 | data.processLine("100\t100.0\tbagels,coffee,tea");
64 | Map row = data.getProperties();
65 | assertEquals(100, row.get("a"));
66 | assertEquals(true,row.get("b") instanceof Float);
67 | assertEquals(100.0F, row.get("b"));
68 | assertEquals(true,row.get("b") instanceof Float);
69 | assertEquals(100.0F, row.get("b"));
70 | assertEquals(true,row.get("c") instanceof String[]);
71 | assertArrayEquals(new String[]{"bagels", "coffee", "tea"}, (String[]) row.get("c"));
72 | }
73 |
74 | @Test
75 | public void testBooleanArrayType() throws Exception {
76 | RowData data = new RowData("a:int\tb:float\tc:boolean_array", "\t", 0);
77 | data.processLine("100\t100.0\ttrue,false,true");
78 | Map row = data.getProperties();
79 | assertEquals(100, row.get("a"));
80 | assertEquals(true,row.get("b") instanceof Float);
81 | assertEquals(100.0F, row.get("b"));
82 | assertEquals(true,row.get("b") instanceof Float);
83 | assertEquals(100.0F, row.get("b"));
84 | assertEquals(true,row.get("c") instanceof boolean[]);
85 | Assert.assertTrue(Arrays.equals(new boolean[]{true, false, true}, (boolean[]) row.get("c")));
86 | }
87 | @Test
88 | public void testIntArrayType() throws Exception {
89 | RowData data = new RowData("a:int\tb:float\tc:int_array", "\t", 0);
90 | data.processLine("100\t100.0\t1,2,3");
91 | Map row = data.getProperties();
92 | assertEquals(100, row.get("a"));
93 | assertEquals(true,row.get("b") instanceof Float);
94 | assertEquals(100.0F, row.get("b"));
95 | assertEquals(true,row.get("b") instanceof Float);
96 | assertEquals(100.0F, row.get("b"));
97 | assertEquals(true,row.get("c") instanceof int[]);
98 | assertArrayEquals(new int[] {1,2,3}, (int[])row.get("c"));
99 | }
100 | }
101 |
--------------------------------------------------------------------------------
/src/test/java/org/neo4j/batchimport/ImporterIntegrationTest.java:
--------------------------------------------------------------------------------
1 | package org.neo4j.batchimport;
2 |
3 | import org.junit.Test;
4 | import org.neo4j.consistency.ConsistencyCheckTool;
5 | import org.neo4j.graphdb.GraphDatabaseService;
6 | import org.neo4j.graphdb.Node;
7 | import org.neo4j.graphdb.Transaction;
8 | import org.neo4j.graphdb.factory.GraphDatabaseFactory;
9 | import org.neo4j.io.fs.FileUtils;
10 |
11 | import java.io.File;
12 | import java.io.FileWriter;
13 |
14 | import static org.junit.Assert.assertTrue;
15 |
16 | /**
17 | * @author Michael Hunger @since 05.11.13
18 | */
19 | public class ImporterIntegrationTest {
20 |
21 | public static final String DB_DIRECTORY = "target/index-reuse.db";
22 |
23 | @Test
24 | public void testMain() throws Exception {
25 | FileUtils.deleteRecursively(new File(DB_DIRECTORY));
26 | TestDataGenerator.main("1000","10","A,B,C","sorted");
27 | Importer.main(DB_DIRECTORY,"nodes.csv","rels.csv");
28 | ConsistencyCheckTool.main(new String[]{DB_DIRECTORY});
29 | }
30 |
31 | @Test
32 | public void testImportHashes() throws Exception {
33 | FileUtils.deleteRecursively(new File(DB_DIRECTORY));
34 | FileWriter writer = new FileWriter("target/hashes.csv");
35 | writer.write("a\n000000F8BE951D6DE6480F4AFDFB670C553E47C0\r\n0000021449360C1A398ED9A18800B2B13AA098A4\r\n00000DABDE4C555FC82F7D534835247B94873C2C\r\n00001BE4128DB41729365A41D3AC1D019E5ED8A6\r\n");
36 | writer.close();
37 | Importer.main(DB_DIRECTORY,"target/hashes.csv");
38 | ConsistencyCheckTool.main(new String[]{DB_DIRECTORY});
39 | GraphDatabaseService db = new GraphDatabaseFactory().newEmbeddedDatabase(new File(DB_DIRECTORY));
40 | try (Transaction tx = db.beginTx()) {
41 | for (Node node : db.getAllNodes()) {
42 | Object value = node.getProperty("a", null);
43 | System.out.println("value = " + value);
44 | assertTrue(value != null);
45 | }
46 | tx.success();
47 | }
48 | db.shutdown();
49 | }
50 | }
51 |
--------------------------------------------------------------------------------
/src/test/java/org/neo4j/batchimport/ImporterTest.java:
--------------------------------------------------------------------------------
1 | package org.neo4j.batchimport;
2 |
3 | import org.junit.Before;
4 | import org.junit.Test;
5 | import org.mockito.Matchers;
6 | import org.neo4j.batchimport.index.LongIterableIndexHits;
7 | import org.neo4j.batchimport.utils.Config;
8 | import org.neo4j.graphdb.DynamicLabel;
9 | import org.neo4j.graphdb.Label;
10 | import org.neo4j.graphdb.RelationshipType;
11 | import org.neo4j.index.lucene.unsafe.batchinsert.LuceneBatchInserterIndexProvider;
12 | import org.neo4j.unsafe.batchinsert.BatchInserter;
13 | import org.neo4j.unsafe.batchinsert.BatchInserterIndex;
14 | import org.neo4j.unsafe.batchinsert.BatchInserterIndexProvider;
15 |
16 | import java.io.File;
17 | import java.io.StringReader;
18 | import java.util.Arrays;
19 | import java.util.Map;
20 |
21 | import static java.util.Arrays.*;
22 | import org.junit.Assert;
23 | import org.mockito.ArgumentCaptor;
24 | import static org.mockito.Matchers.*;
25 | import static org.mockito.Mockito.*;
26 | import static org.neo4j.helpers.collection.MapUtil.map;
27 |
28 | public class ImporterTest {
29 |
    private BatchInserter inserter;                    // mocked batch inserter the Importer writes to
    private LuceneBatchInserterIndexProvider provider; // mocked index provider
    private Importer importer;                         // subject under test, wired to the mocks
    private BatchInserterIndex index;                  // mocked "index-a" node index
34 |
    @Before
    public void setUp() throws Exception {
        // Replace the Neo4j batch-insertion machinery with mocks so the tests
        // only observe the calls the Importer makes.
        inserter = mock(BatchInserter.class);
        provider = mock(LuceneBatchInserterIndexProvider.class);
        index = mock(BatchInserterIndex.class);
        when(provider.nodeIndex(eq("index-a"),anyMap())).thenReturn(index);

        final Map configData = Config.config("batch.properties");
        new IndexInfo("node_index", "index-a", "exact", null).addToConfig(configData);
        // Anonymous subclass overrides the factory methods to inject the mocks.
        importer = new Importer(File.createTempFile("test", "db"), new Config(configData)) {
            @Override
            protected BatchInserter createBatchInserter(File graphDb, Config config) {
                return inserter;
            }

            @Override
            protected BatchInserterIndexProvider createIndexProvider(boolean luceneOnlyIndex) {
                return provider;
            }
        };
    }
56 |
    @Test
    public void testImportSimpleNode() throws Exception {
        // Header "a" plus one row -> exactly one node with property a=foo.
        importer.importNodes(new StringReader("a\nfoo"));
        importer.finish();
        verify(inserter, times(1)).createNode(eq(map("a", "foo")));
    }
63 |
    @Test
    public void testImportHashes() throws Exception {
        // Four hash rows -> four distinct createNode calls, values kept verbatim.
        importer.importNodes(new StringReader("a\n000000F8BE951D6DE6480F4AFDFB670C553E47C0\n0000021449360C1A398ED9A18800B2B13AA098A4\n00000DABDE4C555FC82F7D534835247B94873C2C\n00001BE4128DB41729365A41D3AC1D019E5ED8A6\n"));
        importer.finish();
        verify(inserter, times(1)).createNode(eq(map("a", "000000F8BE951D6DE6480F4AFDFB670C553E47C0")));
        verify(inserter, times(1)).createNode(eq(map("a", "0000021449360C1A398ED9A18800B2B13AA098A4")));
        verify(inserter, times(1)).createNode(eq(map("a", "00000DABDE4C555FC82F7D534835247B94873C2C")));
        verify(inserter, times(1)).createNode(eq(map("a", "00001BE4128DB41729365A41D3AC1D019E5ED8A6")));
    }
73 |
    @Test
    public void testImportSimpleNodeWithId() throws Exception {
        // An :id column makes the importer create the node with an explicit id.
        importer.importNodes(new StringReader("i:id\ta\n123\tfoo"));
        importer.finish();
        verify(inserter, times(1)).createNode(eq(123L),eq(map("a", "foo")));
    }
80 |
    @Test
    public void testImportNodeWithNoLabel() throws Exception {
        // An empty :label cell must not produce any label argument.
        importer.importNodes(new StringReader("a\t:label\nfoo\t"));
        importer.finish();
        verify(inserter, times(1)).createNode(eq(map("a", "foo")));
    }
    @Test
    public void testImportNodeWithLabel() throws Exception {
        // A :label column value is passed as a node label.
        importer.importNodes(new StringReader("a\t:label\nfoo\tbar"));
        importer.finish();
        verify(inserter, times(1)).createNode(eq(map("a", "foo")),eq(DynamicLabel.label("bar")));
    }
93 |
    @Test
    public void testImportNodeWithTwoLabels() throws Exception {
        // Comma-separated :label values become multiple label arguments.
        importer.importNodes(new StringReader("a\t:label\nfoo\tbar,bor"));
        importer.finish();
        verify(inserter, times(1)).createNode(eq(map("a", "foo")),eq(DynamicLabel.label("bar")),eq(DynamicLabel.label("bor")));
    }
100 |
    @Test
    public void testImportSimpleNodeWithNewlineAtEnd() throws Exception {
        // A trailing newline must not create an extra (empty) node.
        importer.importNodes(new StringReader("a\nfoo\n"));
        importer.finish();
        verify(inserter, times(1)).createNode(eq(map("a", "foo")));
    }
    @Test
    public void testImportSimpleNodeWithUmlauts() throws Exception {
        // Non-ASCII characters in header and value survive the import unchanged.
        importer.importNodes(new StringReader("ö\näáß"));
        importer.finish();
        verify(inserter, times(1)).createNode(eq(map("ö", "äáß")));
    }
    @Test
    public void testImportNodeWithMultipleProps() throws Exception {
        // Two header columns -> both properties on the same node.
        importer.importNodes(new StringReader("a\tb\nfoo\tbar"));
        importer.finish();
        verify(inserter, times(1)).createNode(eq(map("a", "foo","b","bar")));
    }
    @Test
    public void testImportNodeWithIndex() throws Exception {
        // A "name:type:index" header adds only that column to the named index.
        importer.importNodes(new StringReader("a:string:index-a\tb\nfoo\tbar"));
        importer.finish();
        verify(inserter, times(1)).createNode(eq(map("a", "foo", "b", "bar")));
        verify(index, times(1)).add(eq(0L), eq(map("a", "foo")));
    }
126 |
    @Test
    public void testImportRelWithIndexLookup() throws Exception {
        // The start node id is resolved via an index lookup instead of being numeric.
        when(index.get("a","foo")).thenReturn(new LongIterableIndexHits(asList(42L)));
        importer.importRelationships(new StringReader("a:string:index-a\tb\tTYPE\nfoo\t123\tFOOBAR"));
        importer.finish();
        verify(index, times(1)).get(eq("a"), eq("foo"));
        verify(inserter, times(1)).createRelationship(eq(42L), eq(123L), Matchers.any(RelationshipType.class),eq(map()));
    }
135 |
    @Test
    public void testImportRelationshipsWithNonIndexedNodes() throws Exception {
        // "b" misses the index -> relationships touching it must be skipped, not created with -1.
        when(index.get("node","a")).thenReturn(new LongIterableIndexHits(asList(1L)));
        when(index.get("node","b")).thenReturn(new LongIterableIndexHits(Arrays.asList()));
        importer.importRelationships(new StringReader("node:string:index-a\tnode:string:index-a\ttype\na\ta\tTYPE\na\tb\tTYPE\nb\ta\tTYPE"));
        importer.finish();
        verify(inserter, times(1)).createRelationship(eq(1L), eq(1L), argThat(new RelationshipMatcher("TYPE")),eq(map()));
        verify(inserter, never()).createRelationship(eq(1L), eq(-1L), argThat(new RelationshipMatcher("TYPE")),eq(map()));
        verify(inserter, never()).createRelationship(eq(-1L), eq(1L), argThat(new RelationshipMatcher("TYPE")),eq(map()));
    }
146 |
    @Test
    public void testImportNodeWithIndividualTypes() throws Exception {
        // Per-column type declarations convert values to int and float.
        importer.importNodes(new StringReader("a:int\tb:float\tc:float\n10\t10.0\t1E+10"));
        importer.finish();
        verify(inserter, times(1)).createNode(eq(map("a", 10,"b",10.0F,"c",1E+10F)));
    }
153 |
154 | @Test
155 | public void testImportNodeWithArrayTypes() throws Exception {
156 | importer.importNodes(new StringReader("a:STRING_ARRAY\tb:float\tc:float\n10,11,12\t10.0\t1E+10"));
157 | importer.finish();
158 | String[] expectedArray = {"10","11","12"};
159 | ArgumentCaptor