├── .gitignore
├── LICENSE
├── README.md
├── load.cql
└── query.cql

/.gitignore:
--------------------------------------------------------------------------------
*.DS_Store
*.csv
*.Rhistory
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2014 Nicole

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# consumer_complaints

This code accompanies the webinar [Using LOAD CSV in the Real World](http://watch.neo4j.org/video/112447027).

## Setup

* Download `Consumer_Complaints.csv` [here](http://catalog.data.gov/dataset/consumer-complaint-database). Note that your .csv file might have more rows than mine did; the data appears to be updated regularly.

* The arrows graph-diagramming tool is [here](http://www.apcjones.com/arrows/#).

* [Sublime Text 2](http://www.sublimetext.com/2) is the editor used in the webinar, along with the [Cypher plugin](https://github.com/kollhof/sublime-cypher).

## Import

- Change line 1 of `load.cql` to point to the location of your `Consumer_Complaints.csv` file.
    - OS X and Unix: `file:///path/to/Consumer_Complaints.csv`
    - Windows: `file:C:/path/to/Consumer_Complaints.csv`

- Send `load.cql` to the `neo4j-shell`: `./bin/neo4j-shell -file load.cql`

**Important**
Note that this import was done on a 16GB machine. If you have less RAM, and particularly if you are on Windows, please see these blog posts:

* http://jexp.de/blog/2014/06/load-csv-into-neo4j-quickly-and-successfully/
* http://www.markhneedham.com/blog/2014/10/23/neo4j-cypher-avoiding-the-eager/
* http://jexp.de/blog/2014/10/load-cvs-with-success/
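As a rough way to check ahead of time whether a given `LOAD CSV` statement will trigger the `Eager` operator discussed in those posts, you can prefix the statement with `EXPLAIN` and look for `Eager` in the reported plan. This is only a sketch: it assumes a Neo4j version where `EXPLAIN` is available as a Cypher keyword, and the file path below is a placeholder.

```
EXPLAIN
LOAD CSV WITH HEADERS
FROM 'file:///path/to/Consumer_Complaints.csv' AS line
MERGE (company:Company { name: UPPER(line.Company) });
```

`EXPLAIN` only plans the statement without touching the data, so it is safe to run regardless of how large the CSV is.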
Also note that in the original webinar, I split `Consumer_Complaints.csv` into separate files to deal with rows containing empty strings. Since then, I've modified the script so that it uses only `Consumer_Complaints.csv`, with the empty strings filtered out for the sub-issues and sub-products:

```
FROM {FILEPATH} AS line
WITH line WHERE line.`Sub-issue` <> ''
```

```
FROM {FILEPATH} AS line
WITH line WHERE line.`Sub-product` <> ''
```

## Query

Run all the example queries:

```
./bin/neo4j-shell -file query.cql
```
--------------------------------------------------------------------------------
/load.cql:
--------------------------------------------------------------------------------
export FILEPATH=file:///Users/nicolewhite/Consumer_Complaints.csv

// Complaints, companies, responses.

// Uniqueness constraints.
CREATE CONSTRAINT ON (c:Complaint) ASSERT c.id IS UNIQUE;
CREATE CONSTRAINT ON (c:Company) ASSERT c.name IS UNIQUE;
CREATE CONSTRAINT ON (r:Response) ASSERT r.name IS UNIQUE;

// Load.
USING PERIODIC COMMIT
LOAD CSV WITH HEADERS
FROM {FILEPATH} AS line
WITH DISTINCT line, SPLIT(line.`Date received`, '/') AS date
WHERE line.`Company response to consumer` IS NOT NULL AND
      line.Company IS NOT NULL

CREATE (complaint:Complaint { id: TOINT(line.`Complaint ID`) })
SET complaint.year = TOINT(date[2]),
    complaint.month = TOINT(date[0]),
    complaint.day = TOINT(date[1])

MERGE (company:Company { name: UPPER(line.Company) })
MERGE (response:Response { name: UPPER(line.`Company response to consumer`) })

CREATE (complaint)-[:AGAINST]->(company)
CREATE (response)-[r:TO]->(complaint)

SET r.timely = CASE line.`Timely response?` WHEN 'Yes' THEN true ELSE false END,
    r.disputed = CASE line.`Consumer disputed?` WHEN 'Yes' THEN true ELSE false END
;

// Products, issues.

// Uniqueness constraints.
CREATE CONSTRAINT ON (p:Product) ASSERT p.name IS UNIQUE;
CREATE CONSTRAINT ON (i:Issue) ASSERT i.name IS UNIQUE;

// Load.
USING PERIODIC COMMIT
LOAD CSV WITH HEADERS
FROM {FILEPATH} AS line
WITH line
WHERE line.Product IS NOT NULL AND
      line.Issue IS NOT NULL

MATCH (complaint:Complaint { id: TOINT(line.`Complaint ID`) })

MERGE (product:Product { name: UPPER(line.Product) })
MERGE (issue:Issue { name: UPPER(line.Issue) })

CREATE (complaint)-[:ABOUT]->(product)
CREATE (complaint)-[:WITH]->(issue)
;

// Sub issues, sub products.

// Uniqueness constraints.
CREATE CONSTRAINT ON (s:SubProduct) ASSERT s.name IS UNIQUE;
CREATE CONSTRAINT ON (s:SubIssue) ASSERT s.name IS UNIQUE;

// Load.
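// The Sub-issue and Sub-product columns are blank for many rows, so the two
// statements below filter out empty strings in addition to NULLs, and each
// sub-category is linked to its parent Issue or Product through the complaint
// that references both.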
USING PERIODIC COMMIT
LOAD CSV WITH HEADERS
FROM {FILEPATH} AS line
WITH line
WHERE line.`Sub-issue` <> '' AND
      line.`Sub-issue` IS NOT NULL

MATCH (complaint:Complaint { id: TOINT(line.`Complaint ID`) })
MATCH (complaint)-[:WITH]->(issue:Issue)

MERGE (subIssue:SubIssue { name: UPPER(line.`Sub-issue`) })
MERGE (subIssue)-[:IN_CATEGORY]->(issue)
CREATE (complaint)-[:WITH]->(subIssue)
;

USING PERIODIC COMMIT
LOAD CSV WITH HEADERS
FROM {FILEPATH} AS line
WITH line
WHERE line.`Sub-product` <> '' AND
      line.`Sub-product` IS NOT NULL

MATCH (complaint:Complaint { id: TOINT(line.`Complaint ID`) })
MATCH (complaint)-[:ABOUT]->(product:Product)

MERGE (subProduct:SubProduct { name: UPPER(line.`Sub-product`) })
MERGE (subProduct)-[:IN_CATEGORY]->(product)
CREATE (complaint)-[:ABOUT]->(subProduct)
;
--------------------------------------------------------------------------------
/query.cql:
--------------------------------------------------------------------------------
// Top types of responses that are disputed.
MATCH (r:Response)-[:TO {disputed:true}]->(:Complaint)
RETURN r.name AS response, COUNT(*) AS count
ORDER BY count DESC;

// Companies with the most disputed responses.
MATCH (:Response)-[:TO {disputed:true}]->(complaint:Complaint)
MATCH (complaint)-[:AGAINST]->(company:Company)
RETURN company.name AS company, COUNT(*) AS count
ORDER BY count DESC
LIMIT 10;

// All issues.
MATCH (i:Issue)
RETURN i.name AS issue
ORDER BY issue;

// All sub-issues within the 'communication tactics' issue.
MATCH (i:Issue {name:'COMMUNICATION TACTICS'})
MATCH (sub:SubIssue)-[:IN_CATEGORY]->(i)
RETURN sub.name AS subissue
ORDER BY subissue;

// Top products and sub-products associated with the obscene / abusive language sub-issue.
MATCH (subIssue:SubIssue {name:'USED OBSCENE/PROFANE/ABUSIVE LANGUAGE'})
MATCH (complaint:Complaint)-[:WITH]->(subIssue)
MATCH (complaint)-[:ABOUT]->(p:Product)
OPTIONAL MATCH (complaint)-[:ABOUT]->(sub:SubProduct)
RETURN p.name AS product, sub.name AS subproduct, COUNT(*) AS count
ORDER BY count DESC;

// Top companies associated with the obscene / abusive language sub-issue.
MATCH (subIssue:SubIssue {name:'USED OBSCENE/PROFANE/ABUSIVE LANGUAGE'})
MATCH (complaint:Complaint)-[:WITH]->(subIssue)
MATCH (complaint)-[:AGAINST]->(company:Company)
RETURN company.name AS company, COUNT(*) AS count
ORDER BY count DESC
LIMIT 10;

// Top product, issue combinations with disputed responses at Wells Fargo.
MATCH (wf:Company {name:'WELLS FARGO'})
MATCH (complaint:Complaint)-[:AGAINST]->(wf)
MATCH (:Response)-[:TO {disputed:true}]->(complaint)
MATCH (complaint)-[:ABOUT]->(p:Product)
MATCH (complaint)-[:WITH]->(i:Issue)
RETURN p.name AS product, i.name AS issue, COUNT(*) AS count
ORDER BY count DESC;

// Sub-products that belong to multiple product categories.
MATCH (sub:SubProduct)-[:IN_CATEGORY]->(p:Product)
WITH sub, COLLECT(p) AS products
WHERE LENGTH(products) > 1
RETURN sub, products;

// Sub-issues that belong to multiple issue categories.
MATCH (sub:SubIssue)-[:IN_CATEGORY]->(i:Issue)
WITH sub, COLLECT(i) AS issues
WHERE LENGTH(issues) > 1
RETURN sub, issues;
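// Complaint volume by year (an added example, not part of the original query set;
// it relies on the year property set on Complaint nodes during the import in load.cql).
MATCH (c:Complaint)
RETURN c.year AS year, COUNT(*) AS complaints
ORDER BY year;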
--------------------------------------------------------------------------------