├── readme.md
├── .gitignore
├── data
    ├── example-data.json
    ├── example-data-solr.json
    └── example-data-solr-for-faceting.json
└── scripts
    ├── convert_data2solrjson.py
    └── convert_data2solrjson_for_faceting.py


/readme.md:
--------------------------------------------------------------------------------
 1 | These files accompany my blog post on nested document handling capabilities of Solr 5.3.1 and 5.5.0:
 2 | https://medium.com/@alisazhila/solr-s-nesting-on-solr-s-capabilities-to-handle-deeply-nested-document-structures-50eeaaa4347a#.90xb5dqo8
 3 | 
 4 | ### Script usage:
 5 | 
 6 | ```bash
 7 | $ python ./scripts/convert_data2solrjson.py -i ./data/example-data.json -o ./data/example-data-solr.json
 8 | 
 9 | $ python ./scripts/convert_data2solrjson_for_faceting.py -i ./data/example-data.json -o ./data/example-data-solr-for-faceting.json
10 | ```
11 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 | 
 6 | # C extensions
 7 | *.so
 8 | 
 9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | 
27 | # PyInstaller
28 | #  Usually these files are written by a python script from a template
29 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 | 
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 | 
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 | 
48 | # Translations
49 | *.mo
50 | *.pot
51 | 
52 | # Django stuff:
53 | *.log
54 | 
55 | # Sphinx documentation
56 | docs/_build/
57 | 
58 | # PyBuilder
59 | target/
60 | 
61 | # IPython Notebook
62 | .ipynb_checkpoints
63 | 


--------------------------------------------------------------------------------
/data/example-data.json:
--------------------------------------------------------------------------------
  1 | {
  2 |     "blog-posts":[
  3 |         {
  4 |             "date":"2015-04-10T9:00:00Z",
  5 |             "author":"Alice",
  6 |             "title":{
  7 |                 "text":"My Post #1: About Solr and Other Search Engines",
  8 |                 "keywords":[
  9 |                     {
 10 |                         "text":"Solr",
 11 |                         "type":"search engine"
 12 |                     },
 13 |                     {
 14 |                         "text":"Search Engine",
 15 |                         "type":"entity"
 16 |                     }
 17 |                 ]
 18 |             },
 19 |             "body":{
 20 |                 "text":"Here I write how useful Solr is...",
 21 |                 "keywords":[
 22 |                     {
 23 |                         "text":"Solr",
 24 |                         "type":"search engine"
 25 |                     }
 26 |                 ]
 27 |             },
 28 |             "comments":[
 29 |                 {
 30 |                     "date":"2015-04-10T11:30:00Z",
 31 |                     "author":"Bob",
 32 |                     "text":"Great post about Solr",
 33 |                     "keywords":[
 34 |                         {
 35 |                             "text":"Solr",
 36 |                             "type":"search engine"
 37 |                         }
 38 |                     ],
 39 |                     "sentiment":"positive",
 40 |                     "replies":[
 41 |                         {
 42 |                             "date":"2015-04-10T12:00:00Z",
 43 |                             "author":"Dave",
 44 |                             "text":"Yeah, I like Solr too",
 45 |                             "keywords":[
 46 |                                 {
 47 |                                     "text":"Solr",
 48 |                                     "type":"search engine"
 49 |                                 }
 50 |                             ],
 51 |                             "sentiment":"positive"
 52 |                         },
 53 |                         {
 54 |                             "date":"2015-04-12T05:00:00Z",
 55 |                             "author":"Sri",
 56 |                             "text":"I disagree, I prefer Elasticsearch",
 57 |                             "keywords":[
 58 |                                 {
 59 |                                     "text":"Elasticsearch",
 60 |                                     "type":"search engine"
 61 |                                 }
 62 |                             ],
 63 |                             "sentiment":"negative"
 64 |                         }
 65 |                     ]
 66 |                 }
 67 |             ]
 68 |         },
 69 |         {
 70 |             "date":"2015-11-10T9:00:00Z",
 71 |             "author":"Aadit",
 72 |             "title":{
 73 |                 "text":"About useful features of Solr",
 74 |                 "keywords":[
 75 |                     {
 76 |                         "text":"Solr",
 77 |                         "type":"search engine"
 78 |                     },
 79 |                     {
 80 |                         "text":"feature",
 81 |                         "type":"entity"
 82 |                     }
 83 |                 ]
 84 |             },
 85 |             "body":{
 86 |                 "text":"Here I also write how useful Solr is...",
 87 |                 "keywords":[
 88 |                     {
 89 |                         "text":"Solr",
 90 |                         "type":"search engine"
 91 |                     }
 92 |                 ]
 93 |             },
 94 |             "comments":[
 95 |                 {
 96 |                     "date":"2016-04-10T11:30:00Z",
 97 |                     "author":"Bob",
 98 |                     "text":"You forgot that useful Solr's feature!",
 99 |                     "keywords":[
100 |                         {
101 |                             "text":"Solr",
102 |                             "type":"search engine"
103 |                         },
104 |                         {
105 |                             "text":"feature",
106 |                             "type":"entity"
107 |                         }
108 |                     ],
109 |                     "sentiment":"negative",
110 |                     "replies":[
111 |                         {
112 |                             "date":"2016-04-10T12:00:00Z",
113 |                             "author":"Dave",
114 |                             "text":"But it only appeared in Solr 5.5, after the post was written",
115 |                             "keywords":[
116 |                                 {
117 |                                     "text":"Solr",
118 |                                     "type":"search engine"
119 |                                 },
120 |                                 {
121 |                                     "text":"Solr 5.5",
122 |                                     "type":"search engine"
123 |                                 }
124 |                             ],
125 |                             "sentiment":"neutral"
126 |                         }
127 |                     ]
128 |                 },
129 |                 {
130 |                     "date":"2015-12-12T05:00:00Z",
131 |                     "author":"Sri",
132 |                     "text":"Elasticsearch had it earlier than Solr",
133 |                     "keywords":[
134 |                         {
135 |                             "text":"Elasticsearch",
136 |                             "type":"search engine"
137 |                         },
138 |                         {
139 |                             "text":"Solr",
140 |                             "type":"search engine"
141 |                         }
142 |                     ],
143 |                     "sentiment":"negative"
144 |                 }
145 |             ]
146 |         }
147 |     ]
148 | }
149 | 


--------------------------------------------------------------------------------
/scripts/convert_data2solrjson.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | 
  3 | import sys, os, getopt
  4 | import json
  5 | import codecs
  6 | import uuid
  7 | import time
  8 | 
  9 | ###############################
 10 | ##Sample of the output format:
 11 | ## [
 12 | ##  {id : book2, type_s:book, title_t : "Snow Crash", author_s : "Neal Stephenson",
 13 | ## cat_s:sci-fi, pubyear_i:1992, publisher_s:Bantam,
 14 | ## _childDocuments_ : [
 15 | ##   { id: book2_c1, type_s:review, review_dt:"2015-01-03T14:30:00Z",
 16 | ##     stars_i:5, author_s:yonik,
 17 | ##     comment_t:"Ahead of its time... I wonder if it helped inspire The Matrix?",
 18 | ##     _childDocuments_:[
 19 | ##       {id: book2_c1_e1, type_s:entity, text:"The Matrix", type:"movie" }
 20 | ##     ]
 21 | ##   }, ...
 22 | ## ]
 23 | ##
 24 | ##
 25 | #####################################
 26 | 
 27 | TYPE_FIELD_NAME = "path"  # key under which each generated Solr document stores its type/path string
 28 | 
 29 | 
 30 | def make_uid():
 31 |     return str(uuid.uuid4().fields[-1])[:5]
 32 | 
 33 | ## load from .json
 34 | def load_from_json(fname):
 35 |     obj = {}
 36 |     # if file exists
 37 |     if os.path.isfile(fname):
 38 |        fin = codecs.open(fname, encoding = 'utf-8')
 39 |        obj = json.load(fin)
 40 |     else:
 41 |        print "No .json file  found... Exiting"
 42 |        sys.stderr.write("No .json file  found... Exiting\n")
 43 |        sys.exit(-1)
 44 |     return obj
 45 | ##eof load_from_json()
 46 | 
 47 | ## dump to .json
 48 | def dump_to_json(obj, fname):
 49 |     fout = codecs.open(fname, encoding = 'utf-8', mode = 'w')
 50 |     json.dump(obj, fout, ensure_ascii = False, indent=4, separators=(',', ': '))
 51 |     fout.close()
 52 | ##eof dump_to_json()
 53 | 
 54 | def path_to_str(path):
 55 |     path_str = ""
 56 |     level = len(path)+1
 57 |     for e in path:
 58 |         path_str+=e+"."
 59 |     return str(level)+"."+path_str
 60 | 
 61 | 
 62 | 
 63 | print_flag = False  # set to True to trace path changes in reformat_to_solr_with_path
 64 | N = 10  # tracing only prints while the path has fewer than N elements
 65 | def reformat_to_solr_with_path(d_original, d_solr, top_id, top_type, path):
 66 |     # Recursively copy the nested dict d_original into the Solr document d_solr: dict values and
 67 |     # dicts inside lists become "_childDocuments_" entries; every other value is copied as a field.
 68 |     path.append(top_type)
 69 |     if len(path) < N and print_flag:
 70 |         print path
 71 |     # top_id prefixes each child id; top_type is the key d_original was found under (when it is "",
 72 |     # children are typed with the bare key); path is mutated in place and shared by all recursive calls.
 73 |     for k, v in d_original.iteritems():
 74 |         # drop the last path element when it is itself a key of this dict (and is not "text")
 75 |         if path[-1] in d_original.keys() and path[-1] != "text":
 76 |             popped = path.pop()
 77 |             if len(path) < N and print_flag:
 78 |                 print "popped at the beginning =", popped
 79 |         #for dicts, i.e., objects
 80 |         if isinstance(v, dict):
 81 |             # append a child typed "<level>.<path>.<k>" with id "<top_id>-<uid>" and recurse into it;
 82 |             # both branches below do the same, the else branch just creates the child list first
 83 |             if "_childDocuments_" in d_solr:
 84 |                 if top_type !="":
 85 |                     #d_solr["_childDocuments_"].append({"type_s":top_type+"."+k})
 86 |                     d_solr["_childDocuments_"].append({TYPE_FIELD_NAME:path_to_str(path)+k})
 87 |                     #top_type = top_type+"."+k
 88 |                 else:
 89 |                     d_solr["_childDocuments_"].append({TYPE_FIELD_NAME:k})
 90 |                 n = len(d_solr["_childDocuments_"])
 91 |                 uid = make_uid()
 92 |                 d_solr["_childDocuments_"][n-1]["id"] = top_id+"-"+uid
 93 |                 reformat_to_solr_with_path(v, d_solr["_childDocuments_"][n-1], top_id, k, path)
 94 |             else:
 95 |                 d_solr["_childDocuments_"] = []
 96 |                 if top_type !="":
 97 |                     d_solr["_childDocuments_"].append({TYPE_FIELD_NAME:path_to_str(path)+k})
 98 |                 else:
 99 |                     d_solr["_childDocuments_"].append({TYPE_FIELD_NAME:k})
100 |                 uid = make_uid()
101 |                 d_solr["_childDocuments_"][0]["id"] = top_id+"-"+uid
102 |                 reformat_to_solr_with_path(v, d_solr["_childDocuments_"][0], top_id, k, path)
103 |             # remove what the recursion left at the end of path: a key of v and/or k itself
104 |             if path[-1] in v.keys():
105 |                 popped = path.pop()
106 |                 if len(path) < N and print_flag:
107 |                     print "popped child at the end =", popped
108 |             if path[-1] == k :
109 |                 popped = path.pop()
110 |                 if len(path) < N and print_flag:
111 |                     print "popped k at the end =", popped
112 | 
113 |         #for list of dicts, i.e., objects
114 |         elif isinstance(v, list) and len(v) > 0 and isinstance(v[0], dict):
115 |             for d_obj in v:
116 |                 if path[-1] == k:
117 |                     popped = path.pop()
118 |                     if len(path) < N and print_flag:
119 |                         print "popped on cycle =", popped
120 |                 # same child creation as in the dict branch, repeated for every list element
121 |                 if "_childDocuments_" in d_solr:
122 |                     if top_type !="":
123 |                         d_solr["_childDocuments_"].append({TYPE_FIELD_NAME:path_to_str(path)+k})
124 |                     else:
125 |                         d_solr["_childDocuments_"].append({TYPE_FIELD_NAME:k})
126 |                     n = len(d_solr["_childDocuments_"])
127 |                     uid = make_uid()
128 |                     d_solr["_childDocuments_"][n-1]["id"] = top_id+"-"+uid
129 |                     reformat_to_solr_with_path(d_obj, d_solr["_childDocuments_"][n-1], top_id, k, path)
130 |                 else:
131 |                     d_solr["_childDocuments_"] = []
132 |                     if top_type !="":
133 |                         d_solr["_childDocuments_"].append({TYPE_FIELD_NAME:path_to_str(path)+k})
134 |                     else:
135 |                         d_solr["_childDocuments_"].append({TYPE_FIELD_NAME:k})
136 |                     uid = make_uid()
137 |                     d_solr["_childDocuments_"][0]["id"] = top_id+"-"+uid
138 |                     reformat_to_solr_with_path(d_obj, d_solr["_childDocuments_"][0], top_id, k, path)
139 | 
140 |         else:
141 |             # PREPROCESSING OF THE LEAF FIELDS #
142 |             # (leaves also include empty lists and lists whose first item is not a dict)
143 |             #adds "raw" (not tokenized) field just for the "text" field
144 |             # only for Query 2
145 |             #if k == "text":
146 |             #    k1 = "text_s"
147 | 
148 |             #conversion of date
149 |             #if k == "date":
150 |                 ## Date in Solr:
151 |                 ## 1972-05-20T17:33:18Z
152 |                 #k = "date_tdt"
153 |                 #v = long(v)
154 |                 #v = time.strftime('%Y-%m-%dT%H:%M:%SZ',  time.gmtime(v/1000.))
155 |             #else:
156 |                 #add "raw" field for every field
157 |                 #k1 = k+"_ss"
158 |                 #d_solr[k1] = v
159 | 
160 |                 #adding "_t_s" suffix
161 |                 #for further conversion into *_t and *_s through CopyField rules
162 |                 #k = k+"_ts"
163 |             # no preprocessing is currently active: the value is stored unchanged
164 |             d_solr[k] = v
165 |             #print "{0} : {1}".format(k, v)
166 | #eo reformat_to_solr_with_path
167 | 
168 | 
169 | 
170 | #wraps the format conversion process
171 | def convert_to_solr(fin_nm, fout_nm):
172 |     obj = load_from_json(fin_nm)
173 | 
174 |     #types of top-level documents
175 |     top_doctypes = obj.keys()
176 | 
177 |     data_solr = []
178 |     for top_doctype in top_doctypes:
179 |         for top_doc in obj[top_doctype]:
180 | 
181 |             top_doc_solr = {TYPE_FIELD_NAME:"1."+top_doctype}
182 |             top_id = make_uid()
183 |             top_doc_solr["id"] = top_id
184 |             path = []
185 |             reformat_to_solr_with_path(top_doc, top_doc_solr, top_id, top_doctype, path)
186 |             print_flag = False
187 |             data_solr.append(top_doc_solr)
188 | 
189 |     dump_to_json(data_solr, fout_nm)
190 | #eo convert_to_solr(fin_nm, fout_nm)
191 | 
192 | def main(argv):
193 |     inputfile_nm = ''
194 |     outputfile_nm = ''
195 |     fin = sys.stdin
196 |     fout = sys.stdout
197 | 
198 |     try:
199 |         opts, args = getopt.getopt(argv[1:],"hi:o:",["ifile=","ofile="])
200 |     except getopt.GetoptError:
201 |         print 'Usage {script} -i <inputfile> -o <outputfile>'.format(script = argv[0])
202 |         sys.exit(2)
203 |     for opt, arg in opts:
204 |         if opt == '-h':
205 |             print 'Usage {script} -i <inputfile> -o <outputfile>'.format(script = argv[0])
206 |             sys.exit()
207 |         elif opt in ('-i', "--ifile"):
208 |             inputfile_nm = arg
209 |         elif opt in ("-o", "--ofile"):
210 |             outputfile_nm = arg
211 | 
212 |     print 'Input file is "', inputfile_nm
213 |     print 'Output file is "', outputfile_nm
214 | 
215 |     convert_to_solr(inputfile_nm, outputfile_nm)
216 | 
217 | 
218 | if __name__ == "__main__":  # run the converter only when executed as a script, not on import
219 |    main(sys.argv)
220 | 


--------------------------------------------------------------------------------
/data/example-data-solr.json:
--------------------------------------------------------------------------------
  1 | [
  2 |     {
  3 |         "date": "2015-04-10T9:00:00Z",
  4 |         "path": "1.blog-posts",
  5 |         "_childDocuments_": [
  6 |             {
  7 |                 "path": "2.blog-posts.body",
  8 |                 "_childDocuments_": [
  9 |                     {
 10 |                         "path": "3.blog-posts.body.keywords",
 11 |                         "type": "search engine",
 12 |                         "id": "91701-21349",
 13 |                         "text": "Solr"
 14 |                     }
 15 |                 ],
 16 |                 "id": "91701-28526",
 17 |                 "text": "Here I write how useful Solr is..."
 18 |             },
 19 |             {
 20 |                 "path": "2.blog-posts.title",
 21 |                 "_childDocuments_": [
 22 |                     {
 23 |                         "path": "3.blog-posts.title.keywords",
 24 |                         "type": "search engine",
 25 |                         "id": "91701-73494",
 26 |                         "text": "Solr"
 27 |                     },
 28 |                     {
 29 |                         "path": "3.blog-posts.title.keywords",
 30 |                         "type": "entity",
 31 |                         "id": "91701-57714",
 32 |                         "text": "Search Engine"
 33 |                     }
 34 |                 ],
 35 |                 "id": "91701-69789",
 36 |                 "text": "My Post #1: About Solr and Other Search Engines"
 37 |             },
 38 |             {
 39 |                 "sentiment": "positive",
 40 |                 "author": "Bob",
 41 |                 "text": "Great post about Solr",
 42 |                 "_childDocuments_": [
 43 |                     {
 44 |                         "path": "3.blog-posts.comments.keywords",
 45 |                         "type": "search engine",
 46 |                         "id": "91701-14067",
 47 |                         "text": "Solr"
 48 |                     },
 49 |                     {
 50 |                         "sentiment": "positive",
 51 |                         "author": "Dave",
 52 |                         "text": "Yeah, I like Solr too",
 53 |                         "_childDocuments_": [
 54 |                             {
 55 |                                 "path": "4.blog-posts.comments.replies.keywords",
 56 |                                 "type": "search engine",
 57 |                                 "id": "91701-19743",
 58 |                                 "text": "Solr"
 59 |                             }
 60 |                         ],
 61 |                         "date": "2015-04-10T12:00:00Z",
 62 |                         "path": "3.blog-posts.comments.replies",
 63 |                         "id": "91701-11855"
 64 |                     },
 65 |                     {
 66 |                         "sentiment": "negative",
 67 |                         "author": "Sri",
 68 |                         "text": "I disagree, I prefer Elasticsearch",
 69 |                         "_childDocuments_": [
 70 |                             {
 71 |                                 "path": "4.blog-posts.comments.replies.keywords",
 72 |                                 "type": "search engine",
 73 |                                 "id": "91701-17221",
 74 |                                 "text": "Elasticsearch"
 75 |                             }
 76 |                         ],
 77 |                         "date": "2015-04-12T05:00:00Z",
 78 |                         "path": "3.blog-posts.comments.replies",
 79 |                         "id": "91701-27479"
 80 |                     }
 81 |                 ],
 82 |                 "date": "2015-04-10T11:30:00Z",
 83 |                 "path": "2.blog-posts.comments",
 84 |                 "id": "91701-23917"
 85 |             }
 86 |         ],
 87 |         "id": "91701",
 88 |         "author": "Alice"
 89 |     },
 90 |     {
 91 |         "date": "2015-11-10T9:00:00Z",
 92 |         "path": "1.blog-posts",
 93 |         "_childDocuments_": [
 94 |             {
 95 |                 "path": "2.blog-posts.body",
 96 |                 "_childDocuments_": [
 97 |                     {
 98 |                         "path": "3.blog-posts.body.keywords",
 99 |                         "type": "search engine",
100 |                         "id": "24397-33681",
101 |                         "text": "Solr"
102 |                     }
103 |                 ],
104 |                 "id": "24397-18001",
105 |                 "text": "Here I also write how useful Solr is..."
106 |             },
107 |             {
108 |                 "path": "2.blog-posts.title",
109 |                 "_childDocuments_": [
110 |                     {
111 |                         "path": "3.blog-posts.title.keywords",
112 |                         "type": "search engine",
113 |                         "id": "24397-12687",
114 |                         "text": "Solr"
115 |                     },
116 |                     {
117 |                         "path": "3.blog-posts.title.keywords",
118 |                         "type": "entity",
119 |                         "id": "24397-83428",
120 |                         "text": "feature"
121 |                     }
122 |                 ],
123 |                 "id": "24397-38220",
124 |                 "text": "About useful features of Solr"
125 |             },
126 |             {
127 |                 "sentiment": "negative",
128 |                 "author": "Bob",
129 |                 "text": "You forgot that useful Solr's feature!",
130 |                 "_childDocuments_": [
131 |                     {
132 |                         "path": "3.blog-posts.comments.keywords",
133 |                         "type": "search engine",
134 |                         "id": "24397-19289",
135 |                         "text": "Solr"
136 |                     },
137 |                     {
138 |                         "path": "3.blog-posts.comments.keywords",
139 |                         "type": "entity",
140 |                         "id": "24397-27032",
141 |                         "text": "feature"
142 |                     },
143 |                     {
144 |                         "sentiment": "neutral",
145 |                         "author": "Dave",
146 |                         "text": "But it only appeared in Solr 5.5, after the post was written",
147 |                         "_childDocuments_": [
148 |                             {
149 |                                 "path": "4.blog-posts.comments.replies.keywords",
150 |                                 "type": "search engine",
151 |                                 "id": "24397-15030",
152 |                                 "text": "Solr"
153 |                             },
154 |                             {
155 |                                 "path": "4.blog-posts.comments.replies.keywords",
156 |                                 "type": "search engine",
157 |                                 "id": "24397-26659",
158 |                                 "text": "Solr 5.5"
159 |                             }
160 |                         ],
161 |                         "date": "2016-04-10T12:00:00Z",
162 |                         "path": "3.blog-posts.comments.replies",
163 |                         "id": "24397-11623"
164 |                     }
165 |                 ],
166 |                 "date": "2016-04-10T11:30:00Z",
167 |                 "path": "2.blog-posts.comments",
168 |                 "id": "24397-22877"
169 |             },
170 |             {
171 |                 "sentiment": "negative",
172 |                 "author": "Sri",
173 |                 "text": "Elasticsearch had it earlier than Solr",
174 |                 "_childDocuments_": [
175 |                     {
176 |                         "path": "3.blog-posts.comments.keywords",
177 |                         "type": "search engine",
178 |                         "id": "24397-26016",
179 |                         "text": "Elasticsearch"
180 |                     },
181 |                     {
182 |                         "path": "3.blog-posts.comments.keywords",
183 |                         "type": "search engine",
184 |                         "id": "24397-17834",
185 |                         "text": "Solr"
186 |                     }
187 |                 ],
188 |                 "date": "2015-12-12T05:00:00Z",
189 |                 "path": "2.blog-posts.comments",
190 |                 "id": "24397-20101"
191 |             }
192 |         ],
193 |         "id": "24397",
194 |         "author": "Aadit"
195 |     }
196 | ]


--------------------------------------------------------------------------------
/scripts/convert_data2solrjson_for_faceting.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | 
  3 | import sys, os, getopt
  4 | import json
  5 | import codecs
  6 | import uuid
  7 | import time
  8 | 
  9 | ###############################
 10 | ##Sample of the output format:
 11 | ## [
 12 | ##  {id : book2, type_s:book, title_t : "Snow Crash", author_s : "Neal Stephenson",
 13 | ## cat_s:sci-fi, pubyear_i:1992, publisher_s:Bantam,
 14 | ## _childDocuments_ : [
 15 | ##   { id: book2_c1, type_s:review, review_dt:"2015-01-03T14:30:00Z",
 16 | ##     stars_i:5, author_s:yonik,
 17 | ##     comment_t:"Ahead of its time... I wonder if it helped inspire The Matrix?",
 18 | ##     _childDocuments_:[
 19 | ##       {id: book2_c1_e1, type_s:entity, text:"The Matrix", type:"movie" }
 20 | ##     ]
 21 | ##   }, ...
 22 | ## ]
 23 | ##
 24 | ##
 25 | #####################################
 26 | 
 27 | TYPE_FIELD_NAME = "path"  # key under which each generated Solr document stores its type/path string
 28 | 
 29 | 
 30 | def make_uid():
 31 |     return str(uuid.uuid4().fields[-1])[:5]
 32 | 
 33 | ## load from .json
 34 | def load_from_json(fname):
 35 |     obj = {}
 36 |     # if file exists
 37 |     if os.path.isfile(fname):
 38 |        fin = codecs.open(fname, encoding = 'utf-8')
 39 |        obj = json.load(fin)
 40 |     else:
 41 |        print "No .json file  found... Exiting"
 42 |        sys.stderr.write("No .json file  found... Exiting\n")
 43 |        sys.exit(-1)
 44 |     return obj
 45 | ##eof load_from_json()
 46 | 
 47 | ## dump to .json
 48 | def dump_to_json(obj, fname):
 49 |     fout = codecs.open(fname, encoding = 'utf-8', mode = 'w')
 50 |     json.dump(obj, fout, ensure_ascii = False, indent=4, separators=(',', ': '))
 51 |     fout.close()
 52 | ##eof dump_to_json()
 53 | 
 54 | def path_to_str(path):
 55 |     path_str = ""
 56 |     level = len(path)+1
 57 |     for e in path:
 58 |         path_str+=e+"."
 59 |     return str(level)+"."+path_str
 60 | 
 61 | 
 62 | #global var for tracking unique fields from upper levels
 63 | 
 64 | unique_fields_map={}
 65 | ## generates unique field and adda parent field for faceting
 66 | def add_unique_and_parent_fields(d_solr, n, path, k, top_type):
 67 |     global unique_fields_map
 68 |     if top_type !="":
 69 |         uid = make_uid()
 70 |         unique_field_name = path_to_str(path)+k+"-id"
 71 |         d_solr["_childDocuments_"][n][unique_field_name] =  uid
 72 |         #if it is on the 2nd level, update the map
 73 |         if unique_field_name.find("2.blog-posts.") > -1:
 74 |             unique_fields_map[unique_field_name] = uid
 75 |         #else, propagate the unique field for the branch stemming from level 2
 76 |         else:
 77 |             second_level_part = unique_field_name[1: unique_field_name.find(".", 13)]
 78 |             second_level_unique_filed_name = "2"+second_level_part+"-id"
 79 |             d_solr["_childDocuments_"][n][second_level_unique_filed_name] =  unique_fields_map[second_level_unique_filed_name]
 80 |     else:
 81 |         unique_fields_map = {}
 82 | 
 83 | 
 84 | 
 85 | print_flag = False  # set to True to trace path changes in reformat_to_solr_with_path
 86 | N = 10  # tracing only prints while the path has fewer than N elements
 87 | def reformat_to_solr_with_path(d_original, d_solr, top_id, top_type, path):
 88 |     #print "top_type = ", top_type
 89 | 
 90 |     path.append(top_type)
 91 |     if len(path) < N and print_flag:
 92 |         print path
 93 | 
 94 | 
 95 |     for k, v in d_original.iteritems():
 96 |         #for dicts, i.e., objects
 97 |         if path[-1] in d_original.keys() and path[-1] != "text":
 98 |             popped = path.pop()
 99 |             if len(path) < N and print_flag:
100 |                 print "popped at the beginning =", popped
101 | 
102 |         if isinstance(v, dict):
103 |             #print "k = ", k
104 |             #print k, " : dict"
105 |             if "_childDocuments_" in d_solr:
106 |                 if top_type !="":
107 |                     #d_solr["_childDocuments_"].append({"type_s":top_type+"."+k})
108 |                     d_solr["_childDocuments_"].append({TYPE_FIELD_NAME:path_to_str(path)+k})
109 |                     #top_type = top_type+"."+k
110 |                 else:
111 |                     d_solr["_childDocuments_"].append({TYPE_FIELD_NAME:k})
112 |                 n = len(d_solr["_childDocuments_"])
113 |                 uid = make_uid()
114 |                 d_solr["_childDocuments_"][n-1]["id"] = top_id+"-"+uid
115 |                 add_unique_and_parent_fields(d_solr, n-1, path, k, top_type)
116 | 
117 |                 reformat_to_solr_with_path(v, d_solr["_childDocuments_"][n-1], top_id, k, path)
118 |             else:
119 |                 d_solr["_childDocuments_"] = []
120 |                 if top_type !="":
121 |                     d_solr["_childDocuments_"].append({TYPE_FIELD_NAME:path_to_str(path)+k})
122 |                 else:
123 |                     d_solr["_childDocuments_"].append({TYPE_FIELD_NAME:k})
124 |                 uid = make_uid()
125 |                 d_solr["_childDocuments_"][0]["id"] = top_id+"-"+uid
126 |                 add_unique_and_parent_fields(d_solr, 0, path, k, top_type)
127 |                 reformat_to_solr_with_path(v, d_solr["_childDocuments_"][0], top_id, k, path)
128 | 
129 |             if path[-1] in v.keys():
130 |                 popped = path.pop()
131 |                 if len(path) < N and print_flag:
132 |                     print "popped child at the end =", popped
133 |             if path[-1] == k :
134 |                 popped = path.pop()
135 |                 if len(path) < N and print_flag:
136 |                     print "popped k at the end =", popped
137 | 
138 |         #for list of dicts, i.e., objects
139 |         elif isinstance(v, list) and len(v) > 0 and isinstance(v[0], dict):
140 |             for d_obj in v:
141 |                 if path[-1] == k:
142 |                     popped = path.pop()
143 |                     if len(path) < N and print_flag:
144 |                         print "popped on cycle =", popped
145 | 
146 |                 if "_childDocuments_" in d_solr:
147 |                     if top_type !="":
148 |                         d_solr["_childDocuments_"].append({TYPE_FIELD_NAME:path_to_str(path)+k})
149 |                     else:
150 |                         d_solr["_childDocuments_"].append({TYPE_FIELD_NAME:k})
151 |                     n = len(d_solr["_childDocuments_"])
152 |                     uid = make_uid()
153 |                     d_solr["_childDocuments_"][n-1]["id"] = top_id+"-"+uid
154 |                     add_unique_and_parent_fields(d_solr, n-1, path, k, top_type)
155 |                     reformat_to_solr_with_path(d_obj, d_solr["_childDocuments_"][n-1], top_id, k, path)
156 |                 else:
157 |                     d_solr["_childDocuments_"] = []
158 |                     if top_type !="":
159 |                         d_solr["_childDocuments_"].append({TYPE_FIELD_NAME:path_to_str(path)+k})
160 |                     else:
161 |                         d_solr["_childDocuments_"].append({TYPE_FIELD_NAME:k})
162 |                     uid = make_uid()
163 |                     d_solr["_childDocuments_"][0]["id"] = top_id+"-"+uid
164 |                     add_unique_and_parent_fields(d_solr, 0, path, k, top_type)
165 |                     reformat_to_solr_with_path(d_obj, d_solr["_childDocuments_"][0], top_id, k, path)
166 | 
167 |         else:
168 |             # PREPROCESSING OF THE LEAF FIELDS #
169 | 
170 |             #adds "raw" (not tokenized) field just for the "text" field
171 |             # only for Query 2
172 |             #if k == "text":
173 |             #    k1 = "text_s"
174 | 
175 |             #conversion of date
176 |             #if k == "date":
177 |                 ## Date in Solr:
178 |                 ## 1972-05-NT17:33:18Z
179 |                 #k = "date_tdt"
180 |                 #v = long(v)
181 |                 #v = time.strftime('%Y-%m-%dT%H:%M:%SZ',  time.gmtime(v/1000.))
182 |             #elif k == "path":
183 |                 #k = k+"_s"
184 |             #else:
185 |                 #add "raw" field for every field
186 |                 #k1 = k+"_ss"
187 |                 #d_solr[k1] = v
188 | 
189 |                 #adding "_t_s" suffix
190 |                 #for further conversion into *_t and *_s through CopyField rules
191 |                 #k = k+"_ts"
192 | 
193 |             d_solr[k] = v
194 |             #print "{0} : {1}".format(k, v)
195 | #eo reformat_to_solr_with_path
196 | 
197 | 
198 | 
199 | #wraps the format conversion process
def convert_to_solr(fin_nm, fout_nm):
    """Convert a nested JSON file into a flat list of Solr documents.

    The object returned by ``load_from_json(fin_nm)`` is walked key by key:
    each top-level key is used as a document type and its value is iterated
    as a collection of top-level documents. For every such document a Solr
    document is started with the type field set to ``"1.<doctype>"`` and an
    ``id`` obtained from ``make_uid()``; the remaining fields and nested
    children are filled in by ``reformat_to_solr_with_path``. The collected
    documents are finally handed to ``dump_to_json`` together with
    ``fout_nm``.

    Args:
        fin_nm: input file name, passed to ``load_from_json``.
        fout_nm: output file name, passed to ``dump_to_json``.
    """
    obj = load_from_json(fin_nm)

    data_solr = []
    # the top-level keys are the types of the top-level documents
    for top_doctype in obj.keys():
        for top_doc in obj[top_doctype]:
            top_id = make_uid()
            top_doc_solr = {TYPE_FIELD_NAME: "1." + top_doctype, "id": top_id}
            # every top-level document is converted with its own, initially
            # empty, path list
            reformat_to_solr_with_path(top_doc, top_doc_solr, top_id, top_doctype, [])
            # A bare `print_flag = False` used to follow this call. Without a
            # `global` declaration it only bound a function-local name that
            # nothing read, so it has been dropped.
            # NOTE(review): if the goal was to silence a module-level
            # print_flag after the first document, that requires
            # `global print_flag` in this function -- confirm intent.
            data_solr.append(top_doc_solr)

    dump_to_json(data_solr, fout_nm)
#eo convert_to_solr(fin_nm, fout_nm)
220 | 
def main(argv):
    """Parse the command line and run the conversion.

    Recognised options are ``-h`` (print usage and exit), ``-i``/``--ifile``
    (input file name) and ``-o``/``--ofile`` (output file name).

    Args:
        argv: full argument vector; ``argv[0]`` is used as the script name in
            the usage message and ``argv[1:]`` is parsed with ``getopt``.

    Raises:
        SystemExit: with status 2 on an unknown option or when the input or
            output file name is missing; with the default status after
            ``-h``.
    """
    script = argv[0] if argv else ''
    usage = 'Usage {script} -i <inputfile> -o <outputfile>'.format(script = script)
    inputfile_nm = ''
    outputfile_nm = ''

    try:
        opts, _ = getopt.getopt(argv[1:], "hi:o:", ["ifile=", "ofile="])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit()
        elif opt in ('-i', "--ifile"):
            inputfile_nm = arg
        elif opt in ("-o", "--ofile"):
            outputfile_nm = arg

    # Both file names are required: without them the conversion would be
    # started with empty file names, so report usage instead.
    if not inputfile_nm or not outputfile_nm:
        print(usage)
        sys.exit(2)

    # single-argument print(...) behaves the same as a statement or a function
    print('Input file is "{0}"'.format(inputfile_nm))
    print('Output file is "{0}"'.format(outputfile_nm))

    convert_to_solr(inputfile_nm, outputfile_nm)
246 | 
# Script entry point: hand the full argument vector (including the script
# name) to main() when the file is executed directly.
if __name__ == "__main__":
    main(sys.argv)
249 | 


--------------------------------------------------------------------------------
/data/example-data-solr-for-faceting.json:
--------------------------------------------------------------------------------
  1 | [
  2 |     {
  3 |         "date": "2015-04-10T9:00:00Z",
  4 |         "path": "1.blog-posts",
  5 |         "_childDocuments_": [
  6 |             {
  7 |                 "path": "2.blog-posts.body",
  8 |                 "2.blog-posts.body-id": "50224",
  9 |                 "id": "17252-18262",
 10 |                 "_childDocuments_": [
 11 |                     {
 12 |                         "2.blog-posts.body-id": "50224",
 13 |                         "text": "Solr",
 14 |                         "3.blog-posts.body.keywords-id": "23072",
 15 |                         "path": "3.blog-posts.body.keywords",
 16 |                         "type": "search engine",
 17 |                         "id": "17252-67394"
 18 |                     }
 19 |                 ],
 20 |                 "text": "Here I write how useful Solr is..."
 21 |             },
 22 |             {
 23 |                 "path": "2.blog-posts.title",
 24 |                 "_childDocuments_": [
 25 |                     {
 26 |                         "text": "Solr",
 27 |                         "3.blog-posts.title.keywords-id": "16242",
 28 |                         "2.blog-posts.title-id": "62235",
 29 |                         "path": "3.blog-posts.title.keywords",
 30 |                         "type": "search engine",
 31 |                         "id": "17252-18776"
 32 |                     },
 33 |                     {
 34 |                         "text": "Search Engine",
 35 |                         "3.blog-posts.title.keywords-id": "37837",
 36 |                         "2.blog-posts.title-id": "62235",
 37 |                         "path": "3.blog-posts.title.keywords",
 38 |                         "type": "entity",
 39 |                         "id": "17252-52565"
 40 |                     }
 41 |                 ],
 42 |                 "text": "My Post #1: About Solr and Other Search Engines",
 43 |                 "id": "17252-18960",
 44 |                 "2.blog-posts.title-id": "62235"
 45 |             },
 46 |             {
 47 |                 "sentiment": "positive",
 48 |                 "author": "Bob",
 49 |                 "text": "Great post about Solr",
 50 |                 "_childDocuments_": [
 51 |                     {
 52 |                         "text": "Solr",
 53 |                         "2.blog-posts.comments-id": "32759",
 54 |                         "path": "3.blog-posts.comments.keywords",
 55 |                         "3.blog-posts.comments.keywords-id": "70412",
 56 |                         "type": "search engine",
 57 |                         "id": "17252-19514"
 58 |                     },
 59 |                     {
 60 |                         "sentiment": "positive",
 61 |                         "author": "Dave",
 62 |                         "text": "Yeah, I like Solr too",
 63 |                         "_childDocuments_": [
 64 |                             {
 65 |                                 "text": "Solr",
 66 |                                 "4.blog-posts.comments.replies.keywords-id": "14028",
 67 |                                 "2.blog-posts.comments-id": "32759",
 68 |                                 "path": "4.blog-posts.comments.replies.keywords",
 69 |                                 "type": "search engine",
 70 |                                 "id": "17252-18477"
 71 |                             }
 72 |                         ],
 73 |                         "2.blog-posts.comments-id": "32759",
 74 |                         "3.blog-posts.comments.replies-id": "63952",
 75 |                         "date": "2015-04-10T12:00:00Z",
 76 |                         "path": "3.blog-posts.comments.replies",
 77 |                         "id": "17252-53695"
 78 |                     },
 79 |                     {
 80 |                         "sentiment": "negative",
 81 |                         "author": "Sri",
 82 |                         "text": "I disagree, I prefer Elasticsearch",
 83 |                         "_childDocuments_": [
 84 |                             {
 85 |                                 "text": "Elasticsearch",
 86 |                                 "4.blog-posts.comments.replies.keywords-id": "28848",
 87 |                                 "2.blog-posts.comments-id": "32759",
 88 |                                 "path": "4.blog-posts.comments.replies.keywords",
 89 |                                 "type": "search engine",
 90 |                                 "id": "17252-13445"
 91 |                             }
 92 |                         ],
 93 |                         "2.blog-posts.comments-id": "32759",
 94 |                         "3.blog-posts.comments.replies-id": "80429",
 95 |                         "date": "2015-04-12T05:00:00Z",
 96 |                         "path": "3.blog-posts.comments.replies",
 97 |                         "id": "17252-19383"
 98 |                     }
 99 |                 ],
100 |                 "2.blog-posts.comments-id": "32759",
101 |                 "date": "2015-04-10T11:30:00Z",
102 |                 "path": "2.blog-posts.comments",
103 |                 "id": "17252-24737"
104 |             }
105 |         ],
106 |         "id": "17252",
107 |         "author": "Alice"
108 |     },
109 |     {
110 |         "date": "2015-11-10T9:00:00Z",
111 |         "path": "1.blog-posts",
112 |         "_childDocuments_": [
113 |             {
114 |                 "path": "2.blog-posts.body",
115 |                 "2.blog-posts.body-id": "63622",
116 |                 "id": "24220-20881",
117 |                 "_childDocuments_": [
118 |                     {
119 |                         "2.blog-posts.body-id": "63622",
120 |                         "text": "Solr",
121 |                         "3.blog-posts.body.keywords-id": "14615",
122 |                         "path": "3.blog-posts.body.keywords",
123 |                         "type": "search engine",
124 |                         "id": "24220-94803"
125 |                     }
126 |                 ],
127 |                 "text": "Here I also write how useful Solr is..."
128 |             },
129 |             {
130 |                 "path": "2.blog-posts.title",
131 |                 "_childDocuments_": [
132 |                     {
133 |                         "text": "Solr",
134 |                         "3.blog-posts.title.keywords-id": "10276",
135 |                         "2.blog-posts.title-id": "82579",
136 |                         "path": "3.blog-posts.title.keywords",
137 |                         "type": "search engine",
138 |                         "id": "24220-11828"
139 |                     },
140 |                     {
141 |                         "text": "feature",
142 |                         "3.blog-posts.title.keywords-id": "26821",
143 |                         "2.blog-posts.title-id": "82579",
144 |                         "path": "3.blog-posts.title.keywords",
145 |                         "type": "entity",
146 |                         "id": "24220-68954"
147 |                     }
148 |                 ],
149 |                 "text": "About useful features of Solr",
150 |                 "id": "24220-21378",
151 |                 "2.blog-posts.title-id": "82579"
152 |             },
153 |             {
154 |                 "sentiment": "negative",
155 |                 "author": "Bob",
156 |                 "text": "You forgot that useful Solr's feature!",
157 |                 "_childDocuments_": [
158 |                     {
159 |                         "text": "Solr",
160 |                         "2.blog-posts.comments-id": "69776",
161 |                         "path": "3.blog-posts.comments.keywords",
162 |                         "3.blog-posts.comments.keywords-id": "25182",
163 |                         "type": "search engine",
164 |                         "id": "24220-17760"
165 |                     },
166 |                     {
167 |                         "text": "feature",
168 |                         "2.blog-posts.comments-id": "69776",
169 |                         "path": "3.blog-posts.comments.keywords",
170 |                         "3.blog-posts.comments.keywords-id": "58954",
171 |                         "type": "entity",
172 |                         "id": "24220-26183"
173 |                     },
174 |                     {
175 |                         "sentiment": "neutral",
176 |                         "author": "Dave",
177 |                         "text": "But it only appeared in Solr 5.5, after the post was written",
178 |                         "_childDocuments_": [
179 |                             {
180 |                                 "text": "Solr",
181 |                                 "4.blog-posts.comments.replies.keywords-id": "83968",
182 |                                 "2.blog-posts.comments-id": "69776",
183 |                                 "path": "4.blog-posts.comments.replies.keywords",
184 |                                 "type": "search engine",
185 |                                 "id": "24220-20038"
186 |                             },
187 |                             {
188 |                                 "text": "Solr 5.5",
189 |                                 "4.blog-posts.comments.replies.keywords-id": "24056",
190 |                                 "2.blog-posts.comments-id": "69776",
191 |                                 "path": "4.blog-posts.comments.replies.keywords",
192 |                                 "type": "search engine",
193 |                                 "id": "24220-19125"
194 |                             }
195 |                         ],
196 |                         "2.blog-posts.comments-id": "69776",
197 |                         "3.blog-posts.comments.replies-id": "24390",
198 |                         "date": "2016-04-10T12:00:00Z",
199 |                         "path": "3.blog-posts.comments.replies",
200 |                         "id": "24220-16949"
201 |                     }
202 |                 ],
203 |                 "2.blog-posts.comments-id": "69776",
204 |                 "date": "2016-04-10T11:30:00Z",
205 |                 "path": "2.blog-posts.comments",
206 |                 "id": "24220-26951"
207 |             },
208 |             {
209 |                 "sentiment": "negative",
210 |                 "author": "Sri",
211 |                 "text": "Elasticsearch had it earlier than Solr",
212 |                 "_childDocuments_": [
213 |                     {
214 |                         "text": "Elasticsearch",
215 |                         "2.blog-posts.comments-id": "18323",
216 |                         "path": "3.blog-posts.comments.keywords",
217 |                         "3.blog-posts.comments.keywords-id": "12980",
218 |                         "type": "search engine",
219 |                         "id": "24220-52064"
220 |                     },
221 |                     {
222 |                         "text": "Solr",
223 |                         "2.blog-posts.comments-id": "18323",
224 |                         "path": "3.blog-posts.comments.keywords",
225 |                         "3.blog-posts.comments.keywords-id": "24594",
226 |                         "type": "search engine",
227 |                         "id": "24220-14872"
228 |                     }
229 |                 ],
230 |                 "2.blog-posts.comments-id": "18323",
231 |                 "date": "2015-12-12T05:00:00Z",
232 |                 "path": "2.blog-posts.comments",
233 |                 "id": "24220-20919"
234 |             }
235 |         ],
236 |         "id": "24220",
237 |         "author": "Aadit"
238 |     }
239 | ]
240 | 


--------------------------------------------------------------------------------