├── readme.md ├── .gitignore ├── data ├── example-data.json ├── example-data-solr.json └── example-data-solr-for-faceting.json └── scripts ├── convert_data2solrjson.py └── convert_data2solrjson_for_faceting.py /readme.md: -------------------------------------------------------------------------------- 1 | These files accompany my blog post on nested document handling capabilities of Solr 5.3.1 and 5.5.0: 2 | https://medium.com/@alisazhila/solr-s-nesting-on-solr-s-capabilities-to-handle-deeply-nested-document-structures-50eeaaa4347a#.90xb5dqo8 3 | 4 | ### Script usage: 5 | 6 | ```{bash} 7 | $ python ./scripts/convert_data2solrjson.py -i ./data/example-data.json -o ./data/example-data-solr.json 8 | 9 | $ python ./scripts/convert_data2solrjson_for_faceting.py -i ./data/example-data.json -o ./data/example-data-solr-for-faceting.json 10 | ``` 11 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ 60 | 61 | #Ipython Notebook 62 | .ipynb_checkpoints 63 | -------------------------------------------------------------------------------- /data/example-data.json: -------------------------------------------------------------------------------- 1 | { 2 | "blog-posts":[ 3 | { 4 | "date":"2015-04-10T9:00:00Z", 5 | "author":"Alice", 6 | "title":{ 7 | "text":"My Post #1: About Solr and Other Search Engines", 8 | "keywords":[ 9 | { 10 | "text":"Solr", 11 | "type":"search engine" 12 | }, 13 | { 14 | "text":"Search Engine", 15 | "type":"entity" 16 | } 17 | ] 18 | }, 19 | "body":{ 20 | "text":"Here I write how useful Solr is...", 21 | "keywords":[ 22 | { 23 | "text":"Solr", 24 | "type":"search engine" 25 | } 26 | ] 27 | }, 28 | "comments":[ 29 | { 30 | "date":"2015-04-10T11:30:00Z", 31 | "author":"Bob", 32 | "text":"Great post about Solr", 33 | "keywords":[ 34 | { 35 | "text":"Solr", 36 | "type":"search engine" 37 | } 38 | ], 39 | "sentiment":"positive", 40 | "replies":[ 41 | { 42 | "date":"2015-04-10T12:00:00Z", 43 | "author":"Dave", 44 | "text":"Yeah, I like Solr too", 45 | "keywords":[ 46 | { 47 | "text":"Solr", 48 | "type":"search engine" 49 | } 50 | ], 51 | "sentiment":"positive" 52 | }, 53 | { 54 | "date":"2015-04-12T05:00:00Z", 55 | "author":"Sri", 56 | "text":"I disagree, I prefer Elasticsearch", 57 | "keywords":[ 58 | { 59 | "text":"Elasticsearch", 60 | "type":"search engine" 61 | } 62 | ], 63 | "sentiment":"negative" 64 | } 65 | ] 66 | } 67 | ] 68 | }, 69 | { 70 | "date":"2015-11-10T9:00:00Z", 71 | "author":"Aadit", 72 
| "title":{ 73 | "text":"About useful features of Solr", 74 | "keywords":[ 75 | { 76 | "text":"Solr", 77 | "type":"search engine" 78 | }, 79 | { 80 | "text":"feature", 81 | "type":"entity" 82 | } 83 | ] 84 | }, 85 | "body":{ 86 | "text":"Here I also write how useful Solr is...", 87 | "keywords":[ 88 | { 89 | "text":"Solr", 90 | "type":"search engine" 91 | } 92 | ] 93 | }, 94 | "comments":[ 95 | { 96 | "date":"2016-04-10T11:30:00Z", 97 | "author":"Bob", 98 | "text":"You forgot that useful Solr's feature!", 99 | "keywords":[ 100 | { 101 | "text":"Solr", 102 | "type":"search engine" 103 | }, 104 | { 105 | "text":"feature", 106 | "type":"entity" 107 | } 108 | ], 109 | "sentiment":"negative", 110 | "replies":[ 111 | { 112 | "date":"2016-04-10T12:00:00Z", 113 | "author":"Dave", 114 | "text":"But it only appeared in Solr 5.5, after the post was written", 115 | "keywords":[ 116 | { 117 | "text":"Solr", 118 | "type":"search engine" 119 | }, 120 | { 121 | "text":"Solr 5.5", 122 | "type":"search engine" 123 | } 124 | ], 125 | "sentiment":"neutral" 126 | } 127 | ] 128 | }, 129 | { 130 | "date":"2015-12-12T05:00:00Z", 131 | "author":"Sri", 132 | "text":"Elasticsearch had it earlier than Solr", 133 | "keywords":[ 134 | { 135 | "text":"Elasticsearch", 136 | "type":"search engine" 137 | }, 138 | { 139 | "text":"Solr", 140 | "type":"search engine" 141 | } 142 | ], 143 | "sentiment":"negative" 144 | } 145 | ] 146 | } 147 | ] 148 | } 149 | -------------------------------------------------------------------------------- /scripts/convert_data2solrjson.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import sys, os, getopt 4 | import json 5 | import codecs 6 | import uuid 7 | import time 8 | 9 | ############################### 10 | ##Sample of the output format: 11 | ## [ 12 | ## {id : book2, type_s:book, title_t : "Snow Crash", author_s : "Neal Stephenson", 13 | ## cat_s:sci-fi, pubyear_i:1992, publisher_s:Bantam, 14 | 
##       _childDocuments_ : [
##         { id: book2_c1, type_s:review, review_dt:"2015-01-03T14:30:00Z",
##           stars_i:5, author_s:yonik,
##           comment_t:"Ahead of its time... I wonder if it helped inspire The Matrix?",
##           _childDocuments_:[
##             {id: book2_c1_e1, type_s:entity, text:"The Matrix", type:"movie" }
##           ]
##         }, ...
##       ]
##
##
#####################################

# Name of the field that stores the "<level>.<dotted path>" type label of a document.
TYPE_FIELD_NAME = "path"


def make_uid():
    """Return a short numeric string used as an id (or id suffix).

    The value is the first (at most) five decimal digits of the last field of
    a random UUID4.

    NOTE(review): there are at most 10**5 distinct values, so ids can collide
    on larger inputs; confirm this is acceptable for the target Solr core.
    """
    return str(uuid.uuid4().fields[-1])[:5]


## load from .json
def load_from_json(fname):
    """Load and return the JSON content of the UTF-8 file `fname`.

    If `fname` is not an existing file, a message is written to stderr and the
    process exits with status -1. The file is closed before returning.
    """
    if not os.path.isfile(fname):
        sys.stderr.write("No .json file found... Exiting\n")
        sys.exit(-1)
    # `with` guarantees the handle is released even if parsing fails.
    with codecs.open(fname, encoding='utf-8') as fin:
        return json.load(fin)
##eof load_from_json()


## dump to .json
def dump_to_json(obj, fname):
    """Write `obj` to `fname` as UTF-8, 4-space indented, non-ASCII-preserving JSON."""
    with codecs.open(fname, encoding='utf-8', mode='w') as fout:
        json.dump(obj, fout, ensure_ascii=False, indent=4, separators=(',', ': '))
##eof dump_to_json()


def path_to_str(path):
    """Return the label prefix for a child of the document located at `path`.

    The result is "<len(path)+1>." followed by every path element, each
    terminated by a dot, e.g. ["blog-posts", "body"] -> "3.blog-posts.body.".
    """
    return str(len(path) + 1) + "." + "".join(e + "." for e in path)


# Debug switches: when print_flag is true, paths shorter than N are printed.
print_flag = False
N = 10


def _is_list_of_docs(v):
    """True for a non-empty list whose first element is a dict (a list of objects)."""
    return isinstance(v, list) and len(v) > 0 and isinstance(v[0], dict)


def _append_child(d_solr, top_id, top_type, path, k):
    """Create a child document for key `k`, attach it to `d_solr`, and return it."""
    if top_type != "":
        child = {TYPE_FIELD_NAME: path_to_str(path) + k}
    else:
        child = {TYPE_FIELD_NAME: k}
    child["id"] = top_id + "-" + make_uid()
    d_solr.setdefault("_childDocuments_", []).append(child)
    return child


def reformat_to_solr_with_path(d_original, d_solr, top_id, top_type, path):
    """Recursively copy `d_original` into the Solr nested-document dict `d_solr`.

    Leaf values are copied as plain fields. Every dict value, and every
    element of a list of dicts, becomes an entry of `d_solr["_childDocuments_"]`
    carrying a TYPE_FIELD_NAME label and an id of the form "<top_id>-<uid>".

    Parameters:
        d_original -- source dict (one JSON object).
        d_solr     -- destination dict, filled in place.
        top_id     -- id of the top-level document; prefix of all child ids.
        top_type   -- type name of the document being processed.
        path       -- list of type names of the ancestors of this document.

    `top_type` is pushed onto `path` for the duration of the call and removed
    again before returning, so `path` always reflects the real nesting and the
    produced labels do not depend on the iteration order of the dict keys.
    """
    path.append(top_type)
    if len(path) < N and print_flag:
        print(path)

    try:
        for k, v in d_original.items():
            if isinstance(v, dict):
                # a single nested object
                nested = [v]
            elif _is_list_of_docs(v):
                # a list of nested objects
                nested = v
            else:
                # PREPROCESSING OF THE LEAF FIELDS #

                #adds "raw" (not tokenized) field just for the "text" field
                # only for Query 2
                #if k == "text":
                #   k1 = "text_s"

                #conversion of date
                #if k == "date":
                    ## Date in Solr:
                    ## 1972-05-20T17:33:18Z
                    #k = "date_tdt"
                    #v = long(v)
                    #v = time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(v/1000.))
                #else:
                    #add "raw" field for every field
                    #k1 = k+"_ss"
                    #d_solr[k1] = v

                #adding "_t_s" suffix
                #for further conversion into *_t and *_s through CopyField rules
                #k = k+"_ts"

                d_solr[k] = v
                continue

            for d_obj in nested:
                child = _append_child(d_solr, top_id, top_type, path, k)
                reformat_to_solr_with_path(d_obj, child, top_id, k, path)
    finally:
        # restore the caller's view of the path
        path.pop()
#eo reformat_to_solr_with_path


#wraps the format conversion process
def convert_to_solr(fin_nm, fout_nm):
    """Convert the JSON file `fin_nm` to Solr nested-document JSON in `fout_nm`.

    The input is expected to map each top-level document type to a list of
    documents; every such document becomes one top-level Solr document labelled
    "1.<type>" with a fresh id.
    """
    obj = load_from_json(fin_nm)

    data_solr = []
    #keys of the input are the types of top-level documents
    for top_doctype in obj.keys():
        for top_doc in obj[top_doctype]:
            top_id = make_uid()
            top_doc_solr = {TYPE_FIELD_NAME: "1." + top_doctype, "id": top_id}
            reformat_to_solr_with_path(top_doc, top_doc_solr, top_id, top_doctype, [])
            data_solr.append(top_doc_solr)

    dump_to_json(data_solr, fout_nm)
#eo convert_to_solr(fin_nm, fout_nm)


def main(argv):
    """Command-line entry point: parse -i/--ifile and -o/--ofile and run the conversion.

    Exits with status 2 on a bad option or when either file name is missing,
    and with status 0 after printing the usage for -h.
    """
    usage = 'Usage: {script} -i <inputfile> -o <outputfile>'.format(script=argv[0])
    inputfile_nm = ''
    outputfile_nm = ''

    try:
        opts, args = getopt.getopt(argv[1:], "hi:o:", ["ifile=", "ofile="])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit()
        elif opt in ('-i', "--ifile"):
            inputfile_nm = arg
        elif opt in ("-o", "--ofile"):
            outputfile_nm = arg

    # both file names are required; fail early with the usage text
    if not inputfile_nm or not outputfile_nm:
        print(usage)
        sys.exit(2)

    print('Input file is " %s' % inputfile_nm)
    print('Output file is " %s' % outputfile_nm)

    convert_to_solr(inputfile_nm, outputfile_nm)


if __name__ == "__main__":
    main(sys.argv)
18 | }, 19 | { 20 | "path": "2.blog-posts.title", 21 | "_childDocuments_": [ 22 | { 23 | "path": "3.blog-posts.title.keywords", 24 | "type": "search engine", 25 | "id": "91701-73494", 26 | "text": "Solr" 27 | }, 28 | { 29 | "path": "3.blog-posts.title.keywords", 30 | "type": "entity", 31 | "id": "91701-57714", 32 | "text": "Search Engine" 33 | } 34 | ], 35 | "id": "91701-69789", 36 | "text": "My Post #1: About Solr and Other Search Engines" 37 | }, 38 | { 39 | "sentiment": "positive", 40 | "author": "Bob", 41 | "text": "Great post about Solr", 42 | "_childDocuments_": [ 43 | { 44 | "path": "3.blog-posts.comments.keywords", 45 | "type": "search engine", 46 | "id": "91701-14067", 47 | "text": "Solr" 48 | }, 49 | { 50 | "sentiment": "positive", 51 | "author": "Dave", 52 | "text": "Yeah, I like Solr too", 53 | "_childDocuments_": [ 54 | { 55 | "path": "4.blog-posts.comments.replies.keywords", 56 | "type": "search engine", 57 | "id": "91701-19743", 58 | "text": "Solr" 59 | } 60 | ], 61 | "date": "2015-04-10T12:00:00Z", 62 | "path": "3.blog-posts.comments.replies", 63 | "id": "91701-11855" 64 | }, 65 | { 66 | "sentiment": "negative", 67 | "author": "Sri", 68 | "text": "I disagree, I prefer Elasticsearch", 69 | "_childDocuments_": [ 70 | { 71 | "path": "4.blog-posts.comments.replies.keywords", 72 | "type": "search engine", 73 | "id": "91701-17221", 74 | "text": "Elasticsearch" 75 | } 76 | ], 77 | "date": "2015-04-12T05:00:00Z", 78 | "path": "3.blog-posts.comments.replies", 79 | "id": "91701-27479" 80 | } 81 | ], 82 | "date": "2015-04-10T11:30:00Z", 83 | "path": "2.blog-posts.comments", 84 | "id": "91701-23917" 85 | } 86 | ], 87 | "id": "91701", 88 | "author": "Alice" 89 | }, 90 | { 91 | "date": "2015-11-10T9:00:00Z", 92 | "path": "1.blog-posts", 93 | "_childDocuments_": [ 94 | { 95 | "path": "2.blog-posts.body", 96 | "_childDocuments_": [ 97 | { 98 | "path": "3.blog-posts.body.keywords", 99 | "type": "search engine", 100 | "id": "24397-33681", 101 | "text": "Solr" 102 | } 
103 | ], 104 | "id": "24397-18001", 105 | "text": "Here I also write how useful Solr is..." 106 | }, 107 | { 108 | "path": "2.blog-posts.title", 109 | "_childDocuments_": [ 110 | { 111 | "path": "3.blog-posts.title.keywords", 112 | "type": "search engine", 113 | "id": "24397-12687", 114 | "text": "Solr" 115 | }, 116 | { 117 | "path": "3.blog-posts.title.keywords", 118 | "type": "entity", 119 | "id": "24397-83428", 120 | "text": "feature" 121 | } 122 | ], 123 | "id": "24397-38220", 124 | "text": "About useful features of Solr" 125 | }, 126 | { 127 | "sentiment": "negative", 128 | "author": "Bob", 129 | "text": "You forgot that useful Solr's feature!", 130 | "_childDocuments_": [ 131 | { 132 | "path": "3.blog-posts.comments.keywords", 133 | "type": "search engine", 134 | "id": "24397-19289", 135 | "text": "Solr" 136 | }, 137 | { 138 | "path": "3.blog-posts.comments.keywords", 139 | "type": "entity", 140 | "id": "24397-27032", 141 | "text": "feature" 142 | }, 143 | { 144 | "sentiment": "neutral", 145 | "author": "Dave", 146 | "text": "But it only appeared in Solr 5.5, after the post was written", 147 | "_childDocuments_": [ 148 | { 149 | "path": "4.blog-posts.comments.replies.keywords", 150 | "type": "search engine", 151 | "id": "24397-15030", 152 | "text": "Solr" 153 | }, 154 | { 155 | "path": "4.blog-posts.comments.replies.keywords", 156 | "type": "search engine", 157 | "id": "24397-26659", 158 | "text": "Solr 5.5" 159 | } 160 | ], 161 | "date": "2016-04-10T12:00:00Z", 162 | "path": "3.blog-posts.comments.replies", 163 | "id": "24397-11623" 164 | } 165 | ], 166 | "date": "2016-04-10T11:30:00Z", 167 | "path": "2.blog-posts.comments", 168 | "id": "24397-22877" 169 | }, 170 | { 171 | "sentiment": "negative", 172 | "author": "Sri", 173 | "text": "Elasticsearch had it earlier than Solr", 174 | "_childDocuments_": [ 175 | { 176 | "path": "3.blog-posts.comments.keywords", 177 | "type": "search engine", 178 | "id": "24397-26016", 179 | "text": "Elasticsearch" 180 | }, 181 | 
#!/usr/bin/python

import sys, os, getopt
import json
import codecs
import uuid
import time

###############################
##Sample of the output format:
## [
##   {id : book2, type_s:book, title_t : "Snow Crash", author_s : "Neal Stephenson",
##     cat_s:sci-fi, pubyear_i:1992, publisher_s:Bantam,
##       _childDocuments_ : [
##         { id: book2_c1, type_s:review, review_dt:"2015-01-03T14:30:00Z",
##           stars_i:5, author_s:yonik,
##           comment_t:"Ahead of its time... I wonder if it helped inspire The Matrix?",
##           _childDocuments_:[
##             {id: book2_c1_e1, type_s:entity, text:"The Matrix", type:"movie" }
##           ]
##         }, ...
##       ]
##
##
#####################################

# Name of the field that stores the "<level>.<dotted path>" type label of a document.
TYPE_FIELD_NAME = "path"


def make_uid():
    """Return a short numeric string used as an id (or id suffix).

    The value is the first (at most) five decimal digits of the last field of
    a random UUID4.

    NOTE(review): there are at most 10**5 distinct values, so ids can collide
    on larger inputs; confirm this is acceptable for the target Solr core.
    """
    return str(uuid.uuid4().fields[-1])[:5]


## load from .json
def load_from_json(fname):
    """Load and return the JSON content of the UTF-8 file `fname`.

    If `fname` is not an existing file, a message is written to stderr and the
    process exits with status -1. The file is closed before returning.
    """
    if not os.path.isfile(fname):
        sys.stderr.write("No .json file found... Exiting\n")
        sys.exit(-1)
    # `with` guarantees the handle is released even if parsing fails.
    with codecs.open(fname, encoding='utf-8') as fin:
        return json.load(fin)
##eof load_from_json()


## dump to .json
def dump_to_json(obj, fname):
    """Write `obj` to `fname` as UTF-8, 4-space indented, non-ASCII-preserving JSON."""
    with codecs.open(fname, encoding='utf-8', mode='w') as fout:
        json.dump(obj, fout, ensure_ascii=False, indent=4, separators=(',', ': '))
##eof dump_to_json()


def path_to_str(path):
    """Return the label prefix for a child of the document located at `path`.

    The result is "<len(path)+1>." followed by every path element, each
    terminated by a dot, e.g. ["blog-posts", "body"] -> "3.blog-posts.body.".
    """
    return str(len(path) + 1) + "." + "".join(e + "." for e in path)


#global var for tracking unique fields from upper levels
unique_fields_map = {}


## generates unique field and adds parent field for faceting
def add_unique_and_parent_fields(d_solr, n, path, k, top_type):
    """Add faceting helper fields to child number `n` of `d_solr`.

    The child receives a field "<level>.<path>.<k>-id" holding a fresh uid.
    A child on the second level (direct child of a top-level document)
    records that uid in `unique_fields_map`; any deeper child additionally
    receives a copy of the "-id" field of its second-level ancestor, looked up
    in that map.

    Parameters:
        d_solr   -- parent Solr document owning the "_childDocuments_" list.
        n        -- index of the child inside d_solr["_childDocuments_"].
        path     -- type names from the top-level document down to the parent.
        k        -- key (type name) of the child.
        top_type -- type name of the parent; "" resets `unique_fields_map`.

    The second-level field name is derived from `path`, so it works for any
    top-level document type, not only "blog-posts".
    """
    if top_type == "":
        unique_fields_map.clear()
        return

    child = d_solr["_childDocuments_"][n]
    uid = make_uid()
    unique_field_name = path_to_str(path) + k + "-id"
    child[unique_field_name] = uid

    if len(path) < 2:
        #the child is on the 2nd level: remember its uid for its descendants
        unique_fields_map[unique_field_name] = uid
    else:
        #propagate the unique field of the branch stemming from level 2
        second_level_unique_field_name = path_to_str(path[:1]) + path[1] + "-id"
        child[second_level_unique_field_name] = unique_fields_map[second_level_unique_field_name]


# Debug switches: when print_flag is true, paths shorter than N are printed.
print_flag = False
N = 10


def _is_list_of_docs(v):
    """True for a non-empty list whose first element is a dict (a list of objects)."""
    return isinstance(v, list) and len(v) > 0 and isinstance(v[0], dict)


def _append_child(d_solr, top_id, top_type, path, k):
    """Create a child document for key `k`, attach it to `d_solr`, and return it."""
    if top_type != "":
        child = {TYPE_FIELD_NAME: path_to_str(path) + k}
    else:
        child = {TYPE_FIELD_NAME: k}
    child["id"] = top_id + "-" + make_uid()
    children = d_solr.setdefault("_childDocuments_", [])
    children.append(child)
    add_unique_and_parent_fields(d_solr, len(children) - 1, path, k, top_type)
    return child


def reformat_to_solr_with_path(d_original, d_solr, top_id, top_type, path):
    """Recursively copy `d_original` into the Solr nested-document dict `d_solr`.

    Leaf values are copied as plain fields. Every dict value, and every
    element of a list of dicts, becomes an entry of `d_solr["_childDocuments_"]`
    carrying a TYPE_FIELD_NAME label, an id of the form "<top_id>-<uid>" and
    the faceting fields added by add_unique_and_parent_fields().

    Parameters:
        d_original -- source dict (one JSON object).
        d_solr     -- destination dict, filled in place.
        top_id     -- id of the top-level document; prefix of all child ids.
        top_type   -- type name of the document being processed.
        path       -- list of type names of the ancestors of this document.

    `top_type` is pushed onto `path` for the duration of the call and removed
    again before returning, so `path` always reflects the real nesting and the
    produced labels do not depend on the iteration order of the dict keys.
    """
    path.append(top_type)
    if len(path) < N and print_flag:
        print(path)

    try:
        for k, v in d_original.items():
            if isinstance(v, dict):
                # a single nested object
                nested = [v]
            elif _is_list_of_docs(v):
                # a list of nested objects
                nested = v
            else:
                # PREPROCESSING OF THE LEAF FIELDS #

                #adds "raw" (not tokenized) field just for the "text" field
                # only for Query 2
                #if k == "text":
                #   k1 = "text_s"

                #conversion of date
                #if k == "date":
                    ## Date in Solr:
                    ## 1972-05-20T17:33:18Z
                    #k = "date_tdt"
                    #v = long(v)
                    #v = time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(v/1000.))
                #elif k == "path":
                    #k = k+"_s"
                #else:
                    #add "raw" field for every field
                    #k1 = k+"_ss"
                    #d_solr[k1] = v

                #adding "_t_s" suffix
                #for further conversion into *_t and *_s through CopyField rules
                #k = k+"_ts"

                d_solr[k] = v
                continue

            for d_obj in nested:
                child = _append_child(d_solr, top_id, top_type, path, k)
                reformat_to_solr_with_path(d_obj, child, top_id, k, path)
    finally:
        # restore the caller's view of the path
        path.pop()
#eo reformat_to_solr_with_path


#wraps the format conversion process
def convert_to_solr(fin_nm, fout_nm):
    """Convert the JSON file `fin_nm` to Solr nested-document JSON in `fout_nm`.

    The input is expected to map each top-level document type to a list of
    documents; every such document becomes one top-level Solr document labelled
    "1.<type>" with a fresh id.
    """
    obj = load_from_json(fin_nm)

    data_solr = []
    #keys of the input are the types of top-level documents
    for top_doctype in obj.keys():
        for top_doc in obj[top_doctype]:
            top_id = make_uid()
            top_doc_solr = {TYPE_FIELD_NAME: "1." + top_doctype, "id": top_id}
            reformat_to_solr_with_path(top_doc, top_doc_solr, top_id, top_doctype, [])
            data_solr.append(top_doc_solr)

    dump_to_json(data_solr, fout_nm)
#eo convert_to_solr(fin_nm, fout_nm)


def main(argv):
    """Command-line entry point: parse -i/--ifile and -o/--ofile and run the conversion.

    Exits with status 2 on a bad option or when either file name is missing,
    and with status 0 after printing the usage for -h.
    """
    usage = 'Usage: {script} -i <inputfile> -o <outputfile>'.format(script=argv[0])
    inputfile_nm = ''
    outputfile_nm = ''

    try:
        opts, args = getopt.getopt(argv[1:], "hi:o:", ["ifile=", "ofile="])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit()
        elif opt in ('-i', "--ifile"):
            inputfile_nm = arg
        elif opt in ("-o", "--ofile"):
            outputfile_nm = arg

    # both file names are required; fail early with the usage text
    if not inputfile_nm or not outputfile_nm:
        print(usage)
        sys.exit(2)

    print('Input file is " %s' % inputfile_nm)
    print('Output file is " %s' % outputfile_nm)

    convert_to_solr(inputfile_nm, outputfile_nm)


if __name__ == "__main__":
    main(sys.argv)
21 | }, 22 | { 23 | "path": "2.blog-posts.title", 24 | "_childDocuments_": [ 25 | { 26 | "text": "Solr", 27 | "3.blog-posts.title.keywords-id": "16242", 28 | "2.blog-posts.title-id": "62235", 29 | "path": "3.blog-posts.title.keywords", 30 | "type": "search engine", 31 | "id": "17252-18776" 32 | }, 33 | { 34 | "text": "Search Engine", 35 | "3.blog-posts.title.keywords-id": "37837", 36 | "2.blog-posts.title-id": "62235", 37 | "path": "3.blog-posts.title.keywords", 38 | "type": "entity", 39 | "id": "17252-52565" 40 | } 41 | ], 42 | "text": "My Post #1: About Solr and Other Search Engines", 43 | "id": "17252-18960", 44 | "2.blog-posts.title-id": "62235" 45 | }, 46 | { 47 | "sentiment": "positive", 48 | "author": "Bob", 49 | "text": "Great post about Solr", 50 | "_childDocuments_": [ 51 | { 52 | "text": "Solr", 53 | "2.blog-posts.comments-id": "32759", 54 | "path": "3.blog-posts.comments.keywords", 55 | "3.blog-posts.comments.keywords-id": "70412", 56 | "type": "search engine", 57 | "id": "17252-19514" 58 | }, 59 | { 60 | "sentiment": "positive", 61 | "author": "Dave", 62 | "text": "Yeah, I like Solr too", 63 | "_childDocuments_": [ 64 | { 65 | "text": "Solr", 66 | "4.blog-posts.comments.replies.keywords-id": "14028", 67 | "2.blog-posts.comments-id": "32759", 68 | "path": "4.blog-posts.comments.replies.keywords", 69 | "type": "search engine", 70 | "id": "17252-18477" 71 | } 72 | ], 73 | "2.blog-posts.comments-id": "32759", 74 | "3.blog-posts.comments.replies-id": "63952", 75 | "date": "2015-04-10T12:00:00Z", 76 | "path": "3.blog-posts.comments.replies", 77 | "id": "17252-53695" 78 | }, 79 | { 80 | "sentiment": "negative", 81 | "author": "Sri", 82 | "text": "I disagree, I prefer Elasticsearch", 83 | "_childDocuments_": [ 84 | { 85 | "text": "Elasticsearch", 86 | "4.blog-posts.comments.replies.keywords-id": "28848", 87 | "2.blog-posts.comments-id": "32759", 88 | "path": "4.blog-posts.comments.replies.keywords", 89 | "type": "search engine", 90 | "id": "17252-13445" 91 | } 
92 | ], 93 | "2.blog-posts.comments-id": "32759", 94 | "3.blog-posts.comments.replies-id": "80429", 95 | "date": "2015-04-12T05:00:00Z", 96 | "path": "3.blog-posts.comments.replies", 97 | "id": "17252-19383" 98 | } 99 | ], 100 | "2.blog-posts.comments-id": "32759", 101 | "date": "2015-04-10T11:30:00Z", 102 | "path": "2.blog-posts.comments", 103 | "id": "17252-24737" 104 | } 105 | ], 106 | "id": "17252", 107 | "author": "Alice" 108 | }, 109 | { 110 | "date": "2015-11-10T9:00:00Z", 111 | "path": "1.blog-posts", 112 | "_childDocuments_": [ 113 | { 114 | "path": "2.blog-posts.body", 115 | "2.blog-posts.body-id": "63622", 116 | "id": "24220-20881", 117 | "_childDocuments_": [ 118 | { 119 | "2.blog-posts.body-id": "63622", 120 | "text": "Solr", 121 | "3.blog-posts.body.keywords-id": "14615", 122 | "path": "3.blog-posts.body.keywords", 123 | "type": "search engine", 124 | "id": "24220-94803" 125 | } 126 | ], 127 | "text": "Here I also write how useful Solr is..." 128 | }, 129 | { 130 | "path": "2.blog-posts.title", 131 | "_childDocuments_": [ 132 | { 133 | "text": "Solr", 134 | "3.blog-posts.title.keywords-id": "10276", 135 | "2.blog-posts.title-id": "82579", 136 | "path": "3.blog-posts.title.keywords", 137 | "type": "search engine", 138 | "id": "24220-11828" 139 | }, 140 | { 141 | "text": "feature", 142 | "3.blog-posts.title.keywords-id": "26821", 143 | "2.blog-posts.title-id": "82579", 144 | "path": "3.blog-posts.title.keywords", 145 | "type": "entity", 146 | "id": "24220-68954" 147 | } 148 | ], 149 | "text": "About useful features of Solr", 150 | "id": "24220-21378", 151 | "2.blog-posts.title-id": "82579" 152 | }, 153 | { 154 | "sentiment": "negative", 155 | "author": "Bob", 156 | "text": "You forgot that useful Solr's feature!", 157 | "_childDocuments_": [ 158 | { 159 | "text": "Solr", 160 | "2.blog-posts.comments-id": "69776", 161 | "path": "3.blog-posts.comments.keywords", 162 | "3.blog-posts.comments.keywords-id": "25182", 163 | "type": "search engine", 164 | "id": 
"24220-17760" 165 | }, 166 | { 167 | "text": "feature", 168 | "2.blog-posts.comments-id": "69776", 169 | "path": "3.blog-posts.comments.keywords", 170 | "3.blog-posts.comments.keywords-id": "58954", 171 | "type": "entity", 172 | "id": "24220-26183" 173 | }, 174 | { 175 | "sentiment": "neutral", 176 | "author": "Dave", 177 | "text": "But it only appeared in Solr 5.5, after the post was written", 178 | "_childDocuments_": [ 179 | { 180 | "text": "Solr", 181 | "4.blog-posts.comments.replies.keywords-id": "83968", 182 | "2.blog-posts.comments-id": "69776", 183 | "path": "4.blog-posts.comments.replies.keywords", 184 | "type": "search engine", 185 | "id": "24220-20038" 186 | }, 187 | { 188 | "text": "Solr 5.5", 189 | "4.blog-posts.comments.replies.keywords-id": "24056", 190 | "2.blog-posts.comments-id": "69776", 191 | "path": "4.blog-posts.comments.replies.keywords", 192 | "type": "search engine", 193 | "id": "24220-19125" 194 | } 195 | ], 196 | "2.blog-posts.comments-id": "69776", 197 | "3.blog-posts.comments.replies-id": "24390", 198 | "date": "2016-04-10T12:00:00Z", 199 | "path": "3.blog-posts.comments.replies", 200 | "id": "24220-16949" 201 | } 202 | ], 203 | "2.blog-posts.comments-id": "69776", 204 | "date": "2016-04-10T11:30:00Z", 205 | "path": "2.blog-posts.comments", 206 | "id": "24220-26951" 207 | }, 208 | { 209 | "sentiment": "negative", 210 | "author": "Sri", 211 | "text": "Elasticsearch had it earlier than Solr", 212 | "_childDocuments_": [ 213 | { 214 | "text": "Elasticsearch", 215 | "2.blog-posts.comments-id": "18323", 216 | "path": "3.blog-posts.comments.keywords", 217 | "3.blog-posts.comments.keywords-id": "12980", 218 | "type": "search engine", 219 | "id": "24220-52064" 220 | }, 221 | { 222 | "text": "Solr", 223 | "2.blog-posts.comments-id": "18323", 224 | "path": "3.blog-posts.comments.keywords", 225 | "3.blog-posts.comments.keywords-id": "24594", 226 | "type": "search engine", 227 | "id": "24220-14872" 228 | } 229 | ], 230 | "2.blog-posts.comments-id": 
"18323", 231 | "date": "2015-12-12T05:00:00Z", 232 | "path": "2.blog-posts.comments", 233 | "id": "24220-20919" 234 | } 235 | ], 236 | "id": "24220", 237 | "author": "Aadit" 238 | } 239 | ] 240 | --------------------------------------------------------------------------------