├── README.md
└── converter.py


/README.md:
--------------------------------------------------------------------------------
1 | # quranic-corpus
2 | 


--------------------------------------------------------------------------------
/converter.py:
--------------------------------------------------------------------------------
 1 | import os, re
 2 | import json
 3 | import pdb
 4 | import collections
 5 | from django.utils.text import slugify
 6 | from bs4 import BeautifulSoup
 7 | 
# Not referenced by any code visible in this file.
omitted_dirs = []
# Provenance metadata; main() copies these into every generated work record
# ('sourceLink' and 'source' keys) and uses `source` in the output file name.
sourceLink = 'http://tanzil.info'
source = 'Tanzil Quran Text'
# Not referenced by any code visible in this file.
works = []
12 | 
def jaggedListToDict(text):
	"""Turn a (possibly nested) list into an OrderedDict keyed by position.

	Each element of ``text`` is stored under its index rendered as a string
	("0", "1", ...), in ascending index order. Elements that are lists are
	handled specially:

	* a list with exactly one element is replaced by that element as-is
	  (it is not converted further, even if it is itself a list);
	* any other list (including an empty one) is converted recursively.

	Non-list elements are stored unchanged.
	"""
	converted = collections.OrderedDict()
	# enumerate() already yields indices in ascending order, so insertion
	# order is the numeric key order.
	for position, entry in enumerate(text):
		if isinstance(entry, list):
			if len(entry) == 1:
				entry = entry[0]
			else:
				entry = jaggedListToDict(entry)
		converted[str(position)] = entry
	return converted
23 | 
24 | 
def main():
	"""Convert every ``.xml`` file under the current directory to JSON.

	Walks ``.`` recursively, printing one line per directory visited. For each
	file whose name ends in ``.xml`` it parses the file, collects the text of
	every ``<verse>`` inside each ``<chapter>``, converts that list of lists
	with ``jaggedListToDict`` and writes it, together with fixed metadata, as
	JSON into the ``cltk_json`` directory (created if missing).

	NOTE: the output file name is built only from constant metadata (source,
	English title, language), so every XML file found is written to the same
	JSON file and a later file overwrites an earlier one.
	"""
	out_dir = 'cltk_json'
	# exist_ok avoids the race between a separate exists() check and creation.
	os.makedirs(out_dir, exist_ok=True)

	for root, _dirs, files in os.walk("."):
		# os.walk joins path components with os.sep, so split on that rather
		# than a hard-coded '/'.
		depth = len(root.split(os.sep)) - 1
		print(depth * '---', os.path.basename(root))
		for fname in files:
			# Require the dot so names that merely end in "xml" are skipped.
			if not fname.endswith('.xml'):
				continue

			# Decode explicitly instead of relying on the platform locale.
			# NOTE(review): assumes the input XML is UTF-8 encoded - confirm.
			with open(os.path.join(root, fname), encoding='utf-8') as xml_file:
				soup = BeautifulSoup(xml_file.read(), 'html.parser')

			# One inner list of verse strings per chapter, in document order.
			text = [
				[verse.text for verse in chapter.find_all('verse')]
				for chapter in soup.find_all('chapter')
			]

			work = {
				# Title literal kept verbatim from the original source.
				'originalTitle': 'القرآن‎‎',
				'englishTitle': 'The Holy Quran',
				'author': '(Original Book)',
				'source': source,
				'sourceLink': sourceLink,
				'language': 'arabic',
				'text': jaggedListToDict(text),
			}

			out_name = (
				slugify(work['source'])
				+ '__' + slugify(work['englishTitle'][0:140])
				+ '__' + slugify(work['language'])
				+ '.json'
			)
			out_name = out_name.replace(" ", "")

			with open(os.path.join(out_dir, out_name), 'w', encoding='utf-8') as json_file:
				json.dump(work, json_file)
62 | 
# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
	main()
65 | 


--------------------------------------------------------------------------------