# Repository layout (from the original dump):
#   ├── README.md
#   └── converter.py
#
# /README.md
#   # quranic-corpus
#
# /converter.py
import os
import re
import json
import pdb
import collections

omitted_dirs = []
sourceLink = 'http://tanzil.info'
source = 'Tanzil Quran Text'
works = []


def _collapse_value(value):
    """Convert one element of a jagged list into its dict-tree form.

    Non-list values are returned unchanged.  A list holding exactly one
    element is replaced by that element, and the same rule is then applied
    to the element itself so that a nested list never leaks into the
    output.  Any other list becomes an index-keyed mapping.
    """
    if not isinstance(value, list):
        return value
    if len(value) == 1:
        # Re-check the unwrapped element: it may itself be a list.
        return _collapse_value(value[0])
    return jaggedListToDict(value)


def jaggedListToDict(text):
    """Turn a (possibly nested) list into an ordered, index-keyed mapping.

    Args:
        text: A list whose items are leaf values or further lists.

    Returns:
        A ``collections.OrderedDict`` whose keys are the stringified
        positions (``"0"``, ``"1"``, ...) in ascending numeric order.
        List items are converted recursively; a one-element list is
        replaced by its single element instead of a one-key mapping.
    """
    # enumerate() already yields ascending indices, so building the
    # OrderedDict directly preserves numeric order without a sort pass.
    return collections.OrderedDict(
        (str(index), _collapse_value(item)) for index, item in enumerate(text)
    )


def main():
    """Convert every ``*xml`` file under the current directory to CLTK JSON.

    Walks ``.`` recursively, printing each directory with a ``---`` prefix
    per nesting level.  For each file whose name ends with ``xml`` the text
    of every ``<verse>`` inside every ``<chapter>`` is collected, wrapped in
    a metadata record, and written to the ``cltk_json`` directory.

    The output file name is derived only from the constant source, title
    and language fields, so several XML inputs all write to the same file.
    """
    # Imported here rather than at module level so that jaggedListToDict can
    # be imported (and tested) without Django or BeautifulSoup installed.
    from bs4 import BeautifulSoup
    from django.utils.text import slugify

    os.makedirs('cltk_json', exist_ok=True)

    for root, dirs, files in os.walk("."):
        # Nesting depth = number of separators in the walked path; os.sep
        # keeps this correct on platforms that do not use '/'.
        depth = root.count(os.sep)
        print(depth * '---', os.path.basename(root))
        for fname in files:
            if not fname.endswith('xml'):
                continue

            # The corpus is Arabic text: read it as UTF-8 explicitly instead
            # of relying on the platform's default encoding.
            with open(os.path.join(root, fname), encoding='utf-8') as f:
                soup = BeautifulSoup(f.read(), 'html.parser')

            # One inner list of verse strings per chapter.
            text = [
                [verse.text for verse in chapter.find_all('verse')]
                for chapter in soup.find_all('chapter')
            ]

            # NOTE(review): the originalTitle literal appears to end with
            # invisible directional-mark characters; kept as in the original.
            work = {
                'originalTitle': 'القرآن‎‎',
                'englishTitle': 'The Holy Quran',
                'author': '(Original Book)',
                'source': source,
                'sourceLink': sourceLink,
                'language': 'arabic',
                'text': jaggedListToDict(text),
            }

            out_name = (
                slugify(work['source'])
                + '__' + slugify(work['englishTitle'][0:140])
                + '__' + slugify(work['language'])
                + '.json'
            )
            out_name = out_name.replace(" ", "")

            with open(os.path.join('cltk_json', out_name), 'w', encoding='utf-8') as f:
                json.dump(work, f)


if __name__ == '__main__':
    main()