├── index.sh
├── Readme.md
├── stopwords.txt
├── k-way-merge.py
├── search.py
└── myWikiIndexer.py

/index.sh:
--------------------------------------------------------------------------------
python myWikiIndexer.py $1 $2
python k-way-merge.py
--------------------------------------------------------------------------------

/Readme.md:
--------------------------------------------------------------------------------
# Wikipedia Search Engine

A search engine built over the full English Wikipedia corpus (~75 GB).

## Performance

### For Queries of

1. fewer than **3** words, results are fetched in **under 1 s**
2. between **3 and 7** words, results are fetched in **around 5 s**

## Code Files

1. **myWikiIndexer.py** - XML parsing, text preprocessing, and construction of the initial (partial) index files.
2. **k-way-merge.py** - k-way merge of the partial index files into the final index; also creates the secondary index.
3. **search.py** - Query processing and ranking.

## Execution of Code

### Prerequisites

#### Required Directories

1. **index** - The initial (partial) index files are created here.
2. **finalIndex** - The partial indexes are merged into this directory; the secondary index is stored here as well.

#### Required Files

1. **stopwords.txt** - A text file listing the stop words, placed in the same directory as the code.
2. **wiki_dump.xml** - The XML file containing the full Wikipedia dump.

### Execution

1. Run **myWikiIndexer.py** with the path to the dump and the index folder as command-line arguments.
2. Run **k-way-merge.py** - sorts and merges the partial indexes and creates the secondary index.
3. Run **search.py** - starts an infinite loop that reads queries from standard input.

Steps 1 and 2 can also be run together with `bash index.sh`, passing the same two arguments.

### Types of Queries

1. **Normal query** - Any sequence of words without field prefixes, e.g. "Sachin Tendulkar".

2. **Field query** - Fields are single lowercase letters (b, i, c, t, r, e) followed by a colon, and the field terms are space separated, as in the sketch and example query below.
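A minimal sketch of how such a query can be split into (field-letter, term) pairs; the actual implementation is `parseQuery` in **search.py**, which additionally cleans, stems, and stop-word-filters each term. The helper name below is hypothetical:

```python
# Sketch only: mirrors the field mapping used in search.py, without the
# cleaning/stemming/stop-word steps applied to each term.
fieldDict = {'title': 't', 'body': 'b', 'infobox': 'i',
             'category': 'c', 'ref': 'r', 'ext': 'e'}

def splitFieldQuery(query):
    pairs = []
    for token in query.split():
        if ':' not in token:          # bare words default to the body field
            token = 'body:' + token
        field, term = token.split(':', 1)
        pairs.append((fieldDict[field], term.lower()))
    return pairs

print(splitFieldQuery('body:sachin infobox:2003 category:sports'))
# -> [('b', 'sachin'), ('i', '2003'), ('c', 'sports')]
```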
eg: “body:sachin infobox:2003 category:sports” 43 | -------------------------------------------------------------------------------- /stopwords.txt: -------------------------------------------------------------------------------- 1 | i 2 | me 3 | my 4 | myself 5 | we 6 | our 7 | ours 8 | ourselves 9 | you 10 | you're 11 | you've 12 | you'll 13 | you'd 14 | your 15 | yours 16 | yourself 17 | yourselves 18 | he 19 | him 20 | his 21 | himself 22 | she 23 | she's 24 | her 25 | hers 26 | herself 27 | it 28 | it's 29 | its 30 | itself 31 | they 32 | them 33 | their 34 | theirs 35 | themselves 36 | what 37 | which 38 | who 39 | whom 40 | this 41 | that 42 | that'll 43 | these 44 | those 45 | am 46 | is 47 | are 48 | was 49 | were 50 | be 51 | been 52 | being 53 | have 54 | has 55 | had 56 | having 57 | do 58 | does 59 | did 60 | doing 61 | a 62 | an 63 | the 64 | and 65 | but 66 | if 67 | or 68 | because 69 | as 70 | until 71 | while 72 | of 73 | at 74 | by 75 | for 76 | with 77 | about 78 | against 79 | between 80 | into 81 | through 82 | during 83 | before 84 | after 85 | above 86 | below 87 | to 88 | from 89 | up 90 | down 91 | in 92 | out 93 | on 94 | off 95 | over 96 | under 97 | again 98 | further 99 | then 100 | once 101 | here 102 | there 103 | when 104 | where 105 | why 106 | how 107 | all 108 | any 109 | both 110 | each 111 | few 112 | more 113 | most 114 | other 115 | some 116 | such 117 | no 118 | nor 119 | not 120 | only 121 | own 122 | same 123 | so 124 | than 125 | too 126 | very 127 | s 128 | t 129 | can 130 | will 131 | just 132 | don 133 | don't 134 | should 135 | should've 136 | now 137 | d 138 | ll 139 | m 140 | o 141 | re 142 | ve 143 | y 144 | ain 145 | aren 146 | aren't 147 | couldn 148 | couldn't 149 | didn 150 | didn't 151 | doesn 152 | doesn't 153 | hadn 154 | hadn't 155 | hasn 156 | hasn't 157 | haven 158 | haven't 159 | isn 160 | isn't 161 | ma 162 | mightn 163 | mightn't 164 | mustn 165 | mustn't 166 | needn 167 | needn't 168 | shan 169 | shan't 170 | shouldn 171 | shouldn't 172 | wasn 173 | wasn't 174 | weren 175 | weren't 176 | won 177 | won't 178 | wouldn 179 | wouldn't 180 | -------------------------------------------------------------------------------- /k-way-merge.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | import os 5 | import timeit 6 | from glob import glob 7 | from collections import defaultdict 8 | from heapq import heapify, heappush, heappop 9 | 10 | 11 | splittedIndexFolder = './index' 12 | mergedIndexFolder = './finalIndex' 13 | 14 | numberOfMergedIndexfile = 0 15 | chunkSize = 5000 16 | secondaryIndex = defaultdict() 17 | invertedIndex = defaultdict() 18 | splittedFilePathList = glob(splittedIndexFolder + '/*') 19 | numOfSplittedFiles = len(splittedFilePathList) 20 | processedFiles = [0 for _ in range(numOfSplittedFiles)] 21 | filePointers = dict() 22 | currentRowofFile = dict() 23 | kWayHeap = list() 24 | termDict = dict() 25 | total = 0 26 | 27 | start = timeit.default_timer() 28 | 29 | 30 | def writeIndextofile(): 31 | global numberOfMergedIndexfile 32 | numberOfMergedIndexfile += 1 33 | fileName = mergedIndexFolder + '/index' \ 34 | + str(numberOfMergedIndexfile) + '.txt' 35 | firstWord = True 36 | with open(fileName, 'w') as fp: 37 | for i in sorted(invertedIndex): 38 | if firstWord: 39 | secondaryIndex[i] = numberOfMergedIndexfile 40 | firstWord = False 41 | fp.write(str(i) + '=' + invertedIndex[i] + '\n') 42 | 43 | 44 | def 
writeSecondaryIndex(): 45 | fileName = mergedIndexFolder + '/secondaryIndex.txt' 46 | with open(fileName, 'w') as fp: 47 | for i in sorted(secondaryIndex): 48 | fp.write(str(i) + '\n') 49 | 50 | 51 | def writePrimaryIndex(): 52 | global numberOfMergedIndexfile 53 | numberOfMergedIndexfile += 1 54 | fileName = mergedIndexFolder + '/index' \ 55 | + str(numberOfMergedIndexfile) + '.txt' 56 | firstWord = True 57 | with open(fileName, 'w') as fp: 58 | for i in sorted(invertedIndex): 59 | if firstWord: 60 | secondaryIndex[i] = numberOfMergedIndexfile 61 | firstWord = False 62 | fp.write(str(i) + '=' + invertedIndex[i] + '\n') 63 | 64 | 65 | def kWayMerge(): 66 | global total 67 | for i in range(numOfSplittedFiles): 68 | processedFiles[i] = 1 69 | try: 70 | filePointers[i] = open(splittedFilePathList[i], 'r') 71 | except: 72 | pass 73 | currentRowofFile[i] = filePointers[i].readline() 74 | termDict[i] = currentRowofFile[i].strip().split('=') 75 | if termDict[i][0] not in kWayHeap: 76 | heappush(kWayHeap, termDict[i][0]) 77 | 78 | while True: 79 | if processedFiles.count(0) == numOfSplittedFiles: 80 | break 81 | else: 82 | total += 1 83 | word = heappop(kWayHeap) 84 | for i in range(numOfSplittedFiles): 85 | if processedFiles[i] and termDict[i][0] == word: 86 | if word not in invertedIndex: 87 | invertedIndex[word] = termDict[i][1] 88 | else: 89 | invertedIndex[word] += ',' + termDict[i][1] 90 | 91 | currentRowofFile[i] = \ 92 | filePointers[i].readline().strip() 93 | 94 | if currentRowofFile[i]: 95 | termDict[i] = currentRowofFile[i].split('=') 96 | if termDict[i][0] not in kWayHeap: 97 | heappush(kWayHeap, termDict[i][0]) 98 | else: 99 | processedFiles[i] = 0 100 | filePointers[i].close() 101 | os.remove(splittedFilePathList[i]) 102 | if total >= chunkSize: 103 | total = 0 104 | writePrimaryIndex() 105 | invertedIndex.clear() 106 | 107 | 108 | kWayMerge() 109 | writePrimaryIndex() 110 | writeSecondaryIndex() 111 | stop = timeit.default_timer() 112 | print(stop - start) 113 | -------------------------------------------------------------------------------- /search.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | import re 6 | import timeit 7 | from collections import defaultdict 8 | from operator import itemgetter 9 | from nltk.stem import PorterStemmer 10 | from bisect import bisect 11 | from math import log10 12 | 13 | ps = PorterStemmer() 14 | noDocs = 0 15 | docToTitle = dict() 16 | stopWords = set() 17 | secondaryIndex = list() 18 | invertedIndex = defaultdict(lambda : defaultdict(lambda : \ 19 | defaultdict(int))) 20 | fieldDict = { 21 | 'title': 't', 22 | 'body': 'b', 23 | 'infobox': 'i', 24 | 'category': 'c', 25 | 'ref': 'r', 26 | 'ext': 'e', 27 | } 28 | weight = { 29 | 't': 500, 30 | 'b': 1, 31 | 'i': 50, 32 | 'c': 50, 33 | 'r': 50, 34 | 'e': 50, 35 | } 36 | fields = ['title:', 'body:', 'infobox:', 'category:', 'ref:'] 37 | 38 | 39 | def readDocTitleMap(): 40 | global docToTitle, noDocs 41 | with open('./docToTitle.txt', 'r') as f: 42 | for line in f: 43 | (docID, titleMap) = line.split('#') 44 | docToTitle[docID] = titleMap 45 | noDocs += 1 46 | 47 | 48 | def readStopwords(): 49 | global stopWords 50 | try: 51 | f = open('stopwords.txt', 'r') 52 | for line in f: 53 | stopWords.add(line.strip()) 54 | except: 55 | print("Can't find stopwords.txt") 56 | sys.exit(1) 57 | 58 | 59 | def readSecondaryIndex(): 60 | global secondaryIndex 61 | try: 62 | f = 
open('finalIndex/secondaryIndex.txt', 'r') 63 | for line in f: 64 | secondaryIndex.append(line.split()[0]) 65 | except: 66 | print("Can't find the secondary index file in 'finalIndex' Folder.") 67 | sys.exit(1) 68 | 69 | 70 | def processIndex(): 71 | fileName = '/index' + str() + '.txt' 72 | firstWord = True 73 | with open(fileName, 'w') as fp: 74 | for i in sorted(invertedIndex): 75 | if firstWord: 76 | secondaryIndex[i] = 1 77 | firstWord = False 78 | fp.write(str(i) + '=' + invertedIndex[i] + '\n') 79 | 80 | 81 | def cleanText(text): 82 | 83 | # Regular Expression to remove {{cite **}} or {{vcite **}} 84 | 85 | reg = re.compile(r'{{v?cite(.*?)}}', re.DOTALL) 86 | text = reg.sub('', text) 87 | 88 | # Regular Expression to remove Punctuation 89 | 90 | reg = re.compile(r'[.,;_()"/\']', re.DOTALL) 91 | text = reg.sub(' ', text) 92 | 93 | # Regular Expression to remove [[file:]] 94 | 95 | reg = re.compile(r'\[\[file:(.*?)\]\]', re.DOTALL) 96 | text = reg.sub('', text) 97 | 98 | # Regular Expression to remove <..> tags from text 99 | 100 | reg = re.compile(r'<(.*?)>', re.DOTALL) 101 | text = reg.sub('', text) 102 | 103 | # Regular Expression to remove non ASCII char 104 | 105 | reg = re.compile(r'[^\x00-\x7F]+', re.DOTALL) 106 | text = reg.sub(' ', text) 107 | return text 108 | 109 | 110 | def getFileNumber(word): 111 | position = bisect(secondaryIndex, word) 112 | if position - 1 >= 0 and secondaryIndex[position - 1] == word: 113 | if position - 1 != 0: 114 | position -= 1 115 | if position + 1 == len(secondaryIndex) \ 116 | and secondaryIndex[position] == word: 117 | position += 1 118 | return position 119 | 120 | 121 | def getPostingList(word): 122 | position = getFileNumber(word) 123 | primaryFile = 'finalIndex/index' + str(position) + '.txt' 124 | file = open(primaryFile, 'r') 125 | data = file.readlines() 126 | low = 0 127 | high = len(data) 128 | mid = int() 129 | while low <= high: 130 | mid = int(low + (high - low) / 2) 131 | cur = data[mid].split('=')[0] 132 | if cur == word: 133 | break 134 | elif cur < word: 135 | low = mid + 1 136 | else: 137 | high = mid - 1 138 | return data[mid].split('=')[1].split(',') 139 | 140 | 141 | def printResult(lengthFreq): 142 | lengthFreq = sorted(lengthFreq.items(), key=lambda item: item[1], reverse=True)[0:10] 143 | for tup in lengthFreq: 144 | (docId, _) = tup 145 | print("=> ", docToTitle[docId], end='') 146 | 147 | 148 | def parseQuery(queryText, isFieldQuery): 149 | wordRegEx = re.compile(r'[\ \.\-\:\&\$\!\*\+\%\,\@]+', re.DOTALL) 150 | if isFieldQuery: 151 | fieldQList = list() 152 | queryList = queryText.split() 153 | for word in queryList: 154 | if ':' not in word: 155 | word = 'body:' + word 156 | (cat, content) = word.split(':') 157 | content = cleanText(content) 158 | content = ps.stem(content) 159 | content = wordRegEx.sub('', content) 160 | if len(content) > 0 and content.isalnum and content \ 161 | not in stopWords: 162 | fieldQList.append((content, fieldDict[cat])) 163 | finalDict = defaultdict(int) 164 | for (word, category) in fieldQList: 165 | postingListAll = getPostingList(word) 166 | postingList = [i for i in postingListAll if category in i] 167 | if len(postingList) < 2: 168 | postingList = postingListAll 169 | numDoc = len(postingList) 170 | idf = log10(noDocs / numDoc) 171 | for pl in postingList: 172 | docId, freqList = pl.split(":") 173 | categoryFreq = freqList.split("#") 174 | tf = 0 175 | for cf in categoryFreq: 176 | cat = cf[0] 177 | freq = int(cf[1:]) 178 | tf += (freq * weight[cat]) 179 | finalDict[docId] += 
float(log10(1 + tf)) * float(idf) 180 | else: 181 | 182 | queryText = cleanText(queryText) 183 | tokenList = queryText.split(' ') 184 | tokenList = [wordRegEx.sub('', i) for i in tokenList] 185 | finalTokens = list() 186 | for tok in tokenList: 187 | val = ps.stem(tok) 188 | if len(val) > 0 and val.isalnum and val not in stopWords: 189 | finalTokens.append(val) 190 | finalDict = defaultdict(int) 191 | for word in finalTokens: 192 | postingList = getPostingList(word) 193 | numDoc = len(postingList) 194 | idf = log10(noDocs / numDoc) 195 | for pl in postingList: 196 | docId, freqList = pl.split(":") 197 | categoryFreq = freqList.split("#") 198 | tf = 0 199 | for cf in categoryFreq: 200 | cat = cf[0] 201 | freq = int(cf[1:]) 202 | tf += (freq * weight[cat]) 203 | finalDict[docId] += float(log10(1 + tf)) * float(idf) 204 | printResult(finalDict) 205 | 206 | 207 | def search(path_to_index, query): 208 | global fields 209 | isFieldQuery = False 210 | for f in fields: 211 | if f in query: 212 | isFieldQuery = True 213 | break 214 | parseQuery(query, isFieldQuery) 215 | 216 | 217 | def main(): 218 | path_to_index = './finalIndex/' 219 | while True: 220 | query = input('\nEnter Query: ') 221 | print('+++++++++++++++++++++++++++++++++++') 222 | start = timeit.default_timer() 223 | search(path_to_index, query) 224 | end = timeit.default_timer() 225 | print('\nTook', end - start, 'sec\n') 226 | print('+++++++++++++++++++++++++++++++++++') 227 | 228 | 229 | if __name__ == '__main__': 230 | print ('reading DoctitleMap') 231 | readDocTitleMap() 232 | print ('reading secondary Index') 233 | readSecondaryIndex() 234 | readStopwords() 235 | try: 236 | main() 237 | except: 238 | print ('''\n\nThank You..\n''') 239 | -------------------------------------------------------------------------------- /myWikiIndexer.py: -------------------------------------------------------------------------------- 1 | # startting phase 2 2 | import sys 3 | import timeit 4 | import re 5 | import spacy 6 | from xml.sax import parse 7 | from xml.sax import ContentHandler 8 | from collections import defaultdict 9 | from nltk.stem import PorterStemmer 10 | 11 | 12 | ps = PorterStemmer() 13 | stemmingMap = dict() 14 | fileLim = 25000 15 | dumpFile = sys.argv[1] 16 | path_to_index = sys.argv[2] 17 | 18 | if len(sys.argv) != 3: 19 | print("Arguments invalid") 20 | print("Run using : bash index.sh ") 21 | sys.exit(1) 22 | 23 | documentTitleMapping = open("./docToTitle.txt", "w") 24 | 25 | ''' 26 | Dictionary structure 27 | { 28 | word : { 29 | docID :{ 30 | t1 : cnt1, 31 | t2 : cnt2 32 | } 33 | docId : { 34 | t1 : cnt3, 35 | t2 : cnt4 36 | } 37 | . 38 | . 39 | . 40 | } 41 | . 42 | . 43 | . 
44 | } 45 | ''' 46 | invertedIndex = defaultdict(lambda: defaultdict(lambda: defaultdict(int))) 47 | 48 | stopwordsList = set() 49 | with open("stopwords.txt", 'r') as f: 50 | for line in f: 51 | line = line.strip() 52 | stopwordsList.add(line) 53 | 54 | # Regular Expression to remove Brackets and other meta characters from title 55 | regExp1 = re.compile(r"[~`!@#$%\-^*+{\[}\]\|\\<>/?,]", re.DOTALL) 56 | # Regular Expression for Categories 57 | catRegExp = r'\[\[category:(.*?)\]\]' 58 | # Regular Expression for Infobox 59 | infoRegExp = r'{{infobox(.*?)}}' 60 | # Regular Expression for References 61 | referenesRegExp = r'== ?references ?==(.*?)==' 62 | # Regular Expression to remove Infobox 63 | regExp2 = re.compile(infoRegExp, re.DOTALL) 64 | # Regular Expression to remove references 65 | regExp3 = re.compile(referenesRegExp, re.DOTALL) 66 | # Regular Expression to remove junk from text 67 | regExp4 = re.compile(r"[~`!@#$%\-^*+{\[}\]\|\\<>/?,]", re.DOTALL) 68 | 69 | 70 | def cleanText(text): 71 | # Regular Expression to remove {{cite **}} or {{vcite **}} 72 | reg = re.compile(r'{{v?cite(.*?)}}', re.DOTALL) 73 | text = reg.sub('', text) 74 | # Regular Expression to remove Punctuation 75 | reg = re.compile(r'[.,;_()/\"\'\=]', re.DOTALL) 76 | text = reg.sub(' ', text) 77 | # Regular Expression to remove [[file:]] 78 | reg = re.compile(r'\[\[file:(.*?)\]\]', re.DOTALL) 79 | text = reg.sub('', text) 80 | # Regular Expression to remove <..> tags from text 81 | reg = re.compile(r'<(.*?)>', re.DOTALL) 82 | text = reg.sub('', text) 83 | # Remove Non ASCII characters 84 | reg = re.compile(r'[^\x00-\x7F]+', re.DOTALL) 85 | text = reg.sub(' ', text) 86 | return text 87 | 88 | 89 | def addToIndex(wordList, docID, t): 90 | for word in wordList: 91 | word = word.strip() 92 | word = re.sub(r'[\ \.\-\:\&\$\!\*\+\%\,\@]+',"",word) 93 | if len(word) >= 3 and len(word) <= 500 and word not in stopwordsList: 94 | if word not in stemmingMap.keys(): 95 | stemmingMap[word] = ps.stem(word) 96 | word = stemmingMap[word] 97 | if word not in stopwordsList: 98 | if word in invertedIndex: 99 | if docID in invertedIndex[word]: 100 | if t in invertedIndex[word][docID]: 101 | invertedIndex[word][docID][t] += 1 102 | else: 103 | invertedIndex[word][docID][t] = 1 104 | else: 105 | invertedIndex[word][docID] = {t: 1} 106 | else: 107 | invertedIndex[word] = dict({docID: {t: 1}}) 108 | 109 | 110 | def processBuffer(text, docID, isTitle): 111 | global path_to_index 112 | text = text.lower() 113 | text = cleanText(text) 114 | if isTitle == True: 115 | regExp1.sub(' ', text) 116 | words = text.split() 117 | tokens = list() 118 | for word in words: 119 | if word not in stopwordsList: 120 | tokens.append(word.strip()) 121 | 122 | addToIndex(tokens, docID, "t") 123 | else: 124 | infobox = list() 125 | categories = list() 126 | external = list() 127 | references = list() 128 | 129 | externalLinkIndex = 0 130 | categoryIndex = len(text) 131 | 132 | categories = re.findall(catRegExp, text, flags=re.MULTILINE) 133 | 134 | lines = text.split('\n') 135 | flag = 1 136 | for i in range(len(lines)): 137 | if '{{infobox' in lines[i]: 138 | flag = 0 139 | temp = lines[i].split('{{infobox')[1:] 140 | infobox.extend(temp) 141 | while True: 142 | if(i >= len(lines)): 143 | break 144 | if '{{' in lines[i]: 145 | count = lines[i].count('{{') 146 | flag += count 147 | if '}}' in lines[i]: 148 | count = lines[i].count('}}') 149 | flag -= count 150 | if flag <= 0: 151 | break 152 | i += 1 153 | if(i < len(lines)): 154 | infobox.append(lines[i]) 155 | 
if flag <= 0: 156 | text = '\n'.join(lines[i+1:]) 157 | break 158 | 159 | try: 160 | externalLinkIndex = text.index('==external links==')+20 161 | except: 162 | pass 163 | 164 | if externalLinkIndex == 0: 165 | try: 166 | externalLinkIndex = text.index('== external links ==')+22 167 | except: 168 | pass 169 | 170 | try: 171 | categoryIndex = text.index('[[category:') 172 | except: 173 | pass 174 | 175 | if externalLinkIndex != 0: 176 | external = text[externalLinkIndex:categoryIndex] 177 | external = re.findall(r'\[(.*?)\]', external, flags=re.MULTILINE) 178 | 179 | references = re.findall(referenesRegExp, text, flags=re.DOTALL) 180 | 181 | if externalLinkIndex != 0: 182 | text = text[0:externalLinkIndex-20] 183 | 184 | text = regExp3.sub('', text) 185 | text = regExp4.sub(' ', text) 186 | words = text.split() 187 | addToIndex(words, docID, "b") 188 | 189 | categories = ' '.join(categories) 190 | categories = regExp4.sub(' ', categories) 191 | categories = categories.split() 192 | addToIndex(categories, docID, "c") 193 | 194 | external = ' '.join(external) 195 | external = regExp4.sub(' ', external) 196 | external = external.split() 197 | addToIndex(external, docID, "e") 198 | 199 | references = ' '.join(references) 200 | references = regExp4.sub(' ', references) 201 | references = references.split() 202 | addToIndex(references, docID, "r") 203 | 204 | for infoList in infobox: 205 | tokenList = list() 206 | tokenList = re.findall(r'\d+|[\w]+', infoList, re.DOTALL) 207 | tokenList = ' '.join(tokenList) 208 | tokenList = regExp4.sub(' ', tokenList) 209 | tokenList = tokenList.split() 210 | addToIndex(tokenList, docID, "i") 211 | 212 | if docID%fileLim == 0: 213 | f = open(path_to_index + "/" + str(docID) + ".txt", "w") 214 | for key, val in sorted(invertedIndex.items()): 215 | s = str(key)+"=" 216 | for k, v in sorted(val.items()): 217 | s += str(k) + ":" 218 | for k1, v1 in v.items(): 219 | s = s + str(k1) + str(v1) + "#" 220 | s = s[:-1]+"," 221 | f.write(s[:-1]+"\n") 222 | f.close() 223 | invertedIndex.clear() 224 | stemmingMap.clear() 225 | 226 | 227 | class WikiContentHandler(ContentHandler): 228 | def __init__(self): 229 | self.docID = 0 230 | self.isTitle = False 231 | self.flag = False 232 | self.title = "" 233 | self.buffer = "" 234 | 235 | def characters(self, content): 236 | self.buffer = self.buffer + content 237 | 238 | def startElement(self, element, attributes): 239 | if element == "title": 240 | self.buffer = "" 241 | self.isTitle = True 242 | self.flag = True 243 | if element == "page": 244 | self.docID += 1 245 | if element == "text": 246 | self.buffer = "" 247 | if element == "id" and self.flag: 248 | self.buffer = "" 249 | 250 | def endElement(self, element): 251 | if element == "title": 252 | processBuffer(self.buffer, self.docID, self.isTitle) 253 | self.isTitle = False 254 | self.title = self.buffer 255 | self.buffer = "" 256 | elif element == "text": 257 | processBuffer(self.buffer, self.docID, self.isTitle) 258 | self.buffer = "" 259 | elif element == "id" and self.flag == True: 260 | try: 261 | documentTitleMapping.write(str(self.docID)+"#"+self.title+"\n") 262 | except: 263 | documentTitleMapping.write( 264 | str(self.docID)+"#"+self.title.encode('utf-8')+"\n") 265 | self.flag = False 266 | self.buffer = "" 267 | 268 | 269 | def main(): 270 | parse(dumpFile, WikiContentHandler()) 271 | f = open(path_to_index + "/19567269.txt", "w") 272 | for key, val in sorted(invertedIndex.items()): 273 | s = str(key)+"=" 274 | for k, v in sorted(val.items()): 275 | s += str(k) + 
":" 276 | for k1, v1 in v.items(): 277 | s = s + str(k1) + str(v1) + "#" 278 | s = s[:-1]+"," 279 | f.write(s[:-1]+"\n") 280 | f.close() 281 | invertedIndex.clear() 282 | stemmingMap.clear() 283 | 284 | 285 | if __name__ == "__main__": 286 | start = timeit.default_timer() 287 | main() 288 | stop = timeit.default_timer() 289 | print(stop - start) 290 | --------------------------------------------------------------------------------