├── index.sh
├── Readme.md
├── stopwords.txt
├── k-way-merge.py
├── search.py
└── myWikiIndexer.py

/index.sh:
--------------------------------------------------------------------------------
python myWikiIndexer.py $1 $2
python k-way-merge.py
--------------------------------------------------------------------------------

/Readme.md:
--------------------------------------------------------------------------------
# Wikipedia Search Engine

A search engine built over the full English Wikipedia corpus (~75 GB).

## Performance

### For Queries of

1. fewer than **3** words, results are fetched in **under 1 s**
2. between **3 and 7** words, results are fetched in **around 5 s**

## Code Files

1. **myWikiIndexer.py** - XML parsing, text preprocessing, and construction of the initial (partial) index files.
2. **k-way-merge.py** - k-way merge of the partial index files into the final index; also creates the secondary index.
3. **search.py** - Query processing and ranking.

## Execution of Code

### Prerequisites

#### Required Directories

1. **index** - The initial (partial) index files are created here.
2. **finalIndex** - The partial indexes are merged into this directory; the secondary index is stored here as well.

#### Required Files

1. **stopwords.txt** - A text file listing the stop words, placed in the same directory as the code.
2. **wiki_dump.xml** - The XML file containing the full Wikipedia dump.

### Execution

1. Run **myWikiIndexer.py** with the path to the dump and the index folder as command-line arguments.
2. Run **k-way-merge.py** - sorts and merges the partial indexes and creates the secondary index.
3. Run **search.py** - starts an infinite loop that reads queries from standard input.

Steps 1 and 2 can also be run together with `bash index.sh`, passing the same two arguments.

### Types of Queries

1. **Normal query** - Any sequence of words without field prefixes, e.g. "Sachin Tendulkar".

2. **Field query** - Fields are single lowercase letters (b, i, c, t, r, e) followed by a colon, and the field terms are space separated, as in the sketch and example query below.
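A minimal sketch of how such a query can be split into (field-letter, term) pairs; the actual implementation is `parseQuery` in **search.py**, which additionally cleans, stems, and stop-word-filters each term. The helper name below is hypothetical:

```python
# Sketch only: mirrors the field mapping used in search.py, without the
# cleaning/stemming/stop-word steps applied to each term.
fieldDict = {'title': 't', 'body': 'b', 'infobox': 'i',
             'category': 'c', 'ref': 'r', 'ext': 'e'}

def splitFieldQuery(query):
    pairs = []
    for token in query.split():
        if ':' not in token:          # bare words default to the body field
            token = 'body:' + token
        field, term = token.split(':', 1)
        pairs.append((fieldDict[field], term.lower()))
    return pairs

print(splitFieldQuery('body:sachin infobox:2003 category:sports'))
# -> [('b', 'sachin'), ('i', '2003'), ('c', 'sports')]
```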
eg: “body:sachin infobox:2003 category:sports” 43 | -------------------------------------------------------------------------------- /stopwords.txt: -------------------------------------------------------------------------------- 1 | i 2 | me 3 | my 4 | myself 5 | we 6 | our 7 | ours 8 | ourselves 9 | you 10 | you're 11 | you've 12 | you'll 13 | you'd 14 | your 15 | yours 16 | yourself 17 | yourselves 18 | he 19 | him 20 | his 21 | himself 22 | she 23 | she's 24 | her 25 | hers 26 | herself 27 | it 28 | it's 29 | its 30 | itself 31 | they 32 | them 33 | their 34 | theirs 35 | themselves 36 | what 37 | which 38 | who 39 | whom 40 | this 41 | that 42 | that'll 43 | these 44 | those 45 | am 46 | is 47 | are 48 | was 49 | were 50 | be 51 | been 52 | being 53 | have 54 | has 55 | had 56 | having 57 | do 58 | does 59 | did 60 | doing 61 | a 62 | an 63 | the 64 | and 65 | but 66 | if 67 | or 68 | because 69 | as 70 | until 71 | while 72 | of 73 | at 74 | by 75 | for 76 | with 77 | about 78 | against 79 | between 80 | into 81 | through 82 | during 83 | before 84 | after 85 | above 86 | below 87 | to 88 | from 89 | up 90 | down 91 | in 92 | out 93 | on 94 | off 95 | over 96 | under 97 | again 98 | further 99 | then 100 | once 101 | here 102 | there 103 | when 104 | where 105 | why 106 | how 107 | all 108 | any 109 | both 110 | each 111 | few 112 | more 113 | most 114 | other 115 | some 116 | such 117 | no 118 | nor 119 | not 120 | only 121 | own 122 | same 123 | so 124 | than 125 | too 126 | very 127 | s 128 | t 129 | can 130 | will 131 | just 132 | don 133 | don't 134 | should 135 | should've 136 | now 137 | d 138 | ll 139 | m 140 | o 141 | re 142 | ve 143 | y 144 | ain 145 | aren 146 | aren't 147 | couldn 148 | couldn't 149 | didn 150 | didn't 151 | doesn 152 | doesn't 153 | hadn 154 | hadn't 155 | hasn 156 | hasn't 157 | haven 158 | haven't 159 | isn 160 | isn't 161 | ma 162 | mightn 163 | mightn't 164 | mustn 165 | mustn't 166 | needn 167 | needn't 168 | shan 169 | shan't 170 | shouldn 171 | shouldn't 172 | wasn 173 | wasn't 174 | weren 175 | weren't 176 | won 177 | won't 178 | wouldn 179 | wouldn't 180 | -------------------------------------------------------------------------------- /k-way-merge.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | import os 5 | import timeit 6 | from glob import glob 7 | from collections import defaultdict 8 | from heapq import heapify, heappush, heappop 9 | 10 | 11 | splittedIndexFolder = './index' 12 | mergedIndexFolder = './finalIndex' 13 | 14 | numberOfMergedIndexfile = 0 15 | chunkSize = 5000 16 | secondaryIndex = defaultdict() 17 | invertedIndex = defaultdict() 18 | splittedFilePathList = glob(splittedIndexFolder + '/*') 19 | numOfSplittedFiles = len(splittedFilePathList) 20 | processedFiles = [0 for _ in range(numOfSplittedFiles)] 21 | filePointers = dict() 22 | currentRowofFile = dict() 23 | kWayHeap = list() 24 | termDict = dict() 25 | total = 0 26 | 27 | start = timeit.default_timer() 28 | 29 | 30 | def writeIndextofile(): 31 | global numberOfMergedIndexfile 32 | numberOfMergedIndexfile += 1 33 | fileName = mergedIndexFolder + '/index' \ 34 | + str(numberOfMergedIndexfile) + '.txt' 35 | firstWord = True 36 | with open(fileName, 'w') as fp: 37 | for i in sorted(invertedIndex): 38 | if firstWord: 39 | secondaryIndex[i] = numberOfMergedIndexfile 40 | firstWord = False 41 | fp.write(str(i) + '=' + invertedIndex[i] + '\n') 42 | 43 | 44 | def 
writeSecondaryIndex(): 45 | fileName = mergedIndexFolder + '/secondaryIndex.txt' 46 | with open(fileName, 'w') as fp: 47 | for i in sorted(secondaryIndex): 48 | fp.write(str(i) + '\n') 49 | 50 | 51 | def writePrimaryIndex(): 52 | global numberOfMergedIndexfile 53 | numberOfMergedIndexfile += 1 54 | fileName = mergedIndexFolder + '/index' \ 55 | + str(numberOfMergedIndexfile) + '.txt' 56 | firstWord = True 57 | with open(fileName, 'w') as fp: 58 | for i in sorted(invertedIndex): 59 | if firstWord: 60 | secondaryIndex[i] = numberOfMergedIndexfile 61 | firstWord = False 62 | fp.write(str(i) + '=' + invertedIndex[i] + '\n') 63 | 64 | 65 | def kWayMerge(): 66 | global total 67 | for i in range(numOfSplittedFiles): 68 | processedFiles[i] = 1 69 | try: 70 | filePointers[i] = open(splittedFilePathList[i], 'r') 71 | except: 72 | pass 73 | currentRowofFile[i] = filePointers[i].readline() 74 | termDict[i] = currentRowofFile[i].strip().split('=') 75 | if termDict[i][0] not in kWayHeap: 76 | heappush(kWayHeap, termDict[i][0]) 77 | 78 | while True: 79 | if processedFiles.count(0) == numOfSplittedFiles: 80 | break 81 | else: 82 | total += 1 83 | word = heappop(kWayHeap) 84 | for i in range(numOfSplittedFiles): 85 | if processedFiles[i] and termDict[i][0] == word: 86 | if word not in invertedIndex: 87 | invertedIndex[word] = termDict[i][1] 88 | else: 89 | invertedIndex[word] += ',' + termDict[i][1] 90 | 91 | currentRowofFile[i] = \ 92 | filePointers[i].readline().strip() 93 | 94 | if currentRowofFile[i]: 95 | termDict[i] = currentRowofFile[i].split('=') 96 | if termDict[i][0] not in kWayHeap: 97 | heappush(kWayHeap, termDict[i][0]) 98 | else: 99 | processedFiles[i] = 0 100 | filePointers[i].close() 101 | os.remove(splittedFilePathList[i]) 102 | if total >= chunkSize: 103 | total = 0 104 | writePrimaryIndex() 105 | invertedIndex.clear() 106 | 107 | 108 | kWayMerge() 109 | writePrimaryIndex() 110 | writeSecondaryIndex() 111 | stop = timeit.default_timer() 112 | print(stop - start) 113 | -------------------------------------------------------------------------------- /search.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | import re 6 | import timeit 7 | from collections import defaultdict 8 | from operator import itemgetter 9 | from nltk.stem import PorterStemmer 10 | from bisect import bisect 11 | from math import log10 12 | 13 | ps = PorterStemmer() 14 | noDocs = 0 15 | docToTitle = dict() 16 | stopWords = set() 17 | secondaryIndex = list() 18 | invertedIndex = defaultdict(lambda : defaultdict(lambda : \ 19 | defaultdict(int))) 20 | fieldDict = { 21 | 'title': 't', 22 | 'body': 'b', 23 | 'infobox': 'i', 24 | 'category': 'c', 25 | 'ref': 'r', 26 | 'ext': 'e', 27 | } 28 | weight = { 29 | 't': 500, 30 | 'b': 1, 31 | 'i': 50, 32 | 'c': 50, 33 | 'r': 50, 34 | 'e': 50, 35 | } 36 | fields = ['title:', 'body:', 'infobox:', 'category:', 'ref:'] 37 | 38 | 39 | def readDocTitleMap(): 40 | global docToTitle, noDocs 41 | with open('./docToTitle.txt', 'r') as f: 42 | for line in f: 43 | (docID, titleMap) = line.split('#') 44 | docToTitle[docID] = titleMap 45 | noDocs += 1 46 | 47 | 48 | def readStopwords(): 49 | global stopWords 50 | try: 51 | f = open('stopwords.txt', 'r') 52 | for line in f: 53 | stopWords.add(line.strip()) 54 | except: 55 | print("Can't find stopwords.txt") 56 | sys.exit(1) 57 | 58 | 59 | def readSecondaryIndex(): 60 | global secondaryIndex 61 | try: 62 | f = 
open('finalIndex/secondaryIndex.txt', 'r') 63 | for line in f: 64 | secondaryIndex.append(line.split()[0]) 65 | except: 66 | print("Can't find the secondary index file in 'finalIndex' Folder.") 67 | sys.exit(1) 68 | 69 | 70 | def processIndex(): 71 | fileName = '/index' + str() + '.txt' 72 | firstWord = True 73 | with open(fileName, 'w') as fp: 74 | for i in sorted(invertedIndex): 75 | if firstWord: 76 | secondaryIndex[i] = 1 77 | firstWord = False 78 | fp.write(str(i) + '=' + invertedIndex[i] + '\n') 79 | 80 | 81 | def cleanText(text): 82 | 83 | # Regular Expression to remove {{cite **}} or {{vcite **}} 84 | 85 | reg = re.compile(r'{{v?cite(.*?)}}', re.DOTALL) 86 | text = reg.sub('', text) 87 | 88 | # Regular Expression to remove Punctuation 89 | 90 | reg = re.compile(r'[.,;_()"/\']', re.DOTALL) 91 | text = reg.sub(' ', text) 92 | 93 | # Regular Expression to remove [[file:]] 94 | 95 | reg = re.compile(r'\[\[file:(.*?)\]\]', re.DOTALL) 96 | text = reg.sub('', text) 97 | 98 | # Regular Expression to remove <..> tags from text 99 | 100 | reg = re.compile(r'<(.*?)>', re.DOTALL) 101 | text = reg.sub('', text) 102 | 103 | # Regular Expression to remove non ASCII char 104 | 105 | reg = re.compile(r'[^\x00-\x7F]+', re.DOTALL) 106 | text = reg.sub(' ', text) 107 | return text 108 | 109 | 110 | def getFileNumber(word): 111 | position = bisect(secondaryIndex, word) 112 | if position - 1 >= 0 and secondaryIndex[position - 1] == word: 113 | if position - 1 != 0: 114 | position -= 1 115 | if position + 1 == len(secondaryIndex) \ 116 | and secondaryIndex[position] == word: 117 | position += 1 118 | return position 119 | 120 | 121 | def getPostingList(word): 122 | position = getFileNumber(word) 123 | primaryFile = 'finalIndex/index' + str(position) + '.txt' 124 | file = open(primaryFile, 'r') 125 | data = file.readlines() 126 | low = 0 127 | high = len(data) 128 | mid = int() 129 | while low <= high: 130 | mid = int(low + (high - low) / 2) 131 | cur = data[mid].split('=')[0] 132 | if cur == word: 133 | break 134 | elif cur < word: 135 | low = mid + 1 136 | else: 137 | high = mid - 1 138 | return data[mid].split('=')[1].split(',') 139 | 140 | 141 | def printResult(lengthFreq): 142 | lengthFreq = sorted(lengthFreq.items(), key=lambda item: item[1], reverse=True)[0:10] 143 | for tup in lengthFreq: 144 | (docId, _) = tup 145 | print("=> ", docToTitle[docId], end='') 146 | 147 | 148 | def parseQuery(queryText, isFieldQuery): 149 | wordRegEx = re.compile(r'[\ \.\-\:\&\$\!\*\+\%\,\@]+', re.DOTALL) 150 | if isFieldQuery: 151 | fieldQList = list() 152 | queryList = queryText.split() 153 | for word in queryList: 154 | if ':' not in word: 155 | word = 'body:' + word 156 | (cat, content) = word.split(':') 157 | content = cleanText(content) 158 | content = ps.stem(content) 159 | content = wordRegEx.sub('', content) 160 | if len(content) > 0 and content.isalnum and content \ 161 | not in stopWords: 162 | fieldQList.append((content, fieldDict[cat])) 163 | finalDict = defaultdict(int) 164 | for (word, category) in fieldQList: 165 | postingListAll = getPostingList(word) 166 | postingList = [i for i in postingListAll if category in i] 167 | if len(postingList) < 2: 168 | postingList = postingListAll 169 | numDoc = len(postingList) 170 | idf = log10(noDocs / numDoc) 171 | for pl in postingList: 172 | docId, freqList = pl.split(":") 173 | categoryFreq = freqList.split("#") 174 | tf = 0 175 | for cf in categoryFreq: 176 | cat = cf[0] 177 | freq = int(cf[1:]) 178 | tf += (freq * weight[cat]) 179 | finalDict[docId] += 
float(log10(1 + tf)) * float(idf) 180 | else: 181 | 182 | queryText = cleanText(queryText) 183 | tokenList = queryText.split(' ') 184 | tokenList = [wordRegEx.sub('', i) for i in tokenList] 185 | finalTokens = list() 186 | for tok in tokenList: 187 | val = ps.stem(tok) 188 | if len(val) > 0 and val.isalnum and val not in stopWords: 189 | finalTokens.append(val) 190 | finalDict = defaultdict(int) 191 | for word in finalTokens: 192 | postingList = getPostingList(word) 193 | numDoc = len(postingList) 194 | idf = log10(noDocs / numDoc) 195 | for pl in postingList: 196 | docId, freqList = pl.split(":") 197 | categoryFreq = freqList.split("#") 198 | tf = 0 199 | for cf in categoryFreq: 200 | cat = cf[0] 201 | freq = int(cf[1:]) 202 | tf += (freq * weight[cat]) 203 | finalDict[docId] += float(log10(1 + tf)) * float(idf) 204 | printResult(finalDict) 205 | 206 | 207 | def search(path_to_index, query): 208 | global fields 209 | isFieldQuery = False 210 | for f in fields: 211 | if f in query: 212 | isFieldQuery = True 213 | break 214 | parseQuery(query, isFieldQuery) 215 | 216 | 217 | def main(): 218 | path_to_index = './finalIndex/' 219 | while True: 220 | query = input('\nEnter Query: ') 221 | print('+++++++++++++++++++++++++++++++++++') 222 | start = timeit.default_timer() 223 | search(path_to_index, query) 224 | end = timeit.default_timer() 225 | print('\nTook', end - start, 'sec\n') 226 | print('+++++++++++++++++++++++++++++++++++') 227 | 228 | 229 | if __name__ == '__main__': 230 | print ('reading DoctitleMap') 231 | readDocTitleMap() 232 | print ('reading secondary Index') 233 | readSecondaryIndex() 234 | readStopwords() 235 | try: 236 | main() 237 | except: 238 | print ('''\n\nThank You..\n''') 239 | -------------------------------------------------------------------------------- /myWikiIndexer.py: -------------------------------------------------------------------------------- 1 | # startting phase 2 2 | import sys 3 | import timeit 4 | import re 5 | import spacy 6 | from xml.sax import parse 7 | from xml.sax import ContentHandler 8 | from collections import defaultdict 9 | from nltk.stem import PorterStemmer 10 | 11 | 12 | ps = PorterStemmer() 13 | stemmingMap = dict() 14 | fileLim = 25000 15 | dumpFile = sys.argv[1] 16 | path_to_index = sys.argv[2] 17 | 18 | if len(sys.argv) != 3: 19 | print("Arguments invalid") 20 | print("Run using : bash index.sh ") 21 | sys.exit(1) 22 | 23 | documentTitleMapping = open("./docToTitle.txt", "w") 24 | 25 | ''' 26 | Dictionary structure 27 | { 28 | word : { 29 | docID :{ 30 | t1 : cnt1, 31 | t2 : cnt2 32 | } 33 | docId : { 34 | t1 : cnt3, 35 | t2 : cnt4 36 | } 37 | . 38 | . 39 | . 40 | } 41 | . 42 | . 43 | . 
44 | } 45 | ''' 46 | invertedIndex = defaultdict(lambda: defaultdict(lambda: defaultdict(int))) 47 | 48 | stopwordsList = set() 49 | with open("stopwords.txt", 'r') as f: 50 | for line in f: 51 | line = line.strip() 52 | stopwordsList.add(line) 53 | 54 | # Regular Expression to remove Brackets and other meta characters from title 55 | regExp1 = re.compile(r"[~`!@#$%\-^*+{\[}\]\|\\<>/?,]", re.DOTALL) 56 | # Regular Expression for Categories 57 | catRegExp = r'\[\[category:(.*?)\]\]' 58 | # Regular Expression for Infobox 59 | infoRegExp = r'{{infobox(.*?)}}' 60 | # Regular Expression for References 61 | referenesRegExp = r'== ?references ?==(.*?)==' 62 | # Regular Expression to remove Infobox 63 | regExp2 = re.compile(infoRegExp, re.DOTALL) 64 | # Regular Expression to remove references 65 | regExp3 = re.compile(referenesRegExp, re.DOTALL) 66 | # Regular Expression to remove junk from text 67 | regExp4 = re.compile(r"[~`!@#$%\-^*+{\[}\]\|\\<>/?,]", re.DOTALL) 68 | 69 | 70 | def cleanText(text): 71 | # Regular Expression to remove {{cite **}} or {{vcite **}} 72 | reg = re.compile(r'{{v?cite(.*?)}}', re.DOTALL) 73 | text = reg.sub('', text) 74 | # Regular Expression to remove Punctuation 75 | reg = re.compile(r'[.,;_()/\"\'\=]', re.DOTALL) 76 | text = reg.sub(' ', text) 77 | # Regular Expression to remove [[file:]] 78 | reg = re.compile(r'\[\[file:(.*?)\]\]', re.DOTALL) 79 | text = reg.sub('', text) 80 | # Regular Expression to remove <..> tags from text 81 | reg = re.compile(r'<(.*?)>', re.DOTALL) 82 | text = reg.sub('', text) 83 | # Remove Non ASCII characters 84 | reg = re.compile(r'[^\x00-\x7F]+', re.DOTALL) 85 | text = reg.sub(' ', text) 86 | return text 87 | 88 | 89 | def addToIndex(wordList, docID, t): 90 | for word in wordList: 91 | word = word.strip() 92 | word = re.sub(r'[\ \.\-\:\&\$\!\*\+\%\,\@]+',"",word) 93 | if len(word) >= 3 and len(word) <= 500 and word not in stopwordsList: 94 | if word not in stemmingMap.keys(): 95 | stemmingMap[word] = ps.stem(word) 96 | word = stemmingMap[word] 97 | if word not in stopwordsList: 98 | if word in invertedIndex: 99 | if docID in invertedIndex[word]: 100 | if t in invertedIndex[word][docID]: 101 | invertedIndex[word][docID][t] += 1 102 | else: 103 | invertedIndex[word][docID][t] = 1 104 | else: 105 | invertedIndex[word][docID] = {t: 1} 106 | else: 107 | invertedIndex[word] = dict({docID: {t: 1}}) 108 | 109 | 110 | def processBuffer(text, docID, isTitle): 111 | global path_to_index 112 | text = text.lower() 113 | text = cleanText(text) 114 | if isTitle == True: 115 | regExp1.sub(' ', text) 116 | words = text.split() 117 | tokens = list() 118 | for word in words: 119 | if word not in stopwordsList: 120 | tokens.append(word.strip()) 121 | 122 | addToIndex(tokens, docID, "t") 123 | else: 124 | infobox = list() 125 | categories = list() 126 | external = list() 127 | references = list() 128 | 129 | externalLinkIndex = 0 130 | categoryIndex = len(text) 131 | 132 | categories = re.findall(catRegExp, text, flags=re.MULTILINE) 133 | 134 | lines = text.split('\n') 135 | flag = 1 136 | for i in range(len(lines)): 137 | if '{{infobox' in lines[i]: 138 | flag = 0 139 | temp = lines[i].split('{{infobox')[1:] 140 | infobox.extend(temp) 141 | while True: 142 | if(i >= len(lines)): 143 | break 144 | if '{{' in lines[i]: 145 | count = lines[i].count('{{') 146 | flag += count 147 | if '}}' in lines[i]: 148 | count = lines[i].count('}}') 149 | flag -= count 150 | if flag <= 0: 151 | break 152 | i += 1 153 | if(i < len(lines)): 154 | infobox.append(lines[i]) 155 | 
if flag <= 0: 156 | text = '\n'.join(lines[i+1:]) 157 | break 158 | 159 | try: 160 | externalLinkIndex = text.index('==external links==')+20 161 | except: 162 | pass 163 | 164 | if externalLinkIndex == 0: 165 | try: 166 | externalLinkIndex = text.index('== external links ==')+22 167 | except: 168 | pass 169 | 170 | try: 171 | categoryIndex = text.index('[[category:') 172 | except: 173 | pass 174 | 175 | if externalLinkIndex != 0: 176 | external = text[externalLinkIndex:categoryIndex] 177 | external = re.findall(r'\[(.*?)\]', external, flags=re.MULTILINE) 178 | 179 | references = re.findall(referenesRegExp, text, flags=re.DOTALL) 180 | 181 | if externalLinkIndex != 0: 182 | text = text[0:externalLinkIndex-20] 183 | 184 | text = regExp3.sub('', text) 185 | text = regExp4.sub(' ', text) 186 | words = text.split() 187 | addToIndex(words, docID, "b") 188 | 189 | categories = ' '.join(categories) 190 | categories = regExp4.sub(' ', categories) 191 | categories = categories.split() 192 | addToIndex(categories, docID, "c") 193 | 194 | external = ' '.join(external) 195 | external = regExp4.sub(' ', external) 196 | external = external.split() 197 | addToIndex(external, docID, "e") 198 | 199 | references = ' '.join(references) 200 | references = regExp4.sub(' ', references) 201 | references = references.split() 202 | addToIndex(references, docID, "r") 203 | 204 | for infoList in infobox: 205 | tokenList = list() 206 | tokenList = re.findall(r'\d+|[\w]+', infoList, re.DOTALL) 207 | tokenList = ' '.join(tokenList) 208 | tokenList = regExp4.sub(' ', tokenList) 209 | tokenList = tokenList.split() 210 | addToIndex(tokenList, docID, "i") 211 | 212 | if docID%fileLim == 0: 213 | f = open(path_to_index + "/" + str(docID) + ".txt", "w") 214 | for key, val in sorted(invertedIndex.items()): 215 | s = str(key)+"=" 216 | for k, v in sorted(val.items()): 217 | s += str(k) + ":" 218 | for k1, v1 in v.items(): 219 | s = s + str(k1) + str(v1) + "#" 220 | s = s[:-1]+"," 221 | f.write(s[:-1]+"\n") 222 | f.close() 223 | invertedIndex.clear() 224 | stemmingMap.clear() 225 | 226 | 227 | class WikiContentHandler(ContentHandler): 228 | def __init__(self): 229 | self.docID = 0 230 | self.isTitle = False 231 | self.flag = False 232 | self.title = "" 233 | self.buffer = "" 234 | 235 | def characters(self, content): 236 | self.buffer = self.buffer + content 237 | 238 | def startElement(self, element, attributes): 239 | if element == "title": 240 | self.buffer = "" 241 | self.isTitle = True 242 | self.flag = True 243 | if element == "page": 244 | self.docID += 1 245 | if element == "text": 246 | self.buffer = "" 247 | if element == "id" and self.flag: 248 | self.buffer = "" 249 | 250 | def endElement(self, element): 251 | if element == "title": 252 | processBuffer(self.buffer, self.docID, self.isTitle) 253 | self.isTitle = False 254 | self.title = self.buffer 255 | self.buffer = "" 256 | elif element == "text": 257 | processBuffer(self.buffer, self.docID, self.isTitle) 258 | self.buffer = "" 259 | elif element == "id" and self.flag == True: 260 | try: 261 | documentTitleMapping.write(str(self.docID)+"#"+self.title+"\n") 262 | except: 263 | documentTitleMapping.write( 264 | str(self.docID)+"#"+self.title.encode('utf-8')+"\n") 265 | self.flag = False 266 | self.buffer = "" 267 | 268 | 269 | def main(): 270 | parse(dumpFile, WikiContentHandler()) 271 | f = open(path_to_index + "/19567269.txt", "w") 272 | for key, val in sorted(invertedIndex.items()): 273 | s = str(key)+"=" 274 | for k, v in sorted(val.items()): 275 | s += str(k) + 
":" 276 | for k1, v1 in v.items(): 277 | s = s + str(k1) + str(v1) + "#" 278 | s = s[:-1]+"," 279 | f.write(s[:-1]+"\n") 280 | f.close() 281 | invertedIndex.clear() 282 | stemmingMap.clear() 283 | 284 | 285 | if __name__ == "__main__": 286 | start = timeit.default_timer() 287 | main() 288 | stop = timeit.default_timer() 289 | print(stop - start) 290 | --------------------------------------------------------------------------------