├── .gitignore
├── .gitattributes
├── Pipfile
├── Chunker.py
├── README.md
└── main.py

/.gitignore:
--------------------------------------------------------------------------------
data
.idea
Pipfile.lock
progress.log
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
# Auto detect text files and perform LF normalization
* text=auto
--------------------------------------------------------------------------------
/Pipfile:
--------------------------------------------------------------------------------
[[source]]
name = "pypi"
url = "https://pypi.org/simple"
verify_ssl = true

[packages]
spacy = "*"
psutil = "*"
--------------------------------------------------------------------------------
/Chunker.py:
--------------------------------------------------------------------------------
import logging
from os import stat
from pathlib import Path

from typing import Generator, Union

logger = logging.getLogger(__name__)


class Chunker:
    """ Chunker that can chunk a file into byte ranges which can then be retrieved as a list of encoded lines. """
    def __init__(self, fin: Union[str, bytes, Path], batch_size: int = 1000, encoding: str = 'utf-8'):
        """
        :param fin: filename to chunk
        :param batch_size: approximate size of each chunk (in kilobytes)
        :param encoding: encoding of the input file. Will be used when retrieving the encoded batch of lines
        """
        self.batch_size = int(batch_size * 1e3)
        self.encoding = encoding
        self.pfin = Path(fin).resolve()

        logger.info(f"Chunking with a batch size of {batch_size:,} kilobytes.")

    def chunkify(self) -> Generator:
        """ Chunks a file into sequential byte ranges of approximately the same size as defined in the constructor.
        The size of each chunk is not exactly the same because if a chunk ends on an incomplete line, the remainder
        of the line will also be read and included in the chunk.

        :returns a generator that yields tuples of two integers: the starting byte of the chunk and its size
        """
        file_end = stat(self.pfin).st_size

        # If the file is smaller than or equal to the buffer size, we can get it all in one batch
        if file_end <= self.batch_size:
            yield 0, file_end
        else:
            with self.pfin.open('rb') as fhin:
                prev_pos = 0
                while prev_pos < file_end:
                    pos = prev_pos + self.batch_size
                    fhin.seek(pos)
                    fhin.readline()
                    pos = fhin.tell()
                    yield prev_pos, pos - prev_pos
                    prev_pos = pos

    def get_batch(self, chunk_start: int, chunk_size: int, rm_newlines: bool = True) -> Generator:
        """ Retrieves a chunk, given a starting byte and chunk size, as a batch of encoded lines through a generator.

        :param chunk_start: the starting byte position of the requested chunk
        :param chunk_size: the size of the requested chunk
        :param rm_newlines: whether to remove the newlines at the end of each line (rstrip)
        :returns a generator that yields each encoded line in the batch
        """
        with open(self.pfin, 'rb') as f:
            f.seek(chunk_start)
            chunk = f.read(chunk_size)

        if rm_newlines:
            return (s.decode(self.encoding).rstrip() for s in chunk.split(b'\n') if s)
        else:
            return (s.decode(self.encoding) for s in chunk.split(b'\n') if s)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# spacy-extreme
An example of how to use spaCy for extremely large files without running into memory issues

## Memory issues with spaCy
**EDIT**: the memory issues with running the spaCy pipeline were fixed in [#4486](https://github.com/explosion/spaCy/pull/4486).
I will keep this repo online as an educational code snippet of how to efficiently chunk your data, though. The rest of
this section can be ignored.

spaCy is a popular, powerful NLP tool that can process a text and get almost any information out of it that you could need.
Unfortunately, I started running into issues when multiprocessing a single file of 30GB+: the memory usage kept growing.
Even with [the simplest base case](https://github.com/explosion/spaCy/issues/3618) the issue persists.
A 'bug fix' is not available, because it is not clear where the memory is leaking. One would expect that the issue lies
in spaCy itself, but that would imply that reloading a spaCy instance should free that memory.
But that is [not the case](https://github.com/explosion/spaCy/issues/3618#issuecomment-485832596).
It is hard, then, to find a fix, because it is unclear where to start looking.

Because of that, I figured that there must be another way.
The solution lies in the `multiprocessing` library, and more specifically in one of the parameters for
[`Pool`](https://docs.python.org/3.7/library/multiprocessing.html#multiprocessing.pool.Pool).
`maxtasksperchild` is a parameter that ensures that a single child process will execute only n tasks. After that, it will
be killed, its memory freed, and replaced by a new process.
That is exactly what we need!
The memory grows because more and more data is read by a process. We want to limit the number of batches that a process
can process so that its memory usage is kept in check.
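
To make the idea concrete, here is a minimal, self-contained sketch (unrelated to spaCy; the `work` function and the numbers are made up for illustration) of how `maxtasksperchild` is passed to `Pool`:

```python
from multiprocessing import Pool


def work(batch):
    # Stand-in for an expensive, memory-hungry job
    return sum(len(line) for line in batch)


if __name__ == '__main__':
    batches = [['some', 'lines', 'of', 'text']] * 10
    # Each child process handles at most two batches before it is killed and
    # replaced by a fresh process, which releases whatever memory it accumulated
    with Pool(4, maxtasksperchild=2) as pool:
        print(pool.map(work, batches))
```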

## Parsing huge files: how to be lenient on memory?
Another issue that you may face is processing an enormous file and distributing it over child processes
without running into memory issues.
We want to process these large files in batches, which will make processing more efficient.
These batches cannot be too small, because then the workers will consume the batches too quickly,
causing only a few workers to be actively processing batches at a time.
In the example code, you will find a
[`Chunker`](https://github.com/BramVanroy/spacy-extreme/blob/master/Chunker.py) class.
This chunker will retrieve *file pointers* from a file. These are integers representing a position in the file; you can
think of them as the cursor position, in bytes.
In every step, the cursor moves forward `batch_size` bytes and the position of the cursor is returned.
When a child process retrieves a cursor position, it will look that position up in the file and read a `batch_size`'d chunk.
This chunk can then be processed.
As may be clear, the actual file contents are *not* retrieved by the reader process in this first step.
We do not want to share these huge chunks of data between processes; the file pointer, on the other hand, is just an integer that is easily and quickly shared.
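
As a rough sketch of what this looks like when driving the `Chunker` from this repo by hand (the file name `corpus.txt` is only a placeholder; the batch size is in kilobytes, as in `Chunker.py`):

```python
from Chunker import Chunker

chunker = Chunker('corpus.txt', batch_size=1000)  # chunks of roughly 1 MB

for chunk_start, chunk_size in chunker.chunkify():
    # Only these two integers would be handed to a worker process;
    # the worker reads and decodes the actual lines itself
    for line in chunker.get_batch(chunk_start, chunk_size):
        pass  # each 'line' is a decoded str, ready for spaCy
```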

## Usage

```bash
usage: main.py [-h] [-b BATCH_SIZE] [--max-length MAX_LENGTH]
               [-m MAX_TASKS_PER_CHILD] [--min-length MIN_LENGTH]
               [-n N_WORKERS] [--spacy-model SPACY_MODEL]
               fin

Parse HUGE text files with spaCy in parallel without running into memory
issues.

positional arguments:
  fin                   input file.

optional arguments:
  -h, --help            show this help message and exit
  -b BATCH_SIZE, --batch-size BATCH_SIZE
                        batch size (in bytes). (default: 1048576)
  --max-length MAX_LENGTH
                        sentences with more than 'max_length' will not be
                        included in the output. (default: None)
  -m MAX_TASKS_PER_CHILD, --max-tasks-per-child MAX_TASKS_PER_CHILD
                        max number of batches that a child process can process
                        before it is killed and replaced. Use this when
                        running into memory issues. (default: 5)
  --min-length MIN_LENGTH
                        sentences with less than 'min_length' will not be
                        included in the output. (default: None)
  -n N_WORKERS, --n-workers N_WORKERS
                        number of workers to use (default depends on your
                        current system).
  --spacy-model SPACY_MODEL
                        spaCy model to use (must be installed). (default:
                        en_core_web_sm)
```

## Best settings
It is hard to tell what the best settings are for a given combination of hardware and data.
On a machine with 384GB of memory and 48 cores, I ran the script with the following settings.
Memory consumption never exceeded 78%.

- `-n 24`: using 24 cores.
- `--spacy-model en_core_web_lg`: the largest English spaCy model
- `-b 50000000`: a batch size of 50 MB (50,000,000 bytes). With my data, one such batch was roughly equivalent to 400k sentences
- `-m 5`: replace a process after having processed 5 batches. In total, each process processes 2M sentences before being replaced

If you do not have a lot of memory available, you will want to set `--max-tasks-per-child` (`-m`) to 1 so that an active process is replaced after each batch.
In that case, ensure that your batch size is not too small (e.g. not less than 100kB) to maximize efficiency.
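
For reference, the settings above correspond to an invocation along these lines (a sketch only: the corpus path is a placeholder, and `pipenv run` assumes you created the environment from the included Pipfile):

```bash
pipenv install
pipenv run python main.py -n 24 --spacy-model en_core_web_lg -b 50000000 -m 5 /data/huge-corpus.txt
```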
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import datetime
import logging
from math import inf
from multiprocessing import Manager, Pool, Process
from os import cpu_count
from pathlib import Path

import psutil
import spacy

from Chunker import Chunker

logging.basicConfig(datefmt='%d-%b %H:%M:%S',
                    format='%(asctime)s - [%(levelname)s]: %(message)s',
                    level=logging.INFO,
                    handlers=[
                        logging.FileHandler('progress.log'),
                        logging.StreamHandler()
                    ])

DEFAULT_WORKERS = max(cpu_count() - 2, 1)  # leave room for the reader and writer processes

""" Processes a single, huge text file with spaCy, without running into memory issues
IF the right parameters are chosen.

Important parameters:
  - -b, --batch-size: the batch size (in kilobytes) to process at the same time. A larger batch will mean that
    every task (in the max-tasks-per-child) will use more memory. You need to find a good balance between
    batch-size and max-tasks-per-child.
  - -m, --max-tasks-per-child: the number of batches to process before a child process is killed and replaced;
    this will effectively free the memory used by that child process. If you are very low on memory,
    set this to 1, meaning that each process will only process one batch before being replaced.
  - -n, --n-workers: the number of child processes to spawn that will process the batches. It is important
    to know that the reader and writer work in their own subprocesses, so don't use all cores
    for n-workers. Also, the more cores you put to work simultaneously, the more memory you will be using.
    On top of that, if your batch-size is too small, the reader will not be fast enough to feed all the workers.
    So, again, you need to find a good trade-off focused on the batch-size.
  - --spacy-model: it makes sense that if you use a large spaCy model, you will consume more memory.

Reading input happens in chunks. The byte file pointers of each chunk are passed to the child processes,
leaving them in charge of actually getting the contents from the file.
This means, though, that the order of the sentences in the input file is NOT preserved.

You can use this file as a template, and only change the process_batch method to your liking.
That's where the actual values from spaCy are retrieved and processed.
"""


class Representator:
    def __init__(self, max_length=None, min_length=None, spacy_model='en_core_web_sm'):
        self.max_length = max_length if max_length else inf
        self.min_length = min_length if min_length else 0

        self.nlp = spacy.load(spacy_model, disable=['ner', 'textcat'])
        self.nlp.add_pipe(self._prevent_sbd, name='prevent-sbd', before='parser')

        self.results_q = None
        self.work_q = None

        self.chunker = None

    def process(self, pfin, n_workers, max_tasks_per_child):
        logging.info(f"Started processing {pfin.name} with {n_workers} workers.")
        if max_tasks_per_child:
            logging.info(f"Max. {max_tasks_per_child} tasks per child process before replacement.")

        start_time = datetime.datetime.now()

        total_n_sentences = 0
        total_n_tokens = 0
        with Manager() as manager:
            self.results_q = manager.Queue(maxsize=max(n_workers * 100, 256))
            self.work_q = manager.Queue(maxsize=n_workers * 2)

            # Create a reader and a writer process
            reader_proc = Process(target=self.reader)
            # The reader starts filling up the work_q
            reader_proc.start()
            writer_proc = Process(target=self.writer, args=(pfin,))
            writer_proc.start()

            with Pool(n_workers, maxtasksperchild=max_tasks_per_child) as pool:
                worker_jobs = []
                logging.info('Chunking...')
                while True:
                    # Get work from the working queue
                    work = self.work_q.get()
                    if work == 'done':
                        break

                    chunk_start, chunk_size = work
                    # Apply work to workers
                    job = pool.apply_async(self.process_batch, (chunk_start, chunk_size))
                    worker_jobs.append(job)
                logging.info('Done chunking...')

                # After the queue is 'done', the reader can close
                reader_proc.join()
                reader_proc.terminate()

                # When a worker has finished its job, get its information back
                for job_idx, job in enumerate(worker_jobs, 1):
                    n_sentences, n_tokens = job.get()

                    total_n_sentences += n_sentences
                    total_n_tokens += n_tokens

                    # Log some progress info
                    if job_idx == 1 or job_idx % n_workers == 0:
                        time_since_start = (datetime.datetime.now() - start_time)
                        sents_perf = total_n_sentences // time_since_start.total_seconds()
                        time_since_start = self._format_time(time_since_start)
                        logging.info(f"Processed batch #{job_idx:,}: {n_sentences:,} sents ({sents_perf:,.0f} sents/s)."
                                     f" Mem. use: {psutil.virtual_memory().percent}%. Running for {time_since_start}")

                # Notify the writer that we're done
                self.results_q.put('done')

            writer_proc.join()
            writer_proc.terminate()

        # Log some info
        running_time = (datetime.datetime.now() - start_time)
        sents_perf = total_n_sentences // running_time.total_seconds()
        running_time = self._format_time(running_time)
        logging.info(f"Done processing in {running_time} ({sents_perf:,.0f} sentences/s)."
                     f" Processed {total_n_sentences:,.0f} sentences and {total_n_tokens:,.0f} tokens.")

    def process_batch(self, chunk_start, chunk_size):
        batch = self.chunker.get_batch(chunk_start, chunk_size)

        # Parse text with spaCy
        docs = self.nlp.pipe(batch)
        # Chop into sentences
        spacy_sents = [sent for doc in docs for sent in doc.sents]
        del docs
        # Filter too long or too short sentences
        spacy_sents = [sent for sent in spacy_sents if self.min_length <= len(sent) <= self.max_length]
        n_sentences = len(spacy_sents)
        n_tokens = 0

        # Get some value from spaCy that we want to write to files
        # Here we just get the tokens, but you can change it to whatever you want
        sents_tok = []
        for sent in spacy_sents:
            n_tokens += len(sent)
            sents_tok.append(' '.join([token.text for token in sent]))

        # Pass results to queue, so they can be written to file by the writer
        self.results_q.put(sents_tok)

        # Return the number of sentences and number of tokens, just to keep track of
        # progress in the main process. Because the Chunker extends every chunk to the
        # end of a line, no line is cut in half at a chunk boundary.
        return n_sentences, n_tokens
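
    # A hypothetical variation (not used by the script): since process_batch is the only
    # method you need to change, you could, for instance, write out lemmas or
    # part-of-speech tags instead of the plain tokens by replacing the line that
    # builds 'sents_tok' above with e.g.
    #   sents_tok.append(' '.join(token.lemma_ for token in sent))
    # or
    #   sents_tok.append(' '.join(token.pos_ for token in sent))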

    @staticmethod
    def _prevent_sbd(doc):
        # If you already have one sentence per line in your file,
        # you may wish to disable sentence segmentation with this function,
        # which is added to the nlp pipe in the constructor
        for token in doc:
            token.is_sent_start = False
        return doc

    @staticmethod
    def _format_time(delta):
        hours, remainder = divmod(delta.total_seconds(), 3600)
        minutes, seconds = divmod(remainder, 60)

        return f"{hours:02,.0f}:{minutes:02.0f}:{seconds:02.0f}"

    # I/O methods
    def writer(self, pfin):
        with open(pfin.with_suffix('.out'), 'w', encoding='utf-8') as fhout:
            while True:
                m = self.results_q.get()
                if m == 'done':
                    break

                fhout.write('\n'.join(m) + '\n')
                fhout.flush()

    def reader(self):
        for chunk_tuple in self.chunker.chunkify():
            self.work_q.put(chunk_tuple)

        self.work_q.put('done')


if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Parse HUGE text files with spaCy in parallel without running'
                                                 ' into memory issues.',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('fin', help='input file.')
    parser.add_argument('-b', '--batch-size', type=int, default=1000,
                        help='batch size (in kilobytes).')
    parser.add_argument('--max-length', type=int, default=None,
                        help="sentences with more than 'max_length' will not be included in the output.")
    parser.add_argument('-m', '--max-tasks-per-child', type=int, default=5,
                        help="max number of batches that a child process can process before it is killed and replaced."
                             " Use this when running into memory issues.")
    parser.add_argument('--min-length', type=int, default=None,
                        help="sentences with less than 'min_length' will not be included in the output.")
    parser.add_argument('-n', '--n-workers', type=int, default=DEFAULT_WORKERS,
                        help="number of workers to use (default depends on your current system).")
    parser.add_argument('--spacy-model', default='en_core_web_sm',
                        help='spaCy model to use (must be installed).')
    args = parser.parse_args()

    args = vars(args)
    file_in = Path(args.pop('fin')).resolve()
    workers = args.pop('n_workers')
    b_size = args.pop('batch_size')
    max_tasks = args.pop('max_tasks_per_child')

    representer = Representator(**args)
    representer.chunker = Chunker(file_in, b_size)
    representer.process(file_in, workers, max_tasks)
--------------------------------------------------------------------------------