├── .gitignore
├── .gitattributes
├── Pipfile
├── Chunker.py
├── README.md
└── main.py

/.gitignore:
--------------------------------------------------------------------------------
data
.idea
Pipfile.lock
progress.log
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
# Auto detect text files and perform LF normalization
* text=auto
--------------------------------------------------------------------------------
/Pipfile:
--------------------------------------------------------------------------------
[[source]]
name = "pypi"
url = "https://pypi.org/simple"
verify_ssl = true

[packages]
spacy = "*"
psutil = "*"
--------------------------------------------------------------------------------
/Chunker.py:
--------------------------------------------------------------------------------
import logging
from os import stat
from pathlib import Path

from typing import Generator, Union

logger = logging.getLogger(__name__)


class Chunker:
    """ Chunker that can chunk a file into byte ranges which can then be retrieved as a list of encoded lines. """
    def __init__(self, fin: Union[str, bytes, Path], batch_size: int = 1000, encoding: str = 'utf-8'):
        """
        :param fin: filename to chunk
        :param batch_size: approximate size of each chunk (in kilobytes)
        :param encoding: encoding of the input file. Will be used when retrieving the encoded batch of lines
        """
        self.batch_size = int(batch_size * 1e3)
        self.encoding = encoding
        self.pfin = Path(fin).resolve()

        logger.info(f"Chunking with a batch size of {batch_size:,} kilobytes.")

    def chunkify(self) -> Generator:
        """ Chunks a file into sequential byte ranges of approximately the same size as defined in the constructor.
        The size of each chunk is not exactly the same because if a chunk ends on an incomplete line, the remainder
        of the line will also be read and included in the chunk.

        :returns a generator that yields tuples of two integers: the starting byte of the chunk and its size
        """
        file_end = stat(self.pfin).st_size

        # If the file is smaller than or equal to the buffer size, we can get it all in one batch
        if file_end <= self.batch_size:
            yield 0, file_end
        else:
            with self.pfin.open('rb') as fhin:
                prev_pos = 0
                while prev_pos < file_end:
                    pos = prev_pos + self.batch_size
                    fhin.seek(pos)
                    fhin.readline()
                    pos = fhin.tell()
                    yield prev_pos, pos - prev_pos
                    prev_pos = pos

    def get_batch(self, chunk_start: int, chunk_size: int, rm_newlines: bool = True) -> Generator:
        """ Retrieves a chunk, given a starting byte and chunk size, as a batch of encoded lines through a generator.

        :param chunk_start: the starting byte position of the requested chunk
        :param chunk_size: the size of the requested chunk
        :param rm_newlines: whether to remove the newlines at the end of each line (rstrip)
        :returns a generator that yields each encoded line in the batch
        """
        with open(self.pfin, 'rb') as f:
            f.seek(chunk_start)
            chunk = f.read(chunk_size)

        if rm_newlines:
            return (s.decode(self.encoding).rstrip() for s in chunk.split(b'\n') if s)
        else:
            return (s.decode(self.encoding) for s in chunk.split(b'\n') if s)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# spacy-extreme
An example of how to use spaCy for extremely large files without running into memory issues

## Memory issues with spaCy
**EDIT**: the memory issues with running the spaCy pipeline were fixed in [#4486](https://github.com/explosion/spaCy/pull/4486).
I will keep this repo online as an educational code snippet of how to efficiently chunk your data, though. The rest of
this section can be ignored.

spaCy is a popular, powerful NLP tool that can process a text and get almost any information out of it that you could need.
Unfortunately, I started running into issues when multiprocessing a single file of 30GB+: the memory usage kept growing.
Even with [the simplest base case](https://github.com/explosion/spaCy/issues/3618) the issue persists.
A 'bug fix' is not available, because it is not clear where the memory is leaking. One would expect that the issue lies
in spaCy itself, but that would imply that reloading a spaCy instance should free that memory.
But that is [not the case](https://github.com/explosion/spaCy/issues/3618#issuecomment-485832596).
It is hard, then, to find a fix, because it is unclear where to start looking.

Because of that, I figured that there must be another way.
The solution lies in the `multiprocessing` library, and more specifically in one of the parameters for
[`Pool`](https://docs.python.org/3.7/library/multiprocessing.html#multiprocessing.pool.Pool).
`maxtasksperchild` is a parameter that ensures that a single child process will execute only n tasks. After that, it will
be killed, its memory freed, and replaced by a new process.
That is exactly what we need!
The memory grows because more and more data is read by a process. We want to limit the number of batches that a process
can process so that its memory usage is kept in check.
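
To make the idea concrete, here is a minimal, self-contained sketch (unrelated to spaCy; the `work` function and the numbers are made up for illustration) of how `maxtasksperchild` is passed to `Pool`:

```python
from multiprocessing import Pool


def work(batch):
    # Stand-in for an expensive, memory-hungry job
    return sum(len(line) for line in batch)


if __name__ == '__main__':
    batches = [['some', 'lines', 'of', 'text']] * 10
    # Each child process handles at most two batches before it is killed and
    # replaced by a fresh process, which releases whatever memory it accumulated
    with Pool(4, maxtasksperchild=2) as pool:
        print(pool.map(work, batches))
```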

## Parsing huge files: how to be lenient on memory?
Another issue that you may face is processing an enormous file and distributing it over child processes
without running into memory issues.
We want to process these large files in batches, which will make processing more efficient.
These batches cannot be too small, because then the workers will consume the batches too quickly,
causing only a few workers to be actively processing batches at a time.
In the example code, you will find a
[`Chunker`](https://github.com/BramVanroy/spacy-extreme/blob/master/Chunker.py) class.
This chunker will retrieve *file pointers* from a file. These are integers representing a position in the file; you can
think of them as the cursor position, in bytes.
In every step, the cursor moves forward `batch_size` bytes and the position of the cursor is returned.
When a child process retrieves a cursor position, it will look that position up in the file and read a `batch_size`'d chunk.
This chunk can then be processed.
As may be clear, the actual file contents are *not* retrieved by the reader process in this first step.
We do not want to share these huge chunks of data between processes; the file pointer, on the other hand, is just an integer that is easily and quickly shared.
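
As a rough sketch of what this looks like when driving the `Chunker` from this repo by hand (the file name `corpus.txt` is only a placeholder; the batch size is in kilobytes, as in `Chunker.py`):

```python
from Chunker import Chunker

chunker = Chunker('corpus.txt', batch_size=1000)  # chunks of roughly 1 MB

for chunk_start, chunk_size in chunker.chunkify():
    # Only these two integers would be handed to a worker process;
    # the worker reads and decodes the actual lines itself
    for line in chunker.get_batch(chunk_start, chunk_size):
        pass  # each 'line' is a decoded str, ready for spaCy
```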

## Usage

```bash
usage: main.py [-h] [-b BATCH_SIZE] [--max-length MAX_LENGTH]
               [-m MAX_TASKS_PER_CHILD] [--min-length MIN_LENGTH]
               [-n N_WORKERS] [--spacy-model SPACY_MODEL]
               fin

Parse HUGE text files with spaCy in parallel without running into memory
issues.

positional arguments:
  fin                   input file.

optional arguments:
  -h, --help            show this help message and exit
  -b BATCH_SIZE, --batch-size BATCH_SIZE
                        batch size (in bytes). (default: 1048576)
  --max-length MAX_LENGTH
                        sentences with more than 'max_length' will not be
                        included in the output. (default: None)
  -m MAX_TASKS_PER_CHILD, --max-tasks-per-child MAX_TASKS_PER_CHILD
                        max number of batches that a child process can process
                        before it is killed and replaced. Use this when
                        running into memory issues. (default: 5)
  --min-length MIN_LENGTH
                        sentences with less than 'min_length' will not be
                        included in the output. (default: None)
  -n N_WORKERS, --n-workers N_WORKERS
                        number of workers to use (default depends on your
                        current system).
  --spacy-model SPACY_MODEL
                        spaCy model to use (must be installed). (default:
                        en_core_web_sm)
```

## Best settings
It is hard to tell what the best settings are for a given combination of hardware and data.
On a machine with 384GB of memory and 48 cores, I ran the script with the following settings.
Memory consumption never exceeded 78%.

- `-n 24`: using 24 cores.
- `--spacy-model en_core_web_lg`: the largest English spaCy model
- `-b 50000000`: a batch size of 50 MB (50,000,000 bytes). With my data, one such batch was roughly equivalent to 400k sentences
- `-m 5`: replace a process after having processed 5 batches. In total, each process processes 2M sentences before being replaced

If you do not have a lot of memory available, you will want to set `--max-tasks-per-child` (`-m`) to 1 so that an active process is replaced after each batch.
In that case, ensure that your batch size is not too small (e.g. not less than 100kB) to maximize efficiency.
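
For reference, the settings above correspond to an invocation along these lines (a sketch only: the corpus path is a placeholder, and `pipenv run` assumes you created the environment from the included Pipfile):

```bash
pipenv install
pipenv run python main.py -n 24 --spacy-model en_core_web_lg -b 50000000 -m 5 /data/huge-corpus.txt
```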
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import datetime
import logging
from math import inf
from multiprocessing import Manager, Pool, Process
from os import cpu_count
from pathlib import Path

import psutil
import spacy

from Chunker import Chunker

logging.basicConfig(datefmt='%d-%b %H:%M:%S',
                    format='%(asctime)s - [%(levelname)s]: %(message)s',
                    level=logging.INFO,
                    handlers=[
                        logging.FileHandler('progress.log'),
                        logging.StreamHandler()
                    ])

DEFAULT_WORKERS = max(cpu_count() - 2, 1)  # leave room for the reader and writer processes

""" Processes a single, huge text file with spaCy, without running into memory issues
IF the right parameters are chosen.

Important parameters:
  - -b, --batch-size: the batch size (in kilobytes) to process at the same time. A larger batch will mean that
    every task (in the max-tasks-per-child) will use more memory. You need to find a good balance between
    batch-size and max-tasks-per-child.
  - -m, --max-tasks-per-child: the number of batches to process before a child process is killed and replaced;
    this will effectively free the memory used by that child process. If you are very low on memory,
    set this to 1, meaning that each process will only process one batch before being replaced.
  - -n, --n-workers: the number of child processes to spawn that will process the batches. It is important
    to know that the reader and writer work in their own subprocesses, so don't use all cores
    for n-workers. Also, the more cores you put to work simultaneously, the more memory you will be using.
    On top of that, if your batch-size is too small, the reader will not be fast enough to feed all the workers.
    So, again, you need to find a good trade-off focused on the batch-size.
  - --spacy-model: it makes sense that if you use a large spaCy model, you will consume more memory.

Reading input happens in chunks. The byte file pointers of each chunk are passed to the child processes,
leaving them in charge of actually getting the contents from the file.
This means, though, that the order of the sentences in the input file is NOT preserved.

You can use this file as a template, and only change the process_batch method to your liking.
That's where the actual values from spaCy are retrieved and processed.
"""


class Representator:
    def __init__(self, max_length=None, min_length=None, spacy_model='en_core_web_sm'):
        self.max_length = max_length if max_length else inf
        self.min_length = min_length if min_length else 0

        self.nlp = spacy.load(spacy_model, disable=['ner', 'textcat'])
        self.nlp.add_pipe(self._prevent_sbd, name='prevent-sbd', before='parser')

        self.results_q = None
        self.work_q = None

        self.chunker = None

    def process(self, pfin, n_workers, max_tasks_per_child):
        logging.info(f"Started processing {pfin.name} with {n_workers} workers.")
        if max_tasks_per_child:
            logging.info(f"Max. {max_tasks_per_child} tasks per child process before replacement.")

        start_time = datetime.datetime.now()

        total_n_sentences = 0
        total_n_tokens = 0
        with Manager() as manager:
            self.results_q = manager.Queue(maxsize=max(n_workers * 100, 256))
            self.work_q = manager.Queue(maxsize=n_workers * 2)

            # Create a reader and a writer process
            reader_proc = Process(target=self.reader)
            # The reader starts filling up the work_q
            reader_proc.start()
            writer_proc = Process(target=self.writer, args=(pfin,))
            writer_proc.start()

            with Pool(n_workers, maxtasksperchild=max_tasks_per_child) as pool:
                worker_jobs = []
                logging.info('Chunking...')
                while True:
                    # Get work from the working queue
                    work = self.work_q.get()
                    if work == 'done':
                        break

                    chunk_start, chunk_size = work
                    # Apply work to workers
                    job = pool.apply_async(self.process_batch, (chunk_start, chunk_size))
                    worker_jobs.append(job)
                logging.info('Done chunking...')

                # After the queue is 'done', the reader can close
                reader_proc.join()
                reader_proc.terminate()

                # When a worker has finished its job, get its information back
                for job_idx, job in enumerate(worker_jobs, 1):
                    n_sentences, n_tokens = job.get()

                    total_n_sentences += n_sentences
                    total_n_tokens += n_tokens

                    # Log some progress info
                    if job_idx == 1 or job_idx % n_workers == 0:
                        time_since_start = (datetime.datetime.now() - start_time)
                        sents_perf = total_n_sentences // time_since_start.total_seconds()
                        time_since_start = self._format_time(time_since_start)
                        logging.info(f"Processed batch #{job_idx:,}: {n_sentences:,} sents ({sents_perf:,.0f} sents/s)."
                                     f" Mem. use: {psutil.virtual_memory().percent}%. Running for {time_since_start}")

                # Notify the writer that we're done
                self.results_q.put('done')

            writer_proc.join()
            writer_proc.terminate()

        # Log some info
        running_time = (datetime.datetime.now() - start_time)
        sents_perf = total_n_sentences // running_time.total_seconds()
        running_time = self._format_time(running_time)
        logging.info(f"Done processing in {running_time} ({sents_perf:,.0f} sentences/s)."
                     f" Processed {total_n_sentences:,.0f} sentences and {total_n_tokens:,.0f} tokens.")

    def process_batch(self, chunk_start, chunk_size):
        batch = self.chunker.get_batch(chunk_start, chunk_size)

        # Parse text with spaCy
        docs = self.nlp.pipe(batch)
        # Chop into sentences
        spacy_sents = [sent for doc in docs for sent in doc.sents]
        del docs
        # Filter too long or too short sentences
        spacy_sents = [sent for sent in spacy_sents if self.min_length <= len(sent) <= self.max_length]
        n_sentences = len(spacy_sents)
        n_tokens = 0

        # Get some value from spaCy that we want to write to files
        # Here we just get the tokens, but you can change it to whatever you want
        sents_tok = []
        for sent in spacy_sents:
            n_tokens += len(sent)
            sents_tok.append(' '.join([token.text for token in sent]))

        # Pass results to queue, so they can be written to file by the writer
        self.results_q.put(sents_tok)

        # Return the number of sentences and number of tokens, just to keep track of
        # progress in the main process. Because the Chunker extends every chunk to the
        # end of a line, no line is cut in half at a chunk boundary.
        return n_sentences, n_tokens
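
    # A hypothetical variation (not used by the script): since process_batch is the only
    # method you need to change, you could, for instance, write out lemmas or
    # part-of-speech tags instead of the plain tokens by replacing the line that
    # builds 'sents_tok' above with e.g.
    #   sents_tok.append(' '.join(token.lemma_ for token in sent))
    # or
    #   sents_tok.append(' '.join(token.pos_ for token in sent))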

    @staticmethod
    def _prevent_sbd(doc):
        # If you already have one sentence per line in your file,
        # you may wish to disable sentence segmentation with this function,
        # which is added to the nlp pipe in the constructor
        for token in doc:
            token.is_sent_start = False
        return doc

    @staticmethod
    def _format_time(delta):
        hours, remainder = divmod(delta.total_seconds(), 3600)
        minutes, seconds = divmod(remainder, 60)

        return f"{hours:02,.0f}:{minutes:02.0f}:{seconds:02.0f}"

    # I/O methods
    def writer(self, pfin):
        with open(pfin.with_suffix('.out'), 'w', encoding='utf-8') as fhout:
            while True:
                m = self.results_q.get()
                if m == 'done':
                    break

                fhout.write('\n'.join(m) + '\n')
                fhout.flush()

    def reader(self):
        for chunk_tuple in self.chunker.chunkify():
            self.work_q.put(chunk_tuple)

        self.work_q.put('done')


if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Parse HUGE text files with spaCy in parallel without running'
                                                 ' into memory issues.',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('fin', help='input file.')
    parser.add_argument('-b', '--batch-size', type=int, default=1000,
                        help='batch size (in kilobytes).')
    parser.add_argument('--max-length', type=int, default=None,
                        help="sentences with more than 'max_length' will not be included in the output.")
    parser.add_argument('-m', '--max-tasks-per-child', type=int, default=5,
                        help="max number of batches that a child process can process before it is killed and replaced."
                             " Use this when running into memory issues.")
    parser.add_argument('--min-length', type=int, default=None,
                        help="sentences with less than 'min_length' will not be included in the output.")
    parser.add_argument('-n', '--n-workers', type=int, default=DEFAULT_WORKERS,
                        help="number of workers to use (default depends on your current system).")
    parser.add_argument('--spacy-model', default='en_core_web_sm',
                        help='spaCy model to use (must be installed).')
    args = parser.parse_args()

    args = vars(args)
    file_in = Path(args.pop('fin')).resolve()
    workers = args.pop('n_workers')
    b_size = args.pop('batch_size')
    max_tasks = args.pop('max_tasks_per_child')

    representer = Representator(**args)
    representer.chunker = Chunker(file_in, b_size)
    representer.process(file_in, workers, max_tasks)
--------------------------------------------------------------------------------