├── .gitignore ├── LICENSE ├── README.md └── src └── HugeFileProcessor ├── App.config ├── HugeFileProcessor.csproj ├── HugeFileProcessor.sln ├── Program.cs └── Properties └── AssemblyInfo.cs /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 3 | 4 | # User-specific files 5 | *.suo 6 | *.user 7 | *.userosscache 8 | *.sln.docstates 9 | 10 | # User-specific files (MonoDevelop/Xamarin Studio) 11 | *.userprefs 12 | 13 | # Build results 14 | [Dd]ebug/ 15 | [Dd]ebugPublic/ 16 | [Rr]elease/ 17 | [Rr]eleases/ 18 | [Xx]64/ 19 | [Xx]86/ 20 | [Bb]uild/ 21 | bld/ 22 | [Bb]in/ 23 | [Oo]bj/ 24 | 25 | # Visual Studio 2015 cache/options directory 26 | .vs/ 27 | # Uncomment if you have tasks that create the project's static files in wwwroot 28 | #wwwroot/ 29 | 30 | # MSTest test Results 31 | [Tt]est[Rr]esult*/ 32 | [Bb]uild[Ll]og.* 33 | 34 | # NUNIT 35 | *.VisualState.xml 36 | TestResult.xml 37 | 38 | # Build Results of an ATL Project 39 | [Dd]ebugPS/ 40 | [Rr]eleasePS/ 41 | dlldata.c 42 | 43 | # DNX 44 | project.lock.json 45 | artifacts/ 46 | 47 | *_i.c 48 | *_p.c 49 | *_i.h 50 | *.ilk 51 | *.meta 52 | *.obj 53 | *.pch 54 | *.pdb 55 | *.pgc 56 | *.pgd 57 | *.rsp 58 | *.sbr 59 | *.tlb 60 | *.tli 61 | *.tlh 62 | *.tmp 63 | *.tmp_proj 64 | *.log 65 | *.vspscc 66 | *.vssscc 67 | .builds 68 | *.pidb 69 | *.svclog 70 | *.scc 71 | 72 | # Chutzpah Test files 73 | _Chutzpah* 74 | 75 | # Visual C++ cache files 76 | ipch/ 77 | *.aps 78 | *.ncb 79 | *.opendb 80 | *.opensdf 81 | *.sdf 82 | *.cachefile 83 | *.VC.db 84 | 85 | # Visual Studio profiler 86 | *.psess 87 | *.vsp 88 | *.vspx 89 | *.sap 90 | 91 | # TFS 2012 Local Workspace 92 | $tf/ 93 | 94 | # Guidance Automation Toolkit 95 | *.gpState 96 | 97 | # ReSharper is a .NET coding add-in 98 | _ReSharper*/ 99 | *.[Rr]e[Ss]harper 100 | *.DotSettings.user 101 | 102 | # JustCode is a 
.NET coding add-in 103 | .JustCode 104 | 105 | # TeamCity is a build add-in 106 | _TeamCity* 107 | 108 | # DotCover is a Code Coverage Tool 109 | *.dotCover 110 | 111 | # NCrunch 112 | _NCrunch_* 113 | .*crunch*.local.xml 114 | nCrunchTemp_* 115 | 116 | # MightyMoose 117 | *.mm.* 118 | AutoTest.Net/ 119 | 120 | # Web workbench (sass) 121 | .sass-cache/ 122 | 123 | # Installshield output folder 124 | [Ee]xpress/ 125 | 126 | # DocProject is a documentation generator add-in 127 | DocProject/buildhelp/ 128 | DocProject/Help/*.HxT 129 | DocProject/Help/*.HxC 130 | DocProject/Help/*.hhc 131 | DocProject/Help/*.hhk 132 | DocProject/Help/*.hhp 133 | DocProject/Help/Html2 134 | DocProject/Help/html 135 | 136 | # Click-Once directory 137 | publish/ 138 | 139 | # Publish Web Output 140 | *.[Pp]ublish.xml 141 | *.azurePubxml 142 | 143 | # TODO: Un-comment the next line if you do not want to checkin 144 | # your web deploy settings because they may include unencrypted 145 | # passwords 146 | #*.pubxml 147 | *.publishproj 148 | 149 | # NuGet Packages 150 | *.nupkg 151 | # The packages folder can be ignored because of Package Restore 152 | **/packages/* 153 | # except build/, which is used as an MSBuild target. 
154 | !**/packages/build/ 155 | # Uncomment if necessary however generally it will be regenerated when needed 156 | #!**/packages/repositories.config 157 | # NuGet v3's project.json files produces more ignoreable files 158 | *.nuget.props 159 | *.nuget.targets 160 | 161 | # Microsoft Azure Build Output 162 | csx/ 163 | *.build.csdef 164 | 165 | # Microsoft Azure Emulator 166 | ecf/ 167 | rcf/ 168 | 169 | # Microsoft Azure ApplicationInsights config file 170 | ApplicationInsights.config 171 | 172 | # Windows Store app package directory 173 | AppPackages/ 174 | BundleArtifacts/ 175 | 176 | # Visual Studio cache files 177 | # files ending in .cache can be ignored 178 | *.[Cc]ache 179 | # but keep track of directories ending in .cache 180 | !*.[Cc]ache/ 181 | 182 | # Others 183 | ClientBin/ 184 | [Ss]tyle[Cc]op.* 185 | ~$* 186 | *~ 187 | *.dbmdl 188 | *.dbproj.schemaview 189 | *.pfx 190 | *.publishsettings 191 | node_modules/ 192 | orleans.codegen.cs 193 | 194 | # RIA/Silverlight projects 195 | Generated_Code/ 196 | 197 | # Backup & report files from converting an old project file 198 | # to a newer Visual Studio version. 
Backup files are not needed, 199 | # because we have git ;-) 200 | _UpgradeReport_Files/ 201 | Backup*/ 202 | UpgradeLog*.XML 203 | UpgradeLog*.htm 204 | 205 | # SQL Server files 206 | *.mdf 207 | *.ldf 208 | 209 | # Business Intelligence projects 210 | *.rdl.data 211 | *.bim.layout 212 | *.bim_*.settings 213 | 214 | # Microsoft Fakes 215 | FakesAssemblies/ 216 | 217 | # GhostDoc plugin setting file 218 | *.GhostDoc.xml 219 | 220 | # Node.js Tools for Visual Studio 221 | .ntvs_analysis.dat 222 | 223 | # Visual Studio 6 build log 224 | *.plg 225 | 226 | # Visual Studio 6 workspace options file 227 | *.opt 228 | 229 | # Visual Studio LightSwitch build output 230 | **/*.HTMLClient/GeneratedArtifacts 231 | **/*.DesktopClient/GeneratedArtifacts 232 | **/*.DesktopClient/ModelManifest.xml 233 | **/*.Server/GeneratedArtifacts 234 | **/*.Server/ModelManifest.xml 235 | _Pvt_Extensions 236 | 237 | # LightSwitch generated files 238 | GeneratedArtifacts/ 239 | ModelManifest.xml 240 | 241 | # Paket dependency manager 242 | .paket/paket.exe 243 | 244 | # FAKE - F# Make 245 | .fake/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 Mikhail Barg 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # HugeFileProcessor 2 | A utility to randomize and split really huge (100+ GB) text files 3 | 4 | # Motivation 5 | While doing some Machine Learning stuff, I've stumbled upon a need to process (mostly shuffle and split) **really huge** text files. Where "huge" might mean hundreds of gigabytes. For example the easiest way to feed data to [CNTK](https://github.com/Microsoft/CNTK) is using a text file. I was amazed that I was not able to find any tool capable of shuffling a huge file without loading it whole into RAM. So I wrote my own. Here it is. 6 | 7 | # Usage 8 | There are just a few commands that HugeFileProcessor understands: 9 | * **`-shuffle <sourceFile> <batchSize> [<outFile>]`** 10 | 11 | Shuffles lines from *sourceFile* into *outFile* (or into *sourceFile.shuffled* if no *outFile* specified). 12 | 13 | This mode requires specifying *batchSize* - number of lines to keep in RAM when writing to output. The larger the better (unless you are out of RAM), because total shuffling time would be _(number of lines in sourceFile) / batchSize * (time to fully read sourceFile)_. Please note that the program **shuffles whole file**, not on per-batch basis. See the details on shuffling below. 14 | 15 | * **`-split <sourceFile> <test>/<base> [<linesLimit>]`** 16 | 17 | Splits _sourceFile_ into _sourceFile.test_ and _sourceFile.train_. 
_.test_ would get _test/base_ lines of _sourceFile_, and _.train_ would get _(base-test)/base_ lines. This is done in a single pass through the _sourceFile_, so it's faster than using _head_/_tail_. 18 | 19 | If _linesLimit_ is specified, then only the first _linesLimit_ lines of _sourceFile_ are processed. 20 | 21 | If _test_ is set to 0, then no .test file is created and all lines get into the .train file. When combined with _linesLimit_ this is equal to calling `head -n <linesLimit> <sourceFile>`. 22 | 23 | * **`-count <sourceFile>`** 24 | 25 | Just count number of lines in _sourceFile_. 26 | 27 | # Running the program 28 | 29 | The program is written in C#. It requires .Net 4.5.1 or greater. 30 | 31 | To run on linux use Mono: `mono HugeFileProcessor.exe <command>` 32 | 33 | # Shuffling 34 | 35 | Here are the details on shuffling implementation. The algorithm is as follows. 36 | 37 | 1. Count lines in _sourceFile_. This is done simply by reading the whole file line-by-line. (See some comparisons [here](http://cc.davelozinski.com/c-sharp/fastest-way-to-read-text-files).) This also gives a measurement of how much time it would take to read the whole file once. So we could estimate how much time it would take to make a complete shuffle because it would require _Ceil(linesCount / batchSize)_ complete file reads. 38 | 39 | 2. As we now know the total _linesCount_, we can create an index array of _linesCount_ size and shuffle it using [Fisher–Yates](https://en.wikipedia.org/wiki/Fisher–Yates_shuffle) (called _orderArray_ in the code). This would give us an order in which we want to have lines in a shuffled file. Note that this is a global order over the whole file, not per batch or chunk or something. 40 | 41 | 3. Now the actual code. We need to get all lines from _sourceFile_ in the order we just computed, but we can't read the whole file into memory. So we just split the task. 42 | * We would go through the _sourceFile_ reading all lines and storing in memory only those lines that would be in first _batchSize_ of the _orderArray_. 
When we get all these lines, we could write them into _outFile_ in the required order, and that's _batchSize_/_linesCount_ of the work done. 43 | * Next we would repeat the whole process again and again taking next parts of _orderArray_ and reading _sourceFile_ from start to end for each part. Eventually the whole _orderArray_ is processed and we are done. 44 | 45 | #### Why does it work? 46 | 47 | Because all we do is just reading the source file from start to end. No seeks forward/backward, and that's what HDDs like. File gets read in chunks according to internal HDD buffers, FS blocks, CPU cache, etc. and everything is being read sequentially. 48 | 49 | #### Some numbers 50 | On my machine (Core i5, 16GB RAM, Win8.1, HDD Toshiba DT01ACA200 2TB, NTFS) I was able to shuffle a file of 132 GB (84 000 000 lines) in around 5 hours using _batchSize_ of 3 500 000. With _batchSize_ of 2 000 000 it took around 8 hours. Reading speed was around 118000 lines per second. 51 | 52 | 53 | -------------------------------------------------------------------------------- /src/HugeFileProcessor/App.config: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /src/HugeFileProcessor/HugeFileProcessor.csproj: -------------------------------------------------------------------------------- 1 | ﻿ 2 | 3 | 4 | 5 | Debug 6 | AnyCPU 7 | {524EE32A-1411-4B60-9E88-4AEE3B8F2D33} 8 | Exe 9 | Properties 10 | HugeFileProcessor 11 | HugeFileProcessor 12 | v4.5.1 13 | 512 14 | 15 | 16 | 17 | AnyCPU 18 | true 19 | full 20 | false 21 | bin\Debug\ 22 | DEBUG;TRACE 23 | prompt 24 | 4 25 | false 26 | true 27 | false 28 | 29 | 30 | AnyCPU 31 | pdbonly 32 | true 33 | bin\Release\ 34 | TRACE 35 | prompt 36 | 4 37 | false 38 | true 39 | false 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 66 | 
-------------------------------------------------------------------------------- /src/HugeFileProcessor/HugeFileProcessor.sln: -------------------------------------------------------------------------------- 1 | ﻿ 2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio Express 14 for Windows Desktop 4 | VisualStudioVersion = 14.0.25123.0 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "HugeFileProcessor", "HugeFileProcessor.csproj", "{524EE32A-1411-4B60-9E88-4AEE3B8F2D33}" 7 | EndProject 8 | Global 9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 10 | Debug|Any CPU = Debug|Any CPU 11 | Release|Any CPU = Release|Any CPU 12 | EndGlobalSection 13 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 14 | {524EE32A-1411-4B60-9E88-4AEE3B8F2D33}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 15 | {524EE32A-1411-4B60-9E88-4AEE3B8F2D33}.Debug|Any CPU.Build.0 = Debug|Any CPU 16 | {524EE32A-1411-4B60-9E88-4AEE3B8F2D33}.Release|Any CPU.ActiveCfg = Release|Any CPU 17 | {524EE32A-1411-4B60-9E88-4AEE3B8F2D33}.Release|Any CPU.Build.0 = Release|Any CPU 18 | EndGlobalSection 19 | GlobalSection(SolutionProperties) = preSolution 20 | HideSolutionNode = FALSE 21 | EndGlobalSection 22 | EndGlobal 23 | -------------------------------------------------------------------------------- /src/HugeFileProcessor/Program.cs: --------------------------------------------------------------------------------
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Reflection;
using System.Runtime;
using System.Text;
using System.Threading.Tasks;

namespace HugeFileProcessor
{
    /// <summary>
    /// Command-line tool that shuffles, splits, counts and chunks text files that are
    /// too large to fit in memory, always reading the source sequentially.
    /// </summary>
    internal sealed class Program
    {
        /// <summary>Progress is reported to the console once per this many lines.</summary>
        private const int VERBOSE_LINES_COUNT = 100000;

        /// <summary>Encoding used for every file read and written; overridable with <c>-enc</c>.</summary>
        private static Encoding encoding = Encoding.Default;

        /// <summary>Mono display name, or <c>null</c> when not running on Mono. Set once in <see cref="Main"/>.</summary>
        private static string MonoVersion = null;

        /// <summary>
        /// Entry point: handles the optional <c>-enc</c> prefix, dispatches the command, and
        /// prints usage and exits with code 1 when the command line is not understood.
        /// </summary>
        /// <param name="args">Raw command-line arguments.</param>
        static void Main(string[] args)
        {
            MonoVersion = GetMonoVersion();

            Console.WriteLine("Checking GC compartibility");
            RunGC();

            if (args.Length > 1 && string.Equals(args[0], "-enc", StringComparison.OrdinalIgnoreCase))
            {
                if (!TrySetEncoding(args[1]))
                {
                    PrintUsage();
                    Environment.Exit(1);
                }

                // Drop "-enc <encoding>" so the command becomes args[0].
                args = args.Skip(2).ToArray();
            }
            Console.WriteLine($"Encoding is {encoding.WebName}");
            Console.WriteLine();

            if (TryRunCommand(args))
            {
                return;
            }

            PrintUsage();
            Environment.Exit(1);
        }

        /// <summary>
        /// Sets <see cref="encoding"/> from a code page number or an encoding name.
        /// </summary>
        /// <param name="encodingName">Numeric code page (e.g. "1251") or encoding name (e.g. "utf-8").</param>
        /// <returns><c>false</c> when the encoding is unknown or unsupported.</returns>
        private static bool TrySetEncoding(string encodingName)
        {
            try
            {
                int codePage;
                if (TryParseInt(encodingName, out codePage))
                {
                    Console.WriteLine($"Setting encoding to CodePage {codePage}");
                    encoding = Encoding.GetEncoding(codePage);
                }
                else
                {
                    Console.WriteLine($"Setting encoding to {encodingName}");
                    encoding = Encoding.GetEncoding(encodingName);
                }
                return true;
            }
            catch (Exception ex) when (ex is ArgumentException || ex is NotSupportedException)
            {
                Console.WriteLine($"Unknown encoding '{encodingName}': {ex.Message}");
                return false;
            }
        }

        /// <summary>Culture-independent integer parsing for command-line values.</summary>
        private static bool TryParseInt(string text, out int value)
        {
            return Int32.TryParse(text, NumberStyles.Integer, CultureInfo.InvariantCulture, out value);
        }

        /// <summary>
        /// Validates the arguments of the command in <c>args[0]</c> and runs it.
        /// </summary>
        /// <param name="args">Command-line arguments with any <c>-enc</c> prefix already removed.</param>
        /// <returns><c>true</c> when a command was recognised and executed; <c>false</c> when usage should be shown.</returns>
        private static bool TryRunCommand(string[] args)
        {
            if (args.Length == 0)
            {
                return false;
            }

            // Invariant lowering: a culture-sensitive ToLower() breaks "-SPLIT" under e.g. tr-TR.
            switch (args[0].ToLowerInvariant())
            {
                case "-shuffle":
                {
                    int batchSize;
                    if ((args.Length != 3 && args.Length != 4) || !TryParseInt(args[2], out batchSize) || batchSize <= 0)
                    {
                        return false;
                    }
                    MainShuffle(args[1], batchSize, args.Length == 4 ? args[3] : args[1] + ".shuffled");
                    return true;
                }
                case "-split":
                {
                    if (args.Length != 3 && args.Length != 4)
                    {
                        return false;
                    }

                    // A fraction of test data in form "1/10". To skip creation of test data, use "0/1".
                    string[] splitFracStr = args[2].Split('/');
                    int testFracUp = 0;
                    int testFracDown = 0;
                    if (splitFracStr.Length != 2
                        || !TryParseInt(splitFracStr[0], out testFracUp)
                        || !TryParseInt(splitFracStr[1], out testFracDown)
                        || testFracDown <= 0)
                    {
                        return false;
                    }

                    // Number of lines to process from the file, or -1 for all.
                    int processLinesLimit = -1;
                    if (args.Length == 4 && !TryParseInt(args[3], out processLinesLimit))
                    {
                        return false;
                    }

                    MainSplit(args[1], testFracUp, testFracDown, processLinesLimit);
                    return true;
                }
                case "-count":
                {
                    if (args.Length != 2)
                    {
                        return false;
                    }
                    TimeSpan singlePassTime;
                    int linesCount = CountLines(args[1], out singlePassTime);
                    Console.WriteLine();
                    Console.WriteLine(linesCount);
                    return true;
                }
                case "-chunk":
                {
                    if (args.Length != 3 && args.Length != 4)
                    {
                        return false;
                    }
                    int chunkSize;
                    if (!TryParseInt(args[2], out chunkSize) || chunkSize <= 0)
                    {
                        return false;
                    }
                    int numberOfDigits = -1;
                    if (args.Length == 4 && !TryParseInt(args[3], out numberOfDigits))
                    {
                        return false;
                    }
                    Chunkify(args[1], chunkSize, numberOfDigits);
                    // The original fell through to the usage text and exit code 1 here.
                    return true;
                }
                default:
                    return false;
            }
        }

        /// <summary>Prints the command-line help.</summary>
        private static void PrintUsage()
        {
            Console.WriteLine("Usage :");
            Console.WriteLine("  HugeFileProcessor.exe [-enc <encoding>] <command>");
            Console.WriteLine("");
            Console.WriteLine("Commands :");
            Console.WriteLine(" -split <sourceFile> <test>/<base> [<linesLimit>]\n\tsplits <sourceFile> to test and train, so that test file get (<test>/<base>) fraction of lines.\n\tSet 0 to <test> to skip test file creation.\n\t<linesLimit> - total number of lines to proces from <sourceFile>, set to -1 or skip to read all lines.\n\n");
            Console.WriteLine(" -shuffle <sourceFile> <batchSize> [<outFile>]\n\tshuffles lines from <sourceFile> to <outFile>.\n\t<batchSize> is in lines.\n\n");
            Console.WriteLine(" -count <sourceFile>\n\tjust count lines int <sourceFile>\n\n");
            Console.WriteLine(" -chunk <sourceFile> <chunkSize> [<numberOfDigits>]\n\tsplit <sourceFile> into chunks, each having <chunkSize> lines.\n\tUse <numberOfDigits> to specify number fo digits in the resulting filename. If not specified, will count lines first\n\n");
        }

        #region shuffle
        /// <summary>
        /// Writes the non-blank lines of <paramref name="sourceFileName"/> to
        /// <paramref name="targetFileName"/> in a globally random order, holding at most
        /// <paramref name="batchSizeLines"/> lines in memory and re-reading the source once per batch.
        /// </summary>
        /// <param name="sourceFileName">File to shuffle.</param>
        /// <param name="batchSizeLines">Maximum number of lines kept in memory; must be positive.</param>
        /// <param name="targetFileName">Output file; overwritten if it exists. Must differ from the source.</param>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="batchSizeLines"/> is not positive.</exception>
        /// <exception cref="ArgumentException">Source and target resolve to the same path.</exception>
        private static void MainShuffle(string sourceFileName, int batchSizeLines, string targetFileName)
        {
            if (batchSizeLines <= 0)
            {
                // A zero batch size would never advance the batch loop below.
                throw new ArgumentOutOfRangeException(nameof(batchSizeLines), "Batch size must be a positive number of lines.");
            }
            if (string.Equals(Path.GetFullPath(sourceFileName), Path.GetFullPath(targetFileName), StringComparison.Ordinal))
            {
                // Opening the writer truncates the file, which would destroy the source before it is read.
                throw new ArgumentException("Target file must differ from the source file.", nameof(targetFileName));
            }

            Console.WriteLine($"Shuffling lines from {sourceFileName} to {targetFileName} in batches of {batchSizeLines:N0}");
            Console.WriteLine();

            TimeSpan singlePassTime;
            int linesCount = CountLines(sourceFileName, out singlePassTime);
            int batchesCount = (int)(((long)linesCount + batchSizeLines - 1) / batchSizeLines);
            Console.WriteLine();
            Console.WriteLine($"Expecting {batchesCount:N0} batches, that would take {TimeSpan.FromSeconds(batchesCount * singlePassTime.TotalSeconds)}");
            Console.WriteLine();

            int[] orderArray = GetOrderArray(linesCount);
            Console.WriteLine();

            Console.WriteLine("Writing to file");
            Stopwatch timer = Stopwatch.StartNew();
            using (StreamWriter writer = new StreamWriter(targetFileName, false, encoding))
            {
                int batchIndex = 0;
                // long arithmetic: batchStart + batchSizeLines can exceed Int32.MaxValue for large batch sizes.
                for (long batchStart = 0; batchStart < linesCount; batchStart += batchSizeLines)
                {
                    ++batchIndex;
                    Console.WriteLine($"Starting batch {batchIndex}");
                    long batchEnd = Math.Min(batchStart + batchSizeLines, (long)linesCount) - 1;
                    ProcessBatch(sourceFileName, orderArray, (int)batchStart, (int)batchEnd, writer);
                    TimeSpan took = timer.Elapsed;
                    Console.WriteLine($"Batch done, took {took}, speed is {(batchEnd + 1) / took.TotalSeconds:N0} lps. Remaining {TimeSpan.FromSeconds((batchesCount - batchIndex) * took.TotalSeconds / batchIndex)}");
                    Console.WriteLine();
                }
            }
            Console.WriteLine($"Done, took {timer.Elapsed}");
        }

        /// <summary>
        /// Performs one sequential pass over the source and appends to <paramref name="writer"/>
        /// the lines selected by <c>orderArray[batchStart..batchEnd]</c>, in that order.
        /// </summary>
        /// <param name="sourceFileName">File being shuffled.</param>
        /// <param name="orderArray">Shuffled indices of non-blank source lines.</param>
        /// <param name="batchStart">First index into <paramref name="orderArray"/> (inclusive).</param>
        /// <param name="batchEnd">Last index into <paramref name="orderArray"/> (inclusive).</param>
        /// <param name="writer">Open writer for the shuffled output.</param>
        /// <exception cref="InvalidDataException">The source has fewer non-blank lines than were counted.</exception>
        private static void ProcessBatch(string sourceFileName, int[] orderArray, int batchStart, int batchEnd, StreamWriter writer)
        {
            int batchSize = batchEnd - batchStart + 1;

            // Key = source line index, Value = position inside this batch's output.
            // Sorting by Key lets the file be read strictly front to back.
            KeyValuePair<int, int>[] batchLines = new KeyValuePair<int, int>[batchSize];
            for (int i = 0; i < batchSize; ++i)
            {
                batchLines[i] = new KeyValuePair<int, int>(orderArray[batchStart + i], i);
            }
            Array.Sort(batchLines, (a, b) => a.Key.CompareTo(b.Key));

            string[] writeLines = new string[batchSize];

            using (StreamReader reader = new StreamReader(sourceFileName, encoding))
            {
                int lineIndex = -1;
                string current = null;
                foreach (KeyValuePair<int, int> pair in batchLines)
                {
                    while (lineIndex < pair.Key)
                    {
                        current = reader.ReadLine();
                        if (current == null)
                        {
                            throw new InvalidDataException($"Unexpected end of {sourceFileName}: the file has fewer lines than were counted.");
                        }
                        // CountLines ignores blank lines, so indices must ignore them too;
                        // otherwise blank lines shift the numbering and trailing lines are lost.
                        if (string.IsNullOrWhiteSpace(current))
                        {
                            continue;
                        }
                        ++lineIndex;
                    }
                    writeLines[pair.Value] = current;
                }
            }

            foreach (string writeLine in writeLines)
            {
                writer.WriteLine(writeLine);
            }

            RunGC();
        }

        /// <summary>
        /// Counts the non-blank lines of a file, reporting progress to the console.
        /// </summary>
        /// <param name="fileName">File to read.</param>
        /// <param name="totalTime">Receives the duration of the full pass.</param>
        /// <returns>Number of lines that are not empty or whitespace-only.</returns>
        /// <exception cref="OverflowException">The file has more than <see cref="Int32.MaxValue"/> non-blank lines.</exception>
        private static int CountLines(string fileName, out TimeSpan totalTime)
        {
            Console.WriteLine($"Counting lines in {fileName}");
            Stopwatch timer = Stopwatch.StartNew();

            int linesCount = 0;
            foreach (string s in File.ReadLines(fileName, encoding))
            {
                if (string.IsNullOrWhiteSpace(s))
                {
                    continue;
                }
                checked
                {
                    ++linesCount;
                }

                if (linesCount % VERBOSE_LINES_COUNT == 0)
                {
                    TimeSpan took = timer.Elapsed;
                    Console.WriteLine($"Current count is {linesCount:N0}, took {took}, speed is {linesCount / took.TotalSeconds:N0} lps");
                }
            }
            totalTime = timer.Elapsed;
            Console.WriteLine($"Done. Lines count is {linesCount:N0}, took {totalTime}, speed is {linesCount / totalTime.TotalSeconds:N0} lps");

            return linesCount;
        }

        /// <summary>
        /// Builds the array <c>0..linesCount-1</c> and shuffles it with Fisher-Yates.
        /// </summary>
        /// <param name="linesCount">Number of indices to generate.</param>
        /// <returns>A random permutation of the line indices.</returns>
        private static int[] GetOrderArray(int linesCount)
        {
            Console.WriteLine("Creating order array");
            Stopwatch timer = Stopwatch.StartNew();

            int[] orderArray = new int[linesCount];
            for (int i = 0; i < linesCount; ++i)
            {
                orderArray[i] = i;
            }

            Random rnd = new Random();
            for (int i = 0; i < linesCount - 1; ++i)
            {
                // Pick a partner from the not-yet-fixed tail [i, linesCount).
                int j = i + rnd.Next(linesCount - i);
                int tmp = orderArray[i];
                orderArray[i] = orderArray[j];
                orderArray[j] = tmp;
            }
            Console.WriteLine($"Done, took {timer.Elapsed}");
            return orderArray;
        }
        #endregion

        #region split
        /// <summary>
        /// Splits the source in a single pass into <c>.test</c> and <c>.train</c> files: out of every
        /// <paramref name="testFracDown"/> consecutive lines the first <paramref name="testFracUp"/>
        /// go to test and the rest to train.
        /// </summary>
        /// <param name="sourceFileName">File to split.</param>
        /// <param name="testFracUp">Numerator of the test fraction; 0 or less disables the test file.</param>
        /// <param name="testFracDown">Denominator of the test fraction; must be positive.</param>
        /// <param name="processLinesLimit">Maximum number of source lines to process; negative means all.</param>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="testFracDown"/> is not positive.</exception>
        private static void MainSplit(string sourceFileName, int testFracUp, int testFracDown, int processLinesLimit)
        {
            if (testFracDown <= 0)
            {
                // Zero would raise DivideByZeroException on the first line.
                throw new ArgumentOutOfRangeException(nameof(testFracDown), "Fraction denominator must be positive.");
            }

            bool writeTest = testFracUp > 0;
            if (writeTest)
            {
                Console.WriteLine($"Splitting lines from {sourceFileName} into .test and .train parts. Test gets {testFracUp}/{testFracDown}, train gets {testFracDown - testFracUp}/{testFracDown}");
            }
            else
            {
                Console.WriteLine($"Processing lines from {sourceFileName} into .train");
            }

            string outFileInfix = "";
            if (processLinesLimit < 0)
            {
                processLinesLimit = Int32.MaxValue;
            }
            else
            {
                outFileInfix = "." + processLinesLimit.ToString(CultureInfo.InvariantCulture);
                Console.WriteLine($"Limiting total lines number to {processLinesLimit}");
            }

            using (StreamWriter trainWriter = new StreamWriter(sourceFileName + outFileInfix + ".train", false, encoding))
            using (StreamWriter testWriter = writeTest ? new StreamWriter(sourceFileName + outFileInfix + ".test", false, encoding) : null)
            {
                int lineIndex = 0;
                // A limit of 0 must produce empty outputs; the limit check inside the loop only fires after a write.
                if (processLinesLimit > 0)
                {
                    foreach (string sourceLine in File.ReadLines(sourceFileName, encoding))
                    {
                        ++lineIndex;
                        if (writeTest && (lineIndex - 1) % testFracDown < testFracUp)
                        {
                            testWriter.WriteLine(sourceLine);
                        }
                        else
                        {
                            trainWriter.WriteLine(sourceLine);
                        }
                        if (lineIndex % VERBOSE_LINES_COUNT == 0)
                        {
                            Console.WriteLine($"Processed {lineIndex} lines");
                        }
                        if (lineIndex >= processLinesLimit)
                        {
                            Console.WriteLine($"Processed up to limit. Stopping");
                            break;
                        }
                    }
                }
                Console.WriteLine($"Finished. Processed {lineIndex} lines");
            }
        }
        #endregion

        /// <summary>
        /// Splits the source into consecutive files of exactly <paramref name="chunkSize"/> lines each
        /// (the last one may be shorter), named by inserting a zero-padded chunk number before the extension.
        /// </summary>
        /// <param name="sourceFileName">File to split.</param>
        /// <param name="chunkSize">Lines per chunk; must be positive.</param>
        /// <param name="numberOfDigits">Width of the chunk number; negative means derive it from a line count.</param>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="chunkSize"/> is not positive.</exception>
        private static void Chunkify(string sourceFileName, int chunkSize, int numberOfDigits)
        {
            if (chunkSize <= 0)
            {
                throw new ArgumentOutOfRangeException(nameof(chunkSize), "Chunk size must be a positive number of lines.");
            }

            Console.WriteLine($"Splitting {sourceFileName} into chunks each, having {chunkSize} lines");
            int chunksCount = -1;
            if (numberOfDigits < 0)
            {
                Console.WriteLine($"numberOfDigits is not specified, counting lines");
                TimeSpan timespan;
                // CountLines ignores blank lines while chunking keeps them, so this is only an
                // estimate; the "D<n>" format is a minimum width, so larger numbers still work.
                int linesCount = CountLines(sourceFileName, out timespan);
                chunksCount = (linesCount / chunkSize) + (linesCount % chunkSize > 0 ? 1 : 0);
                numberOfDigits = 0;
                for (int tmp = chunksCount; tmp > 0; tmp /= 10)
                {
                    ++numberOfDigits;
                }
                Console.WriteLine($"File contains {linesCount}, which gives {chunksCount} and {numberOfDigits} digits");
            }

            Console.WriteLine($"Starting");
            int linesInChunk = 0;
            int chunkIndex = 1;
            StreamWriter writer = null;
            try
            {
                NewChunkWriter(ref writer, sourceFileName, chunkIndex, chunksCount, numberOfDigits);
                foreach (string sourceLine in File.ReadLines(sourceFileName, encoding))
                {
                    // Roll over before writing, so every chunk holds exactly chunkSize lines
                    // (the original gave every chunk after the first one line too many).
                    if (linesInChunk >= chunkSize)
                    {
                        linesInChunk = 0;
                        ++chunkIndex;
                        NewChunkWriter(ref writer, sourceFileName, chunkIndex, chunksCount, numberOfDigits);
                    }
                    writer.WriteLine(sourceLine);
                    ++linesInChunk;
                }
            }
            finally
            {
                if (writer != null)
                {
                    writer.Close();
                    writer = null;
                }
            }
            Console.WriteLine($"Done");
        }

        /// <summary>
        /// Closes the current chunk writer, if any, and opens the writer for chunk <paramref name="chunkIndex"/>.
        /// </summary>
        /// <param name="writer">Current writer; replaced with the new one.</param>
        /// <param name="sourceFileName">Source path the chunk file name is derived from.</param>
        /// <param name="chunkIndex">1-based chunk number.</param>
        /// <param name="chunksCount">Total number of chunks for the progress message, or negative if unknown.</param>
        /// <param name="numberOfDigits">Minimum width of the zero-padded chunk number.</param>
        private static void NewChunkWriter(ref StreamWriter writer, string sourceFileName, int chunkIndex, int chunksCount, int numberOfDigits)
        {
            if (writer != null)
            {
                writer.Close();
                writer = null;
            }
            // "data.txt" -> "data.001.txt": the chunk number is inserted before the original extension.
            string fileName = Path.ChangeExtension(sourceFileName, chunkIndex.ToString("D" + numberOfDigits) + Path.GetExtension(sourceFileName));
            Console.WriteLine($"Writing to file {Path.GetFileName(fileName)}, chunk {chunkIndex} of " + (chunksCount >= 0 ? chunksCount.ToString() : "?"));
            writer = new StreamWriter(fileName, false, encoding);
        }

        /// <summary>
        /// Detects Mono and returns its display name.
        /// </summary>
        /// <returns>
        /// <c>null</c> when not running on Mono; otherwise the runtime's display name, or
        /// <c>"unknown"</c> when running on Mono but the name cannot be obtained.
        /// </returns>
        public static string GetMonoVersion()
        {
            Console.WriteLine("System.Environment.Version = " + System.Environment.Version);
            Type monoRuntimeType = Type.GetType("Mono.Runtime");
            if (monoRuntimeType == null)
            {
                Console.WriteLine("Not running on Mono");
                return null;
            }
            //see http://stackoverflow.com/questions/8413922/programmatically-determining-mono-runtime-version
            MethodInfo mi = monoRuntimeType.GetMethod("GetDisplayName", BindingFlags.NonPublic | BindingFlags.Static | BindingFlags.IgnoreCase);

            // GetDisplayName is a non-public API and may be absent; stay non-null so callers still see "Mono".
            string result = (mi != null ? mi.Invoke(null, null) as string : null) ?? "unknown";

            //sample strings are:
            //3.2.8 (Debian 3.2.8+dfsg-4ubuntu1.1)
            Console.WriteLine("Got Mono version: '" + result + "'");
            return result;
        }

        /// <summary>
        /// Forces a full blocking collection, requesting a one-off large object heap compaction
        /// where the runtime supports it.
        /// </summary>
        public static void RunGC()
        {
            Console.WriteLine($"Starting GC");
            Stopwatch timer = Stopwatch.StartNew();

            if (MonoVersion == null || !MonoVersion.StartsWith("3", StringComparison.Ordinal))
            {
                //no LargeObjectHeapCompactionMode in Mono 3
                SetupLohMode();
            }
            GC.Collect();
            GC.WaitForPendingFinalizers();
            if (MonoVersion == null)
            {
                //WaitForFullGCComplete throws NotImplementedException in Mono 3 and 4
                GC.WaitForFullGCComplete();
            }
            else
            {
                Console.WriteLine("Mono detected, skipping GC.WaitForFullGCComplete()");
            }
            GC.Collect();
            Console.WriteLine($"GC collection including LOH done in {timer.Elapsed}");
        }

        /// <summary>
        /// Requests large object heap compaction on the next blocking collection.
        /// </summary>
        /// <remarks>
        /// Kept in its own non-inlined method: on Mono 3 merely compiling a method that references
        /// <c>GCLargeObjectHeapCompactionMode</c> throws <see cref="TypeLoadException"/>.
        /// </remarks>
        [System.Runtime.CompilerServices.MethodImpl(System.Runtime.CompilerServices.MethodImplOptions.NoInlining)]
        public static void SetupLohMode()
        {
            GCSettings.LargeObjectHeapCompactionMode = GCLargeObjectHeapCompactionMode.CompactOnce;
        }
    }
}
-------------------------------------------------------------------------------- /src/HugeFileProcessor/Properties/AssemblyInfo.cs: -------------------------------------------------------------------------------- 1 | using System.Reflection; 2 | using System.Runtime.CompilerServices; 3 | using System.Runtime.InteropServices; 4 | 5 | // General Information about an assembly is controlled through the following 6 | // set of attributes. Change these attribute values to modify the information 7 | // associated with an assembly. 8 | [assembly: AssemblyTitle("HugeFileProcessor")] 9 | [assembly: AssemblyDescription("")] 10 | [assembly: AssemblyConfiguration("")] 11 | [assembly: AssemblyCompany("")] 12 | [assembly: AssemblyProduct("HugeFileProcessor")] 13 | [assembly: AssemblyCopyright("Copyright © 2016")] 14 | [assembly: AssemblyTrademark("")] 15 | [assembly: AssemblyCulture("")] 16 | 17 | // Setting ComVisible to false makes the types in this assembly not visible 18 | // to COM components. If you need to access a type in this assembly from 19 | // COM, set the ComVisible attribute to true on that type. 20 | [assembly: ComVisible(false)] 21 | 22 | // The following GUID is for the ID of the typelib if this project is exposed to COM 23 | [assembly: Guid("524ee32a-1411-4b60-9e88-4aee3b8f2d33")] 24 | 25 | // Version information for an assembly consists of the following four values: 26 | // 27 | // Major Version 28 | // Minor Version 29 | // Build Number 30 | // Revision 31 | // 32 | // You can specify all the values or you can default the Build and Revision Numbers 33 | // by using the '*' as shown below: 34 | // [assembly: AssemblyVersion("1.0.*")] 35 | [assembly: AssemblyVersion("1.2.0.0")] 36 | [assembly: AssemblyFileVersion("1.2.0.0")] 37 | --------------------------------------------------------------------------------