├── .gitignore
├── LICENSE
├── README.md
└── src
    └── HugeFileProcessor
        ├── App.config
        ├── HugeFileProcessor.csproj
        ├── HugeFileProcessor.sln
        ├── Program.cs
        └── Properties
            └── AssemblyInfo.cs


/.gitignore:
--------------------------------------------------------------------------------
  1 | ## Ignore Visual Studio temporary files, build results, and
  2 | ## files generated by popular Visual Studio add-ons.
  3 | 
  4 | # User-specific files
  5 | *.suo
  6 | *.user
  7 | *.userosscache
  8 | *.sln.docstates
  9 | 
 10 | # User-specific files (MonoDevelop/Xamarin Studio)
 11 | *.userprefs
 12 | 
 13 | # Build results
 14 | [Dd]ebug/
 15 | [Dd]ebugPublic/
 16 | [Rr]elease/
 17 | [Rr]eleases/
 18 | [Xx]64/
 19 | [Xx]86/
 20 | [Bb]uild/
 21 | bld/
 22 | [Bb]in/
 23 | [Oo]bj/
 24 | 
 25 | # Visual Studio 2015 cache/options directory
 26 | .vs/
 27 | # Uncomment if you have tasks that create the project's static files in wwwroot
 28 | #wwwroot/
 29 | 
 30 | # MSTest test Results
 31 | [Tt]est[Rr]esult*/
 32 | [Bb]uild[Ll]og.*
 33 | 
 34 | # NUNIT
 35 | *.VisualState.xml
 36 | TestResult.xml
 37 | 
 38 | # Build Results of an ATL Project
 39 | [Dd]ebugPS/
 40 | [Rr]eleasePS/
 41 | dlldata.c
 42 | 
 43 | # DNX
 44 | project.lock.json
 45 | artifacts/
 46 | 
 47 | *_i.c
 48 | *_p.c
 49 | *_i.h
 50 | *.ilk
 51 | *.meta
 52 | *.obj
 53 | *.pch
 54 | *.pdb
 55 | *.pgc
 56 | *.pgd
 57 | *.rsp
 58 | *.sbr
 59 | *.tlb
 60 | *.tli
 61 | *.tlh
 62 | *.tmp
 63 | *.tmp_proj
 64 | *.log
 65 | *.vspscc
 66 | *.vssscc
 67 | .builds
 68 | *.pidb
 69 | *.svclog
 70 | *.scc
 71 | 
 72 | # Chutzpah Test files
 73 | _Chutzpah*
 74 | 
 75 | # Visual C++ cache files
 76 | ipch/
 77 | *.aps
 78 | *.ncb
 79 | *.opendb
 80 | *.opensdf
 81 | *.sdf
 82 | *.cachefile
 83 | *.VC.db
 84 | 
 85 | # Visual Studio profiler
 86 | *.psess
 87 | *.vsp
 88 | *.vspx
 89 | *.sap
 90 | 
 91 | # TFS 2012 Local Workspace
 92 | $tf/
 93 | 
 94 | # Guidance Automation Toolkit
 95 | *.gpState
 96 | 
 97 | # ReSharper is a .NET coding add-in
 98 | _ReSharper*/
 99 | *.[Rr]e[Ss]harper
100 | *.DotSettings.user
101 | 
102 | # JustCode is a .NET coding add-in
103 | .JustCode
104 | 
105 | # TeamCity is a build add-in
106 | _TeamCity*
107 | 
108 | # DotCover is a Code Coverage Tool
109 | *.dotCover
110 | 
111 | # NCrunch
112 | _NCrunch_*
113 | .*crunch*.local.xml
114 | nCrunchTemp_*
115 | 
116 | # MightyMoose
117 | *.mm.*
118 | AutoTest.Net/
119 | 
120 | # Web workbench (sass)
121 | .sass-cache/
122 | 
123 | # Installshield output folder
124 | [Ee]xpress/
125 | 
126 | # DocProject is a documentation generator add-in
127 | DocProject/buildhelp/
128 | DocProject/Help/*.HxT
129 | DocProject/Help/*.HxC
130 | DocProject/Help/*.hhc
131 | DocProject/Help/*.hhk
132 | DocProject/Help/*.hhp
133 | DocProject/Help/Html2
134 | DocProject/Help/html
135 | 
136 | # Click-Once directory
137 | publish/
138 | 
139 | # Publish Web Output
140 | *.[Pp]ublish.xml
141 | *.azurePubxml
142 | 
143 | # TODO: Un-comment the next line if you do not want to checkin 
144 | # your web deploy settings because they may include unencrypted
145 | # passwords
146 | #*.pubxml
147 | *.publishproj
148 | 
149 | # NuGet Packages
150 | *.nupkg
151 | # The packages folder can be ignored because of Package Restore
152 | **/packages/*
153 | # except build/, which is used as an MSBuild target.
154 | !**/packages/build/
155 | # Uncomment if necessary however generally it will be regenerated when needed
156 | #!**/packages/repositories.config
157 | # NuGet v3's project.json files produces more ignoreable files
158 | *.nuget.props
159 | *.nuget.targets
160 | 
161 | # Microsoft Azure Build Output
162 | csx/
163 | *.build.csdef
164 | 
165 | # Microsoft Azure Emulator
166 | ecf/
167 | rcf/
168 | 
169 | # Microsoft Azure ApplicationInsights config file
170 | ApplicationInsights.config
171 | 
172 | # Windows Store app package directory
173 | AppPackages/
174 | BundleArtifacts/
175 | 
176 | # Visual Studio cache files
177 | # files ending in .cache can be ignored
178 | *.[Cc]ache
179 | # but keep track of directories ending in .cache
180 | !*.[Cc]ache/
181 | 
182 | # Others
183 | ClientBin/
184 | [Ss]tyle[Cc]op.*
185 | ~$*
186 | *~
187 | *.dbmdl
188 | *.dbproj.schemaview
189 | *.pfx
190 | *.publishsettings
191 | node_modules/
192 | orleans.codegen.cs
193 | 
194 | # RIA/Silverlight projects
195 | Generated_Code/
196 | 
197 | # Backup & report files from converting an old project file
198 | # to a newer Visual Studio version. Backup files are not needed,
199 | # because we have git ;-)
200 | _UpgradeReport_Files/
201 | Backup*/
202 | UpgradeLog*.XML
203 | UpgradeLog*.htm
204 | 
205 | # SQL Server files
206 | *.mdf
207 | *.ldf
208 | 
209 | # Business Intelligence projects
210 | *.rdl.data
211 | *.bim.layout
212 | *.bim_*.settings
213 | 
214 | # Microsoft Fakes
215 | FakesAssemblies/
216 | 
217 | # GhostDoc plugin setting file
218 | *.GhostDoc.xml
219 | 
220 | # Node.js Tools for Visual Studio
221 | .ntvs_analysis.dat
222 | 
223 | # Visual Studio 6 build log
224 | *.plg
225 | 
226 | # Visual Studio 6 workspace options file
227 | *.opt
228 | 
229 | # Visual Studio LightSwitch build output
230 | **/*.HTMLClient/GeneratedArtifacts
231 | **/*.DesktopClient/GeneratedArtifacts
232 | **/*.DesktopClient/ModelManifest.xml
233 | **/*.Server/GeneratedArtifacts
234 | **/*.Server/ModelManifest.xml
235 | _Pvt_Extensions
236 | 
237 | # LightSwitch generated files
238 | GeneratedArtifacts/
239 | ModelManifest.xml
240 | 
241 | # Paket dependency manager
242 | .paket/paket.exe
243 | 
244 | # FAKE - F# Make
245 | .fake/


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2016 Mikhail Barg
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # HugeFileProcessor
 2 | A utility to randomize and split really huge (100+ GB) text files
 3 | 
 4 | # Motivation
 5 | While doing some Machine Learning stuff, I stumbled upon a need to process (mostly shuffle and split) **really huge** text files, where "huge" might mean hundreds of gigabytes. For example, the easiest way to feed data to [CNTK](https://github.com/Microsoft/CNTK) is using a text file. I was amazed that I was not able to find any tool capable of shuffling a huge file without loading it whole into RAM. So I wrote my own. Here it is.
 6 | 
 7 | # Usage
 8 | There are just a few commands that HugeFileProcessor understands:
 9 | * **`-shuffle <sourceFile> <batchSize> [<outFile>]`** 
10 |  
11 | Shuffles lines from *sourceFile* into *outFile* (or into *sourceFile.shuffled* if no *outFile* specified).
12 | 
13 | This mode requires specifying *batchSize* - the number of lines to keep in RAM when writing to output. The more the better (unless you run out of RAM), because total shuffling time would be _(number of lines in sourceFile) / batchSize * (time to fully read sourceFile)_. Please note that the program **shuffles the whole file**, not on a per-batch basis. See the details on shuffling below.
14 | 
15 | * **`-split <sourceFile> <test>/<base> [<linesLimit>]`**
16 | 
17 | Splits _sourceFile_ into _sourceFile.test_ and _sourceFile.train_. _.test_ would get _test/base_ lines of _sourceFile_, and _.train_ would get _(base-test)/base_ lines. This is done in a single pass through the _sourceFile_, so it's faster than using _head_/_tail_.
18 | 
19 | If _linesLimit_ is specified, then only first _linesLimit_ lines of _sourceFile_ are processed.
20 | 
21 | If _test_ is set to 0, then no .test file is created and all lines go into the .train file. When combined with _linesLimit_ this is equal to calling `head -n <linesLimit>`.
22 | 
23 | * **`-count <sourceFile>`**
24 | 
25 | Just count number of lines in _sourceFile_.
26 | 
27 | # Running the program
28 | 
29 | The program is written in C#. It requires .Net 4.5.1 or greater. 
30 | 
31 | To run on linux use Mono: `mono HugeFileProcessor.exe <options>`
32 | 
33 | # Shuffling
34 | 
35 | Here are the details on shuffling implementation. The algorithm is as follows.
36 | 
37 | 1. Count lines in _sourceFile_. This is done simply by reading the whole file line-by-line. (See some comparisons [here](http://cc.davelozinski.com/c-sharp/fastest-way-to-read-text-files).) This also gives a measurement of how much time it would take to read the whole file once. So we can estimate how long a complete shuffle would take, because it requires _Ceil(linesCount / batchSize)_ complete file reads.
38 | 
39 | 2. As we now know the total _linesCount_, we can create an index array of _linesCount_ size and shuffle it using [Fisher–Yates](https://en.wikipedia.org/wiki/Fisher–Yates_shuffle) (called _orderArray_ in the code). This would give us an order in which we want to have lines in a shuffled file. Note that this is a global order over the whole file, not per batch or chunk or something.
40 | 
41 | 3. Now the actual code. We need to get all lines from _sourceFile_ in the order we just computed, but we can't read the whole file into memory. So we just split the task. 
42 |  * We would go through the _sourceFile_ reading all lines and storing in memory only those lines that would be in first _batchSize_ of the _orderArray_. When we get all these lines, we could write them into _outFile_ in required order, and it's a _batchSize_/_linesCount_ of work done.
43 |  * Next we would repeat whole process again and again taking next parts of _orderArray_ and reading _sourceFile_ from start to end for each part. Eventually the whole _orderArray_ is processed and we are done.
44 | 
45 | #### Why does it work?
46 | 
47 | Because all we do is read the source file from start to end. No seeks forward/backward, and that's what HDDs like. The file gets read in chunks according to internal HDD buffers, FS blocks, CPU cache, etc. and everything is being read sequentially.
48 | 
49 | #### Some numbers
50 | On my machine (Core i5, 16GB RAM, Win8.1, HDD Toshiba DT01ACA200 2TB, NTFS) I was able to shuffle a file of 132 GB (84 000 000 lines) in around 5 hours using _batchSize_ of 3 500 000. With _batchSize_ of 2 000 000 it took around 8 hours. Reading speed was around 118000 lines per second.
51 | 
52 |  
53 | 


--------------------------------------------------------------------------------
/src/HugeFileProcessor/App.config:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="utf-8"?>
2 | <configuration>
3 |     <startup> 
4 |         <supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.5.1"/>
5 |     </startup>
6 | </configuration>
7 | 


--------------------------------------------------------------------------------
/src/HugeFileProcessor/HugeFileProcessor.csproj:
--------------------------------------------------------------------------------
 1 | ﻿<?xml version="1.0" encoding="utf-8"?>
 2 | <Project ToolsVersion="14.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
 3 |   <Import Project="$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props" Condition="Exists('$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props')" />
 4 |   <PropertyGroup>
 5 |     <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
 6 |     <Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
 7 |     <ProjectGuid>{524EE32A-1411-4B60-9E88-4AEE3B8F2D33}</ProjectGuid>
 8 |     <OutputType>Exe</OutputType>
 9 |     <AppDesignerFolder>Properties</AppDesignerFolder>
10 |     <RootNamespace>HugeFileProcessor</RootNamespace>
11 |     <AssemblyName>HugeFileProcessor</AssemblyName>
12 |     <TargetFrameworkVersion>v4.5.1</TargetFrameworkVersion>
13 |     <FileAlignment>512</FileAlignment>
14 |     <TargetFrameworkProfile />
15 |   </PropertyGroup>
16 |   <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
17 |     <PlatformTarget>AnyCPU</PlatformTarget>
18 |     <DebugSymbols>true</DebugSymbols>
19 |     <DebugType>full</DebugType>
20 |     <Optimize>false</Optimize>
21 |     <OutputPath>bin\Debug\</OutputPath>
22 |     <DefineConstants>DEBUG;TRACE</DefineConstants>
23 |     <ErrorReport>prompt</ErrorReport>
24 |     <WarningLevel>4</WarningLevel>
25 |     <Prefer32Bit>false</Prefer32Bit>
26 |     <TreatWarningsAsErrors>true</TreatWarningsAsErrors>
27 |     <UseVSHostingProcess>false</UseVSHostingProcess>
28 |   </PropertyGroup>
29 |   <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
30 |     <PlatformTarget>AnyCPU</PlatformTarget>
31 |     <DebugType>pdbonly</DebugType>
32 |     <Optimize>true</Optimize>
33 |     <OutputPath>bin\Release\</OutputPath>
34 |     <DefineConstants>TRACE</DefineConstants>
35 |     <ErrorReport>prompt</ErrorReport>
36 |     <WarningLevel>4</WarningLevel>
37 |     <Prefer32Bit>false</Prefer32Bit>
38 |     <TreatWarningsAsErrors>true</TreatWarningsAsErrors>
39 |     <UseVSHostingProcess>false</UseVSHostingProcess>
40 |   </PropertyGroup>
41 |   <ItemGroup>
42 |     <Reference Include="System" />
43 |     <Reference Include="System.Core" />
44 |     <Reference Include="System.Xml.Linq" />
45 |     <Reference Include="System.Data.DataSetExtensions" />
46 |     <Reference Include="Microsoft.CSharp" />
47 |     <Reference Include="System.Data" />
48 |     <Reference Include="System.Net.Http" />
49 |     <Reference Include="System.Xml" />
50 |   </ItemGroup>
51 |   <ItemGroup>
52 |     <Compile Include="Program.cs" />
53 |     <Compile Include="Properties\AssemblyInfo.cs" />
54 |   </ItemGroup>
55 |   <ItemGroup>
56 |     <None Include="App.config" />
57 |   </ItemGroup>
58 |   <Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
59 |   <!-- To modify your build process, add your task inside one of the targets below and uncomment it. 
60 |        Other similar extension points exist, see Microsoft.Common.targets.
61 |   <Target Name="BeforeBuild">
62 |   </Target>
63 |   <Target Name="AfterBuild">
64 |   </Target>
65 |   -->
66 | </Project>


--------------------------------------------------------------------------------
/src/HugeFileProcessor/HugeFileProcessor.sln:
--------------------------------------------------------------------------------
 1 | ﻿
 2 | Microsoft Visual Studio Solution File, Format Version 12.00
 3 | # Visual Studio Express 14 for Windows Desktop
 4 | VisualStudioVersion = 14.0.25123.0
 5 | MinimumVisualStudioVersion = 10.0.40219.1
 6 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "HugeFileProcessor", "HugeFileProcessor.csproj", "{524EE32A-1411-4B60-9E88-4AEE3B8F2D33}"
 7 | EndProject
 8 | Global
 9 | 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
10 | 		Debug|Any CPU = Debug|Any CPU
11 | 		Release|Any CPU = Release|Any CPU
12 | 	EndGlobalSection
13 | 	GlobalSection(ProjectConfigurationPlatforms) = postSolution
14 | 		{524EE32A-1411-4B60-9E88-4AEE3B8F2D33}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
15 | 		{524EE32A-1411-4B60-9E88-4AEE3B8F2D33}.Debug|Any CPU.Build.0 = Debug|Any CPU
16 | 		{524EE32A-1411-4B60-9E88-4AEE3B8F2D33}.Release|Any CPU.ActiveCfg = Release|Any CPU
17 | 		{524EE32A-1411-4B60-9E88-4AEE3B8F2D33}.Release|Any CPU.Build.0 = Release|Any CPU
18 | 	EndGlobalSection
19 | 	GlobalSection(SolutionProperties) = preSolution
20 | 		HideSolutionNode = FALSE
21 | 	EndGlobalSection
22 | EndGlobal
23 | 


--------------------------------------------------------------------------------
/src/HugeFileProcessor/Program.cs:
--------------------------------------------------------------------------------
  1 | ﻿using System;
  2 | using System.Collections.Generic;
  3 | using System.IO;
  4 | using System.Linq;
  5 | using System.Reflection;
  6 | using System.Runtime;
  7 | using System.Text;
  8 | using System.Threading.Tasks;
  9 | 
 10 | namespace HugeFileProcessor
 11 | {
 12 |     internal sealed class Program
 13 |     {
 14 |         private const int VERBOSE_LINES_COUNT = 100000;    // progress is printed whenever a line counter reaches a multiple of this value
 15 | 
 16 |         private static Encoding encoding = Encoding.Default;    // passed to the file readers and writers below; Main replaces it when "-enc" is given
 17 | 
 18 |         private static string MonoVersion = null;    // assigned from GetMonoVersion() at the start of Main
 19 | 
 20 |         /// <summary>
 21 |         /// Entry point: applies the optional "-enc" switch, then runs the requested command.
 22 |         /// Prints the usage text and exits with code 1 when the arguments match no command.
 23 |         /// </summary>
 24 |         /// <param name="args">Optional "-enc" and encoding name, followed by a command and its parameters.</param>
 25 |         static void Main(string[] args)
 26 |         {
 27 |             MonoVersion = GetMonoVersion();
 28 | 
 29 |             Console.WriteLine("Checking GC compartibility");
 30 |             RunGC();
 31 | 
 32 |             // ToLowerInvariant keeps switch matching independent of the current culture.
 33 |             if (args.Length > 1 && args[0].ToLowerInvariant() == "-enc")
 34 |             {
 35 |                 string encodingName = args[1];
 36 |                 int codePage;
 37 |                 if (Int32.TryParse(encodingName, out codePage))
 38 |                 {
 39 |                     Console.WriteLine($"Setting encoding to CodePage {codePage}");
 40 |                     encoding = Encoding.GetEncoding(codePage);
 41 |                 }
 42 |                 else
 43 |                 {
 44 |                     Console.WriteLine($"Setting encoding to {encodingName}");
 45 |                     encoding = Encoding.GetEncoding(encodingName);
 46 |                 }
 47 | 
 48 |                 // Drop the two consumed arguments so the command is at index 0 again.
 49 |                 string[] argsCopy = new string[args.Length - 2];
 50 |                 Array.Copy(args, 2, argsCopy, 0, argsCopy.Length);
 51 |                 args = argsCopy;
 52 |             }
 53 |             Console.WriteLine($"Encoding is {encoding.WebName}");
 54 |             Console.WriteLine();
 55 | 
 56 |             if (args.Length > 0)
 57 |             {
 58 |                 switch (args[0].ToLowerInvariant())
 59 |                 {
 60 |                 case "-shuffle":
 61 |                     if (args.Length == 3 || args.Length == 4)
 62 |                     {
 63 |                         MainShuffle(args[1], Int32.Parse(args[2]), args.Length == 4 ? args[3] : args[1] + ".shuffled");
 64 |                         return;
 65 |                     }
 66 |                     break;
 67 |                 case "-split":
 68 |                     if (args.Length == 3 || args.Length == 4)
 69 |                     {
 70 |                         string[] splitFracStr = args[2].Split('/');     //a fraction of test data in form "1/10". To skip creation of test data, use "0/1"
 71 |                         int testFracUp = Int32.Parse(splitFracStr[0]);
 72 |                         int testFracDown = Int32.Parse(splitFracStr[1]);
 73 |                         int processLinesLimit = args.Length == 4 ? Int32.Parse(args[3]) : -1;   //number of lines to process from file or -1 if all.
 74 |                         MainSplit(args[1], testFracUp, testFracDown, processLinesLimit);
 75 |                         return;
 76 |                     }
 77 |                     break;
 78 |                 case "-count":
 79 |                     if (args.Length == 2)
 80 |                     {
 81 |                         TimeSpan singlePassTime;
 82 |                         int linesCount = CountLines(args[1], out singlePassTime);
 83 |                         Console.WriteLine();
 84 |                         Console.WriteLine(linesCount);
 85 |                         return;
 86 |                     }
 87 |                     break;
 88 |                 case "-chunk":
 89 |                     if (args.Length == 3 || args.Length == 4)
 90 |                     {
 91 |                         int chunkSize = Int32.Parse(args[2]);
 92 |                         // -1 makes Chunkify count the lines and derive the number of digits itself.
 93 |                         int numberOfDigits = args.Length == 4 ? Int32.Parse(args[3]) : -1;
 94 |                         Chunkify(args[1], chunkSize, numberOfDigits);
 95 |                         return;     // done; leaving the switch instead would print the usage text and exit with code 1
 96 |                     }
 97 |                     break;
 98 |                 }
 99 |             }
100 |             Console.WriteLine("Usage :");
101 |             Console.WriteLine(" HugeFileProcessor.exe [-enc <encoding_name>] <command> <params>");
102 |             Console.WriteLine("");
103 |             Console.WriteLine("Commands :");
104 |             Console.WriteLine(" -split <sourceFile> <test>/<base> [<linesLimit>]\n\tsplits <sourceFile> to test and train, so that test file get (<test>/<base>) fraction of lines.\n\tSet 0 to <test> to skip test file creation.\n\t<linesLimit> - total number of lines to proces from <sourceFile>, set to -1 or skip to read all lines.\n\n");
105 |             Console.WriteLine(" -shuffle <sourceFile> <batchSize> [<outFile>]\n\tshuffles lines from <sourceFile> to <outFile>.\n\t<batchSize> is in lines.\n\n");
106 |             Console.WriteLine(" -count <sourceFile>\n\tjust count lines int <sourceFile>\n\n");
107 |             Console.WriteLine(" -chunk <sourceFile> <chunkSize> [<numberOfDigits>]\n\tsplit <sourceFile> into chunks, each having <chunkSize> lines.\n\tUse <numberOfDigits> to specify number fo digits in the resulting filename. If not specified, will count lines first\n\n");
108 |             Environment.Exit(1);
109 |         }
110 | 
111 |         #region shuffle
112 |         /// <summary>
113 |         /// Shuffles the lines of sourceFileName into targetFileName. The source is re-read from the start
114 |         /// once per batch of batchSizeLines output lines, so at most one batch of line strings is buffered at a time.
115 |         /// </summary>
116 |         /// <exception cref="ArgumentOutOfRangeException">batchSizeLines is zero or negative.</exception>
117 |         private static void MainShuffle(string sourceFileName, int batchSizeLines, string targetFileName)
118 |         {
119 |             if (batchSizeLines <= 0)    // a zero or negative step would keep the batch loop below from ever finishing
120 |             {
121 |                 throw new ArgumentOutOfRangeException(nameof(batchSizeLines), batchSizeLines, "Batch size must be a positive number of lines.");
122 |             }
123 |             Console.WriteLine($"Shuffling lines from {sourceFileName} to {targetFileName} in batches of {batchSizeLines:N0}");
124 |             Console.WriteLine();
125 |             TimeSpan singlePassTime;
126 |             int linesCount = CountLines(sourceFileName, out singlePassTime);
127 |             int batchesCount = (int)Math.Ceiling(linesCount * 1.0 / batchSizeLines);
128 |             Console.WriteLine();
129 |             Console.WriteLine($"Expecting {batchesCount:N0} batches, that would take {TimeSpan.FromSeconds(batchesCount * singlePassTime.TotalSeconds)}");
130 |             Console.WriteLine();
131 |             int[] orderArray = GetOrderArray(linesCount);
132 |             Console.WriteLine();
133 |             Console.WriteLine("Writing to file");
134 |             DateTime start = DateTime.UtcNow;
135 |             using (StreamWriter writer = new StreamWriter(targetFileName, false, encoding))
136 |             {
137 |                 for (int batchStart = 0, batchIndex = 1; batchStart < linesCount; batchStart += batchSizeLines, ++batchIndex)
138 |                 {
139 |                     Console.WriteLine($"Starting batch {batchIndex}");
140 |                     int batchEnd = Math.Min(batchStart + batchSizeLines, linesCount) - 1;    // inclusive index of the last line of this batch
141 |                     ProcessBatch(sourceFileName, orderArray, batchStart, batchEnd, writer);
142 |                     TimeSpan took = DateTime.UtcNow - start;
143 |                     Console.WriteLine($"Batch done, took {took}, speed is {(batchEnd + 1) / took.TotalSeconds:N0} lps. Remaining {TimeSpan.FromSeconds((batchesCount - batchIndex) * took.TotalSeconds / batchIndex)}");
144 |                     Console.WriteLine();
145 |                 }
146 |             }
147 |             Console.WriteLine($"Done, took {DateTime.UtcNow - start}");
148 |         }
149 | 
150 |         /// <summary>Writes the source lines with indices orderArray[batchStart..batchEnd] (both inclusive) to writer, in that order.</summary>
151 |         /// <remarks>Line indices skip empty and whitespace-only lines, matching the way CountLines counts them.</remarks>
152 |         /// <exception cref="EndOfStreamException">The source ends before a requested line index is reached.</exception>
153 |         private static void ProcessBatch(string sourceFileName, int[] orderArray, int batchStart, int batchEnd, StreamWriter writer)
154 |         {
155 |             int batchSize = batchEnd - batchStart + 1;
156 |             KeyValuePair<int, int>[] batchLines = new KeyValuePair<int, int>[batchSize];    // Key = line index in the source, Value = slot inside this batch
157 |             for (int i = 0; i < batchSize; ++i)
158 |             {
159 |                 batchLines[i] = new KeyValuePair<int, int>(orderArray[batchStart + i], i);
160 |             }
161 |             Array.Sort(batchLines, (a, b) => a.Key.CompareTo(b.Key));    // ascending source order, so one forward pass collects every wanted line
162 |             string[] writeLines = new string[batchSize];
163 |             using (StreamReader reader = new StreamReader(sourceFileName, encoding))
164 |             {
165 |                 int lineIndex = -1;
166 |                 foreach (KeyValuePair<int, int> pair in batchLines)
167 |                 {
168 |                     string s = null;
169 |                     while (lineIndex < pair.Key)
170 |                     {
171 |                         s = reader.ReadLine();
172 |                         if (s == null)
173 |                         {
174 |                             throw new EndOfStreamException($"{sourceFileName} ended before line {pair.Key} was reached; did the file change after its lines were counted?");
175 |                         }
176 |                         lineIndex += string.IsNullOrWhiteSpace(s) ? 0 : 1;    // blank lines are not counted by CountLines, so they must not consume an index
177 |                     }
178 |                     writeLines[pair.Value] = s;
179 |                 }
180 |             }
181 |             foreach (string writeLine in writeLines)
182 |             {
183 |                 writer.WriteLine(writeLine);
184 |             }
185 |             RunGC();
186 |         }
187 | 
188 |         /// <summary>
189 |         /// Counts the lines of fileName that are neither empty nor whitespace-only, printing progress along the way.
190 |         /// </summary>
191 |         /// <param name="totalTime">Receives the wall-clock time the whole pass took.</param>
192 |         /// <returns>The number of counted lines.</returns>
193 |         private static int CountLines(string fileName, out TimeSpan totalTime)
194 |         {
195 |             Console.WriteLine($"Counting lines in {fileName}");
196 |             DateTime startedAt = DateTime.UtcNow;
197 |             int counted = 0;
198 |             foreach (string line in File.ReadLines(fileName, encoding).Where(l => !string.IsNullOrWhiteSpace(l)))
199 |             {
200 |                 ++counted;
201 |                 // Progress is reported only on every VERBOSE_LINES_COUNT-th counted line.
202 |                 if (counted % VERBOSE_LINES_COUNT != 0)
203 |                 {
204 |                     continue;
205 |                 }
206 |                 TimeSpan elapsed = DateTime.UtcNow - startedAt;
207 |                 Console.WriteLine($"Current count is {counted:N0}, took {elapsed}, speed is {counted / elapsed.TotalSeconds:N0} lps");
208 |             }
209 |             totalTime = DateTime.UtcNow - startedAt;
210 |             Console.WriteLine($"Done. Lines count is {counted:N0}, took {totalTime}, speed is {counted / totalTime.TotalSeconds:N0} lps");
211 |             return counted;
212 |         }
213 | 
214 |         /// <summary>Returns the indices 0 .. linesCount - 1, each exactly once, in random (Fisher–Yates shuffled) order.</summary>
215 |         private static int[] GetOrderArray(int linesCount)
216 |         {
217 |             Console.WriteLine("Creating order array");
218 |             DateTime startedAt = DateTime.UtcNow;
219 |             Random random = new Random();
220 |             int[] order = new int[linesCount];
221 |             for (int slot = 0; slot < order.Length; ++slot)
222 |             {
223 |                 order[slot] = slot;
224 |             }
225 |             for (int head = 0; head < linesCount - 1; ++head)
226 |             {
227 |                 int pick = head + random.Next(linesCount - head);    // random position in the not yet fixed tail, head itself included
228 |                 int displaced = order[head];
229 |                 order[head] = order[pick];
230 |                 order[pick] = displaced;
231 |             }
232 |             Console.WriteLine($"Done, took {DateTime.UtcNow - startedAt}");
233 |             return order;
234 |         }
235 |         #endregion
236 | 
237 |         #region split
238 |         /// <summary>
239 |         /// Distributes the lines of sourceFileName between a ".test" and a ".train" file: out of every
240 |         /// testFracDown consecutive lines the first testFracUp go to test, the remaining ones to train.
241 |         /// </summary>
242 |         /// <param name="processLinesLimit">Maximum number of source lines to process; a negative value means no limit.</param>
243 |         /// <exception cref="ArgumentOutOfRangeException">testFracDown is zero.</exception>
244 |         private static void MainSplit(string sourceFileName, int testFracUp, int testFracDown, int processLinesLimit)
245 |         {
246 |             if (testFracDown == 0)    // fail before any output file is created instead of dividing by zero on the first line
247 |             {
248 |                 throw new ArgumentOutOfRangeException(nameof(testFracDown), testFracDown, "The base of the test fraction must not be zero.");
249 |             }
250 |             if (testFracUp > 0)
251 |             {
252 |                 Console.WriteLine($"Splitting lines from {sourceFileName} into .test and .train parts. Test gets {testFracUp}/{testFracDown}, train gets {testFracDown-testFracUp}/{testFracDown}");
253 |             }
254 |             else
255 |             {
256 |                 Console.WriteLine($"Processing lines from {sourceFileName} into .train");
257 |             }
258 |             string outFileInfix = "";
259 |             if (processLinesLimit < 0)
260 |             {
261 |                 processLinesLimit = Int32.MaxValue;
262 |             }
263 |             else
264 |             {
265 |                 outFileInfix = "." + processLinesLimit.ToString();
266 |                 Console.WriteLine($"Limiting total lines number to {processLinesLimit}");
267 |             }
268 |             using (StreamWriter trainWriter = new StreamWriter(sourceFileName + outFileInfix + ".train", false, encoding))
269 |             using (StreamWriter testWriter = testFracUp <= 0 ? null : new StreamWriter(sourceFileName + outFileInfix + ".test", false, encoding))
270 |             {
271 |                 int lineIndex = 0;    // number of source lines written so far
272 |                 foreach (string sourceLine in File.ReadLines(sourceFileName, encoding).Take(processLinesLimit))    // Take(0) yields nothing, so a limit of 0 writes no lines
273 |                 {
274 |                     StreamWriter target = lineIndex % testFracDown < testFracUp ? testWriter : trainWriter;    // testWriter is only null when testFracUp <= 0, and then it is never selected
275 |                     target.WriteLine(sourceLine);
276 |                     ++lineIndex;
277 |                     if (lineIndex % VERBOSE_LINES_COUNT == 0)
278 |                     {
279 |                         Console.WriteLine($"Processed {lineIndex} lines");
280 |                     }
281 |                 }
282 |                 if (lineIndex >= processLinesLimit)
283 |                 {
284 |                     Console.WriteLine($"Processed up to limit. Stopping");
285 |                 }
286 |                 Console.WriteLine($"Finished. Processed {lineIndex} lines");
287 |             }
288 |         }
289 |         #endregion
290 | 
291 |         /// <summary>
292 |         /// Splits sourceFileName into consecutive chunks of chunkSize lines each (the last chunk may be shorter).
293 |         /// </summary>
294 |         /// <param name="numberOfDigits">Number of digits for the chunk number; when negative it is derived by counting the lines first.</param>
295 |         /// <exception cref="ArgumentOutOfRangeException">chunkSize is zero or negative.</exception>
296 |         private static void Chunkify(string sourceFileName, int chunkSize, int numberOfDigits)
297 |         {
298 |             if (chunkSize <= 0)
299 |             {
300 |                 throw new ArgumentOutOfRangeException(nameof(chunkSize), chunkSize, "Chunk size must be a positive number of lines.");
301 |             }
302 |             Console.WriteLine($"Splitting {sourceFileName} into chunks each, having {chunkSize} lines");
303 |             int chunksCount = -1;
304 |             if (numberOfDigits < 0)
305 |             {
306 |                 Console.WriteLine($"numberOfDigits is not specified, counting lines");
307 |                 TimeSpan timespan;
308 |                 int linesCount = CountLines(sourceFileName, out timespan);    // NOTE: CountLines skips blank lines, so this can undercount files that contain them
309 |                 chunksCount = (linesCount / chunkSize) + (linesCount % chunkSize > 0 ? 1 : 0);
310 |                 numberOfDigits = 0;
311 |                 for (int tmp = chunksCount; tmp > 0; tmp /= 10)
312 |                 {
313 |                     ++numberOfDigits;
314 |                 }
315 |                 Console.WriteLine($"File contains {linesCount}, which gives {chunksCount} and {numberOfDigits} digits");
316 |             }
317 |             Console.WriteLine($"Starting");
318 |             int linesInChunk = 0;
319 |             int chunkIndex = 1;
320 |             StreamWriter writer = null;
321 |             try
322 |             {
323 |                 NewChunkWriter(ref writer, sourceFileName, chunkIndex, chunksCount, numberOfDigits);
324 |                 foreach (string sourceLine in File.ReadLines(sourceFileName, encoding))
325 |                 {
326 |                     if (linesInChunk == chunkSize)    // the current chunk is full: switch to the next one before writing this line
327 |                     {
328 |                         linesInChunk = 0;
329 |                         ++chunkIndex;
330 |                         NewChunkWriter(ref writer, sourceFileName, chunkIndex, chunksCount, numberOfDigits);
331 |                     }
332 |                     writer.WriteLine(sourceLine);
333 |                     ++linesInChunk;    // counted after the write, so every chunk holds exactly chunkSize lines
334 |                 }
335 |             }
336 |             finally
337 |             {
338 |                 writer?.Close();    // closes the last open chunk on success as well as on failure
339 |             }
340 |             Console.WriteLine($"Done");
341 |         }
342 | 
343 |         private static void NewChunkWriter(ref StreamWriter writer, string sourceFileName, int chunkIndex, int chunksCount, int numberOfDigits)
344 |         {
345 |             if (writer != null)
346 |             {
347 |                 writer.Close();
348 |                 writer = null;
349 |             }
350 |             string fileName = Path.ChangeExtension(sourceFileName, chunkIndex.ToString("D" + numberOfDigits) + Path.GetExtension(sourceFileName));
351 |             Console.WriteLine($"Writing to file {Path.GetFileName(fileName)}, chunk {chunkIndex} of " + (chunksCount >= 0 ? chunksCount.ToString() : "?"));
352 |             writer = new StreamWriter(fileName, false, encoding);
353 |         }
354 | 
355 |         public static string GetMonoVersion()
356 |         {
357 |             Console.WriteLine("System.Environment.Version = " + System.Environment.Version);
358 |             Type monoRuntimeType = Type.GetType("Mono.Runtime");
359 |             if (monoRuntimeType == null)
360 |             {
361 |                 Console.WriteLine("Not running on Mono");
362 |                 return null;
363 |             }
364 |             //see http://stackoverflow.com/questions/8413922/programmatically-determining-mono-runtime-version
365 |             MethodInfo mi = monoRuntimeType.GetMethod("GetDisplayName", BindingFlags.NonPublic | BindingFlags.Static | BindingFlags.IgnoreCase);
366 |             string result = (string)mi.Invoke(null, null);
367 | 
368 |             //sample strings are:
369 |             //3.2.8 (Debian 3.2.8+dfsg-4ubuntu1.1)
370 |             //
371 |             Console.WriteLine("Got Mono version: '" + result + "'");
372 |             return result;
373 |         }
374 | 
375 |         public static void RunGC()
376 |         {
377 |             Console.WriteLine($"Starting GC");
378 |             DateTime started = DateTime.UtcNow;
379 | 
380 |             if (MonoVersion == null || !MonoVersion.StartsWith("3"))
381 |             {
382 |                 //no LargeObjectHeapCompactionMode in Mono 3
383 |                 SetupLohMode();
384 |             }
385 |             GC.Collect();
386 |             GC.WaitForPendingFinalizers();
387 |             Console.WriteLine($"D");
388 |             if (MonoVersion == null)
389 |             {
390 |                 //WaitForFullGCComplete throws NotImplementedException in Mono 3 and 4
391 |                 GC.WaitForFullGCComplete();
392 |             }
393 |             else
394 |             {
395 |                 Console.WriteLine("Mono detected, skipping GC.WaitForFullGCComplete()");
396 |             }
397 |             GC.Collect();
398 |             Console.WriteLine($"GC collection including LOH done in {DateTime.UtcNow - started}");
399 |         }
400 | 
401 |         //Need separate func othervise it would not work on Mono3:
402 |         //throws System.TypeLoadException: Could not load type 'System.Runtime.GCLargeObjectHeapCompactionMode'
403 |         public static void SetupLohMode()
404 |         {
405 |             GCSettings.LargeObjectHeapCompactionMode = GCLargeObjectHeapCompactionMode.CompactOnce;
406 |         }
407 |     }
408 | }
409 | 


--------------------------------------------------------------------------------
/src/HugeFileProcessor/Properties/AssemblyInfo.cs:
--------------------------------------------------------------------------------
 1 | ﻿using System.Reflection;
 2 | using System.Runtime.CompilerServices;
 3 | using System.Runtime.InteropServices;
 4 | 
 5 | // General Information about an assembly is controlled through the following 
 6 | // set of attributes. Change these attribute values to modify the information
 7 | // associated with an assembly.
 8 | [assembly: AssemblyTitle("HugeFileProcessor")]
 9 | [assembly: AssemblyDescription("")]
10 | [assembly: AssemblyConfiguration("")]
11 | [assembly: AssemblyCompany("")]
12 | [assembly: AssemblyProduct("HugeFileProcessor")]
13 | [assembly: AssemblyCopyright("Copyright ©  2016")]
14 | [assembly: AssemblyTrademark("")]
15 | [assembly: AssemblyCulture("")]
16 | 
17 | // Setting ComVisible to false makes the types in this assembly not visible 
18 | // to COM components.  If you need to access a type in this assembly from 
19 | // COM, set the ComVisible attribute to true on that type.
20 | [assembly: ComVisible(false)]
21 | 
22 | // The following GUID is for the ID of the typelib if this project is exposed to COM
23 | [assembly: Guid("524ee32a-1411-4b60-9e88-4aee3b8f2d33")]
24 | 
25 | // Version information for an assembly consists of the following four values:
26 | //
27 | //      Major Version
28 | //      Minor Version 
29 | //      Build Number
30 | //      Revision
31 | //
32 | // You can specify all the values or you can default the Build and Revision Numbers 
33 | // by using the '*' as shown below:
34 | // [assembly: AssemblyVersion("1.0.*")]
35 | [assembly: AssemblyVersion("1.2.0.0")]
36 | [assembly: AssemblyFileVersion("1.2.0.0")]
37 | 


--------------------------------------------------------------------------------