# f_sigmoid.r
#
# Logistic sigmoid, applied element-wise.
#
# x: numeric scalar, vector or matrix.
# Returns 1 / (1 + exp(-x)), with the same shape as x.
sigmoid <- function( x ) {
	# Return the expression itself: a function whose last statement is an
	# assignment returns its value invisibly, so nothing prints at the console.
	1 / ( 1 + exp( -x ))
}


# f_log_loss.r
#
# Logarithmic loss (binary cross-entropy) for scoring predictions.
#
# actual:    numeric vector of labels (the formula treats them as 0/1).
# predicted: numeric vector of predicted probabilities.
# eps:       predictions are clipped to [eps, 1 - eps] so that log() never
#            sees 0 or 1 and the loss stays finite.
# Returns a single number: the summed per-element loss divided by
# length(actual).
log_loss <- function(actual, predicted, eps=0.00001) {
	predicted <- pmin(pmax(predicted, eps), 1-eps)
	-1/length(actual)*(sum(actual*log(predicted)+(1-actual)*log(1-predicted)))
}


# f_normalize_to_01.r
#
# Linearly rescale p so that its minimum maps to 0 and its maximum to 1.
#
# p: numeric vector.
# Returns (p - min(p)) / (max(p) - min(p)). When every element of p is the
# same value the range is zero; the function then returns all zeros instead
# of the NaN values that 0/0 would produce.
normalize_to_01 <- function( p ) {

	min_p <- min( p )
	range_p <- max( p ) - min_p

	p_norm <- p - min_p

	# Constant input: p_norm is already all zeros, skip the 0/0 division.
	# An NA range (NA in p) falls through and propagates NA as before.
	if ( !is.na( range_p ) && range_p == 0 ) {
		return( p_norm )
	}

	return( p_norm / range_p )

}
"""Prepare a training file from train.csv and train_answers.csv.

Usage: prep_data.py <output_file>

Reads data/orig/train.csv and data/orig/train_answers.csv and writes a CSV
whose rows are: gender, language flag, then every remaining column of the
training file from the fourth one onward.  Runs on Python 2.7 and Python 3.
"""

import csv
import sys

INPUT_FILE = 'data/orig/train.csv'
ANSWERS_FILE = 'data/orig/train_answers.csv'
USAGE = 'Usage: prep_data.py <output_file>'


def _open_csv(path, mode):
    """Open *path* the way the csv module expects on the running Python.

    Python 2's csv module wants binary-mode files; Python 3's wants text
    mode with newline='' so the module controls line endings itself.
    """
    if sys.version_info[0] < 3:
        return open(path, mode + 'b')
    return open(path, mode, newline='')


def load_genders(answers_file):
    """Return a dict mapping writer id -> gender read from *answers_file*.

    The first row is treated as a header and skipped.  Every other row must
    have exactly two fields (writer id, gender); anything else raises
    ValueError from the unpacking.
    """
    with _open_csv(answers_file, 'r') as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row
        return dict((writer_id, gender) for writer_id, gender in reader)


def prep_data(input_file, answers_file, output_file):
    """Write the prepared training file.

    Args:
        input_file: training CSV with a header row; column 0 is the writer
            id and column 2 is compared against the string 'Arabic'.
        answers_file: CSV with a header row mapping writer id to gender.
        output_file: path of the CSV to create (overwritten if it exists).

    The output header is 'gender' followed by the input header from its
    third column onward.  Each output row is the writer's gender, then 0 if
    the third input column equals 'Arabic' and 1 otherwise, then the input
    columns from the fourth one onward.

    Raises:
        KeyError: a writer id in *input_file* is missing from *answers_file*.
    """
    genders = load_genders(answers_file)

    # Both handles are closed (and the output flushed) even if a row fails.
    with _open_csv(input_file, 'r') as src, _open_csv(output_file, 'w') as dst:
        reader = csv.reader(src)
        writer = csv.writer(dst)

        headers = next(reader)
        writer.writerow(['gender'] + headers[2:])

        for row in reader:
            language = 0 if row[2] == 'Arabic' else 1
            writer.writerow([genders[row[0]], language] + row[3:])


def main(argv=None):
    """Command-line entry point; *argv* defaults to sys.argv."""
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.exit(USAGE)
    prep_data(INPUT_FILE, ANSWERS_FILE, argv[1])


if __name__ == '__main__':
    main()
"""Split a file into a given number of chunks randomly, by authors.

Usage: chunk_by_authors.py <orig_train_file> <input_file> <num_chunks> [seed]

Both input files have a header row; the output files have none.  Output
files are named <input basename>_<n><input extension> and are created in
the current directory.  Runs on Python 2.7 and Python 3.
"""

import csv
import os
import random
import sys

USAGE = ('Usage: chunk_by_authors.py '
         '<orig_train_file> <input_file> <num_chunks> [seed]')


def _open_csv_for_reading(path):
    """Open *path* the way the csv module expects on the running Python."""
    if sys.version_info[0] < 3:
        return open(path, 'rb')
    return open(path, 'r', newline='')


def chunk_by_authors(orig_train_file, input_file, num_chunks, seed=None,
                     output_dir='.'):
    """Distribute the lines of *input_file* over *num_chunks* files.

    The two input files are read in lockstep after skipping one header line
    in each: line k of *input_file* belongs to the writer found in the first
    column of row k of *orig_train_file*.  Whenever that writer id differs
    from the previous row's, a chunk is drawn with random.randint; all
    consecutive lines of one writer therefore land in the same chunk.

    Args:
        orig_train_file: CSV whose first column holds the writer id.
        input_file: file to split; its lines are copied byte-for-byte.
        num_chunks: number of output files, at least 1.
        seed: if truthy, passed to random.seed() before any draw.
        output_dir: directory for the chunk files (default: current one).

    Returns:
        The list of output paths, indexed by chunk number.

    Raises:
        ValueError: *num_chunks* is below 1, or *orig_train_file* runs out
            of rows before *input_file* does.
    """
    if num_chunks < 1:
        raise ValueError('num_chunks must be at least 1, got %s' % num_chunks)

    if seed:
        random.seed(seed)

    basename, ext = os.path.splitext(os.path.basename(input_file))
    output_paths = [
        os.path.join(output_dir, '%s_%s%s' % (basename, n, ext))
        for n in range(num_chunks)
    ]

    # Kept in a list named `outputs`: the original stored the handles in a
    # dict called `os`, which shadowed the os module for the rest of the run.
    outputs = []
    counter = 0
    try:
        for path in output_paths:
            outputs.append(open(path, 'wb'))

        # The input is read in binary so lines pass through untouched and
        # can be written to the binary outputs on both Python 2 and 3.
        with _open_csv_for_reading(orig_train_file) as orig_f, \
                open(input_file, 'rb') as in_f:
            orig_reader = csv.reader(orig_f)
            next(orig_reader, None)  # skip header
            next(in_f, None)         # skip header

            current_writer = None
            target = None

            for line in in_f:
                try:
                    orig_row = next(orig_reader)
                except StopIteration:
                    raise ValueError(
                        '%s has fewer data rows than %s'
                        % (orig_train_file, input_file))

                writer = orig_row[0]
                if writer != current_writer:
                    current_writer = writer
                    target = outputs[random.randint(0, num_chunks - 1)]

                target.write(line)

                counter += 1
                if counter % 100000 == 0:
                    print(counter)
    finally:
        for handle in outputs:
            handle.close()

    return output_paths


def main(argv=None):
    """Command-line entry point; *argv* defaults to sys.argv."""
    argv = sys.argv if argv is None else argv
    if len(argv) < 4:
        sys.exit(USAGE)
    seed = argv[4] if len(argv) > 4 else None
    chunk_by_authors(argv[1], argv[2], int(argv[3]), seed)


if __name__ == '__main__':
    main()
"""Like split, but take writers from the original file into account.

Usage: split_by_authors.py <orig_train_file> <input_file>
                           <output_file1> <output_file2> [P] [seed]

Input files have a header row; output files are written without headers.
P defaults to 0.9.  Runs on Python 2.7 and Python 3.
"""

import csv
import random
import sys

USAGE = ('Usage: split_by_authors.py <orig_train_file> <input_file> '
         '<output_file1> <output_file2> [P] [seed]')


def _open_csv(path, mode):
    """Open *path* the way the csv module expects on the running Python.

    Python 2's csv module wants binary-mode files; Python 3's wants text
    mode with newline='' so the module controls line endings itself.
    """
    if sys.version_info[0] < 3:
        return open(path, mode + 'b')
    return open(path, mode, newline='')


def split_by_authors(orig_train_file, input_file, output_file1, output_file2,
                     p=0.9, seed=None):
    """Split the rows of *input_file* between two files, writer by writer.

    The two input files are read in lockstep after skipping one header row
    in each: row k of *input_file* belongs to the writer found in the first
    column of row k of *orig_train_file*.  Whenever that writer id differs
    from the previous row's, random.random() is drawn; a draw greater than
    *p* sends the rows to *output_file2*, otherwise to *output_file1*.  All
    consecutive rows of one writer therefore go to the same file.

    Args:
        orig_train_file: CSV whose first column holds the writer id.
        input_file: CSV to split.
        output_file1: receives rows whose draw was <= p.
        output_file2: receives rows whose draw was > p.
        p: threshold compared against random.random().
        seed: if truthy, passed to random.seed() before any draw.

    Returns:
        A (rows_in_file1, rows_in_file2) tuple.

    Raises:
        ValueError: *orig_train_file* runs out of rows before *input_file*.
    """
    if seed:
        random.seed(seed)

    counts = [0, 0]
    counter = 0

    # `with` closes all four files (flushing the outputs) even on error.
    with _open_csv(orig_train_file, 'r') as orig_f, \
            _open_csv(input_file, 'r') as in_f, \
            _open_csv(output_file1, 'w') as out1, \
            _open_csv(output_file2, 'w') as out2:

        orig_reader = csv.reader(orig_f)
        reader = csv.reader(in_f)
        writers = (csv.writer(out1), csv.writer(out2))

        next(reader, None)       # skip header
        next(orig_reader, None)  # skip header

        current_writer = None
        target = 0

        for row in reader:
            try:
                orig_row = next(orig_reader)
            except StopIteration:
                raise ValueError(
                    '%s has fewer data rows than %s'
                    % (orig_train_file, input_file))

            writer = orig_row[0]
            if writer != current_writer:
                current_writer = writer
                target = 1 if random.random() > p else 0

            writers[target].writerow(row)
            counts[target] += 1

            counter += 1
            if counter % 100000 == 0:
                print(counter)

    return tuple(counts)


def main(argv=None):
    """Command-line entry point; *argv* defaults to sys.argv."""
    argv = sys.argv if argv is None else argv
    if len(argv) < 5:
        sys.exit(USAGE)

    p = float(argv[5]) if len(argv) > 5 else 0.9
    seed = argv[6] if len(argv) > 6 else None

    print("P = %s" % (p))

    split_by_authors(argv[1], argv[2], argv[3], argv[4], p, seed)


if __name__ == '__main__':
    main()
Visual Studio 34 | ################# 35 | 36 | ## Ignore Visual Studio temporary files, build results, and 37 | ## files generated by popular Visual Studio add-ons. 38 | 39 | # User-specific files 40 | *.suo 41 | *.user 42 | *.sln.docstates 43 | 44 | # Build results 45 | [Dd]ebug/ 46 | [Rr]elease/ 47 | *_i.c 48 | *_p.c 49 | *.ilk 50 | *.meta 51 | *.obj 52 | *.pch 53 | *.pdb 54 | *.pgc 55 | *.pgd 56 | *.rsp 57 | *.sbr 58 | *.tlb 59 | *.tli 60 | *.tlh 61 | *.tmp 62 | *.vspscc 63 | .builds 64 | *.dotCover 65 | 66 | ## TODO: If you have NuGet Package Restore enabled, uncomment this 67 | #packages/ 68 | 69 | # Visual C++ cache files 70 | ipch/ 71 | *.aps 72 | *.ncb 73 | *.opensdf 74 | *.sdf 75 | 76 | # Visual Studio profiler 77 | *.psess 78 | *.vsp 79 | 80 | # ReSharper is a .NET coding add-in 81 | _ReSharper* 82 | 83 | # Installshield output folder 84 | [Ee]xpress 85 | 86 | # DocProject is a documentation generator add-in 87 | DocProject/buildhelp/ 88 | DocProject/Help/*.HxT 89 | DocProject/Help/*.HxC 90 | DocProject/Help/*.hhc 91 | DocProject/Help/*.hhk 92 | DocProject/Help/*.hhp 93 | DocProject/Help/Html2 94 | DocProject/Help/html 95 | 96 | # Click-Once directory 97 | publish 98 | 99 | # Others 100 | [Bb]in 101 | [Oo]bj 102 | sql 103 | TestResults 104 | *.Cache 105 | ClientBin 106 | stylecop.* 107 | ~$* 108 | *.dbmdl 109 | Generated_Code #added for RIA/Silverlight projects 110 | 111 | # Backup & report files from converting an old project file to a newer 112 | # Visual Studio version. 
Backup files are not needed, because we have git ;-) 113 | _UpgradeReport_Files/ 114 | Backup*/ 115 | UpgradeLog*.XML 116 | 117 | 118 | 119 | ############ 120 | ## Windows 121 | ############ 122 | 123 | # Windows image file caches 124 | Thumbs.db 125 | 126 | # Folder config file 127 | Desktop.ini 128 | 129 | 130 | ############# 131 | ## Python 132 | ############# 133 | 134 | *.py[co] 135 | 136 | # Packages 137 | *.egg 138 | *.egg-info 139 | dist 140 | build 141 | eggs 142 | parts 143 | bin 144 | var 145 | sdist 146 | develop-eggs 147 | .installed.cfg 148 | 149 | # Installer logs 150 | pip-log.txt 151 | 152 | # Unit test / coverage reports 153 | .coverage 154 | .tox 155 | 156 | #Translations 157 | *.mo 158 | 159 | #Mr Developer 160 | .mr.developer.cfg 161 | 162 | # Mac crap 163 | .DS_Store 164 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The Artistic License 2.0 2 | 3 | Copyright (c) 2013 Zygmunt Zając 4 | 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | This license establishes the terms under which a given free software 11 | Package may be copied, modified, distributed, and/or redistributed. 12 | The intent is that the Copyright Holder maintains some artistic 13 | control over the development of that Package while still keeping the 14 | Package available as open source and free software. 15 | 16 | You are always permitted to make arrangements wholly outside of this 17 | license directly with the Copyright Holder of a given Package. If the 18 | terms of this license do not permit the full use that you propose to 19 | make of the Package, you should contact the Copyright Holder and seek 20 | a different licensing arrangement. 
21 | 22 | Definitions 23 | 24 | "Copyright Holder" means the individual(s) or organization(s) 25 | named in the copyright notice for the entire Package. 26 | 27 | "Contributor" means any party that has contributed code or other 28 | material to the Package, in accordance with the Copyright Holder's 29 | procedures. 30 | 31 | "You" and "your" means any person who would like to copy, 32 | distribute, or modify the Package. 33 | 34 | "Package" means the collection of files distributed by the 35 | Copyright Holder, and derivatives of that collection and/or of 36 | those files. A given Package may consist of either the Standard 37 | Version, or a Modified Version. 38 | 39 | "Distribute" means providing a copy of the Package or making it 40 | accessible to anyone else, or in the case of a company or 41 | organization, to others outside of your company or organization. 42 | 43 | "Distributor Fee" means any fee that you charge for Distributing 44 | this Package or providing support for this Package to another 45 | party. It does not mean licensing fees. 46 | 47 | "Standard Version" refers to the Package if it has not been 48 | modified, or has been modified only in ways explicitly requested 49 | by the Copyright Holder. 50 | 51 | "Modified Version" means the Package, if it has been changed, and 52 | such changes were not explicitly requested by the Copyright 53 | Holder. 54 | 55 | "Original License" means this Artistic License as Distributed with 56 | the Standard Version of the Package, in its current version or as 57 | it may be modified by The Perl Foundation in the future. 58 | 59 | "Source" form means the source code, documentation source, and 60 | configuration files for the Package. 61 | 62 | "Compiled" form means the compiled bytecode, object code, binary, 63 | or any other form resulting from mechanical transformation or 64 | translation of the Source form. 
65 | 66 | 67 | Permission for Use and Modification Without Distribution 68 | 69 | (1) You are permitted to use the Standard Version and create and use 70 | Modified Versions for any purpose without restriction, provided that 71 | you do not Distribute the Modified Version. 72 | 73 | 74 | Permissions for Redistribution of the Standard Version 75 | 76 | (2) You may Distribute verbatim copies of the Source form of the 77 | Standard Version of this Package in any medium without restriction, 78 | either gratis or for a Distributor Fee, provided that you duplicate 79 | all of the original copyright notices and associated disclaimers. At 80 | your discretion, such verbatim copies may or may not include a 81 | Compiled form of the Package. 82 | 83 | (3) You may apply any bug fixes, portability changes, and other 84 | modifications made available from the Copyright Holder. The resulting 85 | Package will still be considered the Standard Version, and as such 86 | will be subject to the Original License. 87 | 88 | 89 | Distribution of Modified Versions of the Package as Source 90 | 91 | (4) You may Distribute your Modified Version as Source (either gratis 92 | or for a Distributor Fee, and with or without a Compiled form of the 93 | Modified Version) provided that you clearly document how it differs 94 | from the Standard Version, including, but not limited to, documenting 95 | any non-standard features, executables, or modules, and provided that 96 | you do at least ONE of the following: 97 | 98 | (a) make the Modified Version available to the Copyright Holder 99 | of the Standard Version, under the Original License, so that the 100 | Copyright Holder may include your modifications in the Standard 101 | Version. 102 | 103 | (b) ensure that installation of your Modified Version does not 104 | prevent the user installing or running the Standard Version. In 105 | addition, the Modified Version must bear a name that is different 106 | from the name of the Standard Version. 
107 | 108 | (c) allow anyone who receives a copy of the Modified Version to 109 | make the Source form of the Modified Version available to others 110 | under 111 | 112 | (i) the Original License or 113 | 114 | (ii) a license that permits the licensee to freely copy, 115 | modify and redistribute the Modified Version using the same 116 | licensing terms that apply to the copy that the licensee 117 | received, and requires that the Source form of the Modified 118 | Version, and of any works derived from it, be made freely 119 | available in that license fees are prohibited but Distributor 120 | Fees are allowed. 121 | 122 | 123 | Distribution of Compiled Forms of the Standard Version 124 | or Modified Versions without the Source 125 | 126 | (5) You may Distribute Compiled forms of the Standard Version without 127 | the Source, provided that you include complete instructions on how to 128 | get the Source of the Standard Version. Such instructions must be 129 | valid at the time of your distribution. If these instructions, at any 130 | time while you are carrying out such distribution, become invalid, you 131 | must provide new instructions on demand or cease further distribution. 132 | If you provide valid instructions or cease distribution within thirty 133 | days after you become aware that the instructions are invalid, then 134 | you do not forfeit any of your rights under this license. 135 | 136 | (6) You may Distribute a Modified Version in Compiled form without 137 | the Source, provided that you comply with Section 4 with respect to 138 | the Source of the Modified Version. 139 | 140 | 141 | Aggregating or Linking the Package 142 | 143 | (7) You may aggregate the Package (either the Standard Version or 144 | Modified Version) with other packages and Distribute the resulting 145 | aggregation provided that you do not charge a licensing fee for the 146 | Package. 
Distributor Fees are permitted, and licensing fees for other 147 | components in the aggregation are permitted. The terms of this license 148 | apply to the use and Distribution of the Standard or Modified Versions 149 | as included in the aggregation. 150 | 151 | (8) You are permitted to link Modified and Standard Versions with 152 | other works, to embed the Package in a larger work of your own, or to 153 | build stand-alone binary or bytecode versions of applications that 154 | include the Package, and Distribute the result without restriction, 155 | provided the result does not expose a direct interface to the Package. 156 | 157 | 158 | Items That are Not Considered Part of a Modified Version 159 | 160 | (9) Works (including, but not limited to, modules and scripts) that 161 | merely extend or make use of the Package, do not, by themselves, cause 162 | the Package to be a Modified Version. In addition, such works are not 163 | considered parts of the Package itself, and are not subject to the 164 | terms of this license. 165 | 166 | 167 | General Provisions 168 | 169 | (10) Any use, modification, and distribution of the Standard or 170 | Modified Versions is governed by this Artistic License. By using, 171 | modifying or distributing the Package, you accept this license. Do not 172 | use, modify, or distribute the Package, if you do not accept this 173 | license. 174 | 175 | (11) If your Modified Version has been derived from a Modified 176 | Version made by someone other than you, you are nevertheless required 177 | to ensure that your Modified Version complies with the requirements of 178 | this license. 179 | 180 | (12) This license does not grant you the right to use any trademark, 181 | service mark, tradename, or logo of the Copyright Holder. 
182 | 183 | (13) This license includes the non-exclusive, worldwide, 184 | free-of-charge patent license to make, have made, use, offer to sell, 185 | sell, import and otherwise transfer the Package with respect to any 186 | patent claims licensable by the Copyright Holder that are necessarily 187 | infringed by the Package. If you institute patent litigation 188 | (including a cross-claim or counterclaim) against any party alleging 189 | that the Package constitutes direct or contributory patent 190 | infringement, then this Artistic License to you shall terminate on the 191 | date that such litigation is filed. 192 | 193 | (14) Disclaimer of Warranty: 194 | THE PACKAGE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS "AS 195 | IS' AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES. THE IMPLIED 196 | WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR 197 | NON-INFRINGEMENT ARE DISCLAIMED TO THE EXTENT PERMITTED BY YOUR LOCAL 198 | LAW. UNLESS REQUIRED BY LAW, NO COPYRIGHT HOLDER OR CONTRIBUTOR WILL 199 | BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL 200 | DAMAGES ARISING IN ANY WAY OUT OF THE USE OF THE PACKAGE, EVEN IF 201 | ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 202 | --------------------------------------------------------------------------------