├── LICENSE ├── Name.php ├── Parser.php ├── README.md ├── Tests ├── NameTest.php └── testNames.txt ├── composer.json ├── index.php ├── init.php └── nbproject ├── private ├── private.properties └── private.xml ├── project.properties └── project.xml /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2010 Jason Priem 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /Name.php: -------------------------------------------------------------------------------- 1 | setStr($str); 13 | } 14 | 15 | /** 16 | * Checks encoding, normalizes whitespace/punctuation, and sets the name string. 17 | * 18 | * @param String $str a utf8-encoding string. 
19 | * @return Bool True on success 20 | */ 21 | public function setStr($str) 22 | { 23 | if (!mb_check_encoding($str)){ 24 | throw new Exception("Name is not encoded in UTF-8"); 25 | } 26 | $this->str = $str; 27 | $this->norm(); 28 | return true; 29 | } 30 | 31 | public function getStr() 32 | { 33 | return $this->str; 34 | } 35 | 36 | 37 | /** 38 | * Uses a regex to chop off and return part of the namestring 39 | * There are two parts: first, it returns the matched substring, 40 | * and then it removes that substring from $this->str and normalizes. 41 | * 42 | * @param string $regex matches the part of the namestring to chop off 43 | * @param integer $submatchIndex which of the parenthesized submatches to use 44 | * @param string $regexFlags optional regex flags 45 | * @return string the part of the namestring that got chopped off 46 | */ 47 | public function chopWithRegex($regex, $submatchIndex = 0, $regexFlags = '') 48 | { 49 | $regex = $regex . "ui" . $regexFlags; // unicode + case-insensitive 50 | preg_match($regex, $this->str, $m); 51 | $subset = (isset($m[$submatchIndex])) ? $m[$submatchIndex] : ''; 52 | 53 | if ($subset){ 54 | $this->str = preg_replace($regex, ' ', $this->str, -1, $numReplacements); 55 | if ($numReplacements > 1){ 56 | throw new Exception("The regex being used to find the name has multiple matches."); 57 | } 58 | $this->norm(); 59 | return $subset; 60 | } 61 | else { 62 | return ''; 63 | } 64 | } 65 | 66 | /* 67 | * Flips the front and back parts of a name with one another. 68 | * Front and back are determined by a specified character somewhere in the 69 | * middle of the string. 70 | * 71 | * @param String $flipAroundChar the character(s) demarcating the two halves you want to flip. 72 | * @return Bool True on success. 73 | */ 74 | public function flip($flipAroundChar) 75 | { 76 | $substrings = preg_split("/$flipAroundChar/u", $this->str); 77 | if (count($substrings) == 2){ 78 | $this->str = $substrings[1] . " " . 
$substrings[0]; 79 | $this->norm(); 80 | } 81 | else if (count($substrings) > 2) { 82 | throw new Exception("Can't flip around multiple '$flipAroundChar' characters in namestring."); 83 | } 84 | return true; // if there's 1 or 0 $flipAroundChar found 85 | } 86 | 87 | /** 88 | * Removes extra whitespace and punctuation from $this->str 89 | * Strips whitespace chars from ends, strips redundant whitespace, converts whitespace chars to " ". 90 | * 91 | * @return Bool True on success 92 | */ 93 | private function norm() 94 | { 95 | $this->str = preg_replace( "#^\s*#u", "", $this->str ); 96 | $this->str = preg_replace( "#\s*$#u", "", $this->str ); 97 | if (substr_count($this->str, "\xc2\xa0") == 0) $this->str = preg_replace( "#\s+#u", " ", $this->str ); 98 | $this->str = preg_replace( "#,$#u", " ", $this->str ); 99 | return true; 100 | } 101 | } 102 | ?> 103 | -------------------------------------------------------------------------------- /Parser.php: -------------------------------------------------------------------------------- 1 | getLast() . ", " . $parser->getFirst(); 8 | * //returns "Smith, John" 9 | * 10 | * 11 | */ 12 | class HumanNameParser_Parser { 13 | private $name; 14 | private $leadingInit; 15 | private $first; 16 | private $nicknames; 17 | private $middle; 18 | private $last; 19 | private $suffix; 20 | 21 | private $suffixes; 22 | private $prefixes; 23 | 24 | /* 25 | * Constructor 26 | * 27 | * @param mixed $name Either a name as a string or as a Name object. 28 | */ 29 | public function __construct($name = NULL) 30 | { 31 | $this->setName($name); 32 | } 33 | 34 | /** 35 | * Sets name string and parses it. 36 | * Takes Name object or a simple string (converts the string into a Name obj), 37 | * parses and loads its constituant parts. 38 | * 39 | * @param mixed $name Either a name as a string or as a Name object. 
40 | */ 41 | public function setName($name = NULL){ 42 | if ($name) { 43 | 44 | if (is_object($name) && get_class($name) == "HumanNameParser_Name") { // this is mostly for testing 45 | $this->name = $name; 46 | } 47 | else { 48 | $this->name = new HumanNameParser_Name($name); 49 | } 50 | 51 | $this->leadingInit = ""; 52 | $this->first = ""; 53 | $this->nicknames = ""; 54 | $this->middle = ""; 55 | $this->last = ""; 56 | $this->suffix = ""; 57 | 58 | $this->suffixes = array('esq','esquire','jr','sr','2','ii','iii','iv', 'v', 'phd'); 59 | $this->prefixes = array('bar','ben','bin','da','dal','de la', 'de', 'del','der','di', 60 | 'ibn','la','le','san','st','ste','van', 'van der', 'van den', 'vel','von'); 61 | 62 | $this->parse(); 63 | } 64 | } 65 | 66 | public function getleadingInit() { 67 | return $this->leadingInit; 68 | } 69 | public function getFirst() { 70 | return $this->first; 71 | } 72 | public function getNicknames() { 73 | return $this->nicknames; 74 | } 75 | 76 | public function getMiddle() { 77 | return $this->middle; 78 | } 79 | 80 | public function getLast() { 81 | return $this->last; 82 | } 83 | 84 | public function getSuffix() { 85 | return $this->suffix; 86 | } 87 | public function getName(){ 88 | return $this->name; 89 | } 90 | 91 | /** 92 | * returns all the parts of the name as an array 93 | * 94 | * @param String $arrType pass 'int' to get an integer-indexed array (default is associative) 95 | * @return array An array of the name-parts 96 | */ 97 | public function getArray($arrType = 'assoc') { 98 | $arr = array(); 99 | $arr['leadingInit'] = $this->leadingInit; 100 | $arr['first'] = $this->first; 101 | $arr['nicknames'] = $this->nicknames; 102 | $arr['middle'] = $this->middle; 103 | $arr['last'] = $this->last; 104 | $arr['suffix'] = $this->suffix; 105 | if ($arrType == 'assoc') { 106 | return $arr; 107 | } 108 | else if ($arrType == 'int'){ 109 | return array_values($arr); 110 | } 111 | else { 112 | throw new Exception("Array must be associative 
('assoc') or numeric ('num')."); 113 | } 114 | } 115 | 116 | /* 117 | * Parse the name into its constituent parts. 118 | * 119 | * Sequentially captures each name-part, working in from the ends and 120 | * trimming the namestring as it goes. 121 | * 122 | * @return boolean true on success 123 | */ 124 | private function parse() 125 | { 126 | $suffixes = implode("\.*|", $this->suffixes) . "\.*"; // each suffix gets a "\.*" behind it. 127 | $prefixes = implode(" |", $this->prefixes) . " "; // each prefix gets a " " behind it. 128 | 129 | // The regex use is a bit tricky. *Everything* matched by the regex will be replaced, 130 | // but you can select a particular parenthesized submatch to be returned. 131 | // Also, note that each regex requres that the preceding ones have been run, and matches chopped out. 132 | $nicknamesRegex = "/ ('|\"|\(\"*'*)(.+?)('|\"|\"*'*\)) /"; // names that starts or end w/ an apostrophe break this 133 | $suffixRegex = "/,* *($suffixes)$/"; 134 | $lastRegex = "/(?!^)\b([^ ]+ y |$prefixes)*[^ ]+$/"; 135 | $leadingInitRegex = "/^(.\.*)(?= \p{L}{2})/"; // note the lookahead, which isn't returned or replaced 136 | $firstRegex = "/^[^ ]+/"; // 137 | 138 | // get nickname, if there is one 139 | $this->nicknames = $this->name->chopWithRegex($nicknamesRegex, 2); 140 | 141 | // get suffix, if there is one 142 | $this->suffix = $this->name->chopWithRegex($suffixRegex, 1); 143 | 144 | // get the first initial, if there is one 145 | $this->leadingInit = $this->name->chopWithRegex($leadingInitRegex, 1); 146 | 147 | // flip the before-comma and after-comma parts of the name 148 | $this->name->flip(","); 149 | 150 | // get the last name 151 | $this->last = $this->name->chopWithRegex($lastRegex, 0); 152 | if (!$this->last){ 153 | throw new Exception("Couldn't find a last name in '{$this->name->getStr()}'."); 154 | } 155 | 156 | // get the first name 157 | $this->first = $this->name->chopWithRegex($firstRegex, 0); 158 | if (!$this->first){ 159 | throw new 
Exception("Couldn't find a first name in '{$this->name->getStr()}'"); 160 | } 161 | 162 | // if anything's left, that's the middle name 163 | $this->middle = $this->name->getStr(); 164 | return true; 165 | } 166 | 167 | 168 | 169 | 170 | 171 | } 172 | ?> 173 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Name: HumanNameParse.php 2 | 3 | Version: 0.2 4 | 5 | Date: 6 Sept. 2010 6 | 7 | Author: Jason Priem 8 | 9 | Website: 10 | 11 | License: 12 | 13 | 14 | # Description 15 | Takes human names of arbitrary complexity and various wacky formats like: 16 | 17 | * J. Walter Weatherman 18 | * de la Cruz, Ana M. 19 | * James C. ('Jimmy') O'Dell, Jr. 20 | 21 | and parses out the: 22 | 23 | * leading initial (Like "J." in "J. Walter Weatherman") 24 | * first name (or first initial in a name like 'R. Crumb') 25 | * nicknames (like "Jimmy" in "James C. ('Jimmy') O'Dell, Jr.") 26 | * middle names 27 | * last name (including compound ones like "van der Sar' and "Ortega y Gasset"), and 28 | * suffix (like 'Jr.', 'III') 29 | 30 | -------------------------------------------------------------------------------- /Tests/NameTest.php: -------------------------------------------------------------------------------- 1 | object = new Name("Björn O'Malley"); 11 | } 12 | 13 | public function testSetStrRemovesWhitespaceAtEnds() { 14 | $this->object->setStr(" Björn O'Malley \r\n"); 15 | $this->assertEquals( 16 | "Björn O'Malley", 17 | $this->object->getStr() 18 | ); 19 | } 20 | public function testSetStrRemovesRedudentantWhitespace(){ 21 | $this->object->setStr(" Björn O'Malley"); //tab between names 22 | $this->assertEquals( 23 | "Björn O'Malley", 24 | $this->object->getStr() 25 | ); 26 | } 27 | 28 | public function testChopWithRegexReturnsChoppedSubstring(){ 29 | $this->object->setStr("Björn O'Malley"); 30 | $this->assertEquals( 31 | 'Björn', 32 | 
$this->object->chopWithRegex('/^([^ ]+)(.+)/', 1) 33 | ); 34 | } 35 | 36 | public function testChopWithRegexChopsStartOffNameStr(){ 37 | $this->object->setStr("Björn O'Malley"); 38 | $this->object->chopWithRegex('/^[^ ]+/', 0); 39 | $this->assertEquals( 40 | "O'Malley", 41 | $this->object->getStr() 42 | ); 43 | } 44 | public function testChopWithRegexChopsEndOffNameStr(){ 45 | $this->object->setStr("Björn O'Malley"); 46 | $this->object->chopWithRegex('/ (.+)$/', 1); 47 | $this->assertEquals( 48 | 'Björn', 49 | $this->object->getStr() 50 | ); 51 | } 52 | public function testChopWithRegexChopsMiddleFromNameStr(){ 53 | $this->object->setStr("Björn 'Bill' O'Malley"); 54 | $this->object->chopWithRegex("/\ '[^']+' /", 0); 55 | $this->assertEquals( 56 | "Björn O'Malley", 57 | $this->object->getStr() 58 | ); 59 | } 60 | 61 | public function testFlip() { 62 | $this->object->setStr("O'Malley, Björn"); 63 | $this->object->flip(","); 64 | $this->assertEquals( 65 | "Björn O'Malley", 66 | $this->object->getStr() 67 | ); 68 | } 69 | 70 | 71 | 72 | 73 | } 74 | ?> 75 | -------------------------------------------------------------------------------- /Tests/testNames.txt: -------------------------------------------------------------------------------- 1 | Björn O'Malley||Björn|||O'Malley| 2 | Bin Lin||Bin|||Lin| 3 | Linda Jones||Linda|||Jones| 4 | Jason H. Priem||Jason||H.|Priem| 5 | Björn O'Malley-Muñoz||Björn|||O'Malley-Muñoz| 6 | Björn C. O'Malley||Björn||C.|O'Malley| 7 | Björn "Bill" O'Malley||Björn|Bill||O'Malley| 8 | Björn ("Bill") O'Malley||Björn|Bill||O'Malley| 9 | Björn ("Wild Bill") O'Malley||Björn|Wild Bill||O'Malley| 10 | Björn (Bill) O'Malley||Björn|Bill||O'Malley| 11 | Björn 'Bill' O'Malley||Björn|Bill||O'Malley| 12 | Björn C O'Malley||Björn||C|O'Malley| 13 | Björn C. R. O'Malley||Björn||C. R.|O'Malley| 14 | Björn Charles O'Malley||Björn||Charles|O'Malley| 15 | Björn Charles R. 
O'Malley||Björn||Charles R.|O'Malley| 16 | Björn van O'Malley||Björn|||van O'Malley| 17 | Björn Charles van der O'Malley||Björn||Charles|van der O'Malley| 18 | Björn Charles O'Malley y Muñoz||Björn||Charles|O'Malley y Muñoz| 19 | Björn O'Malley, Jr.||Björn|||O'Malley|Jr. 20 | Björn O'Malley Jr||Björn|||O'Malley|Jr 21 | B O'Malley||B|||O'Malley| 22 | William Carlos Williams||William||Carlos|Williams| 23 | C. Björn Roger O'Malley|C.|Björn||Roger|O'Malley| 24 | B. C. O'Malley||B.||C.|O'Malley| 25 | B C O'Malley||B||C|O'Malley| 26 | B.J. Thomas||B.J.|||Thomas| 27 | O'Malley, Björn||Björn|||O'Malley| 28 | O'Malley, Björn Jr||Björn|||O'Malley|Jr 29 | O'Malley, C. Björn|C.|Björn|||O'Malley| 30 | O'Malley, C. Björn III|C.|Björn|||O'Malley|III 31 | O'Malley y Muñoz, C. Björn Roger III|C.|Björn||Roger|O'Malley y Muñoz|III -------------------------------------------------------------------------------- /composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "jasonpriem/human-name-parser", 3 | "description": "Takes human names of arbitrary complexity and various wacky formats and parses them out.", 4 | "autoload": { 5 | "classmap": ["Name.php", "Parser.php"] 6 | } 7 | } -------------------------------------------------------------------------------- /index.php: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | HumanNameParser demo 6 | 15 | 16 | 17 |

HumanNameParser test page

18 |
19 |

20 | This page uses the test names included in testNames.txt. See the 21 | README.md file included for more details. Names 22 | are listed as: 23 |

24 |
    25 |
  1. Leading initial
  2. 26 |
  3. First name
  4. 27 |
  5. Nicknames
  6. 28 |
  7. Middle names
  8. 29 |
  9. Last names (surnames)
  10. 30 |
  11. Suffixes (like "Jr.")
  12. 31 |
32 |
33 | 34 | 35 | $expected = $actual"; 49 | } 50 | else { 51 | $ret = "$expected ≠ $actual"; 52 | } 53 | return $ret; 54 | } 55 | 56 | $handle = fopen('./Tests/testNames.txt', 'r'); 57 | while ($nameArr = fgetcsv($handle, 1000, "|")){ 58 | $parser = new HumanNameParser_Parser($nameArr[0]); 59 | 60 | // check to see if the parser got each name-part correct 61 | $ret = testEqual($nameArr[1], $parser->getleadingInit() ); 62 | $ret .= testEqual($nameArr[2], $parser->getFirst() ); 63 | $ret .= testEqual($nameArr[3], $parser->getNicknames() ); 64 | $ret .= testEqual($nameArr[4], $parser->getMiddle() ); 65 | $ret .= testEqual($nameArr[5], $parser->getLast() ); 66 | $ret .= testEqual($nameArr[6], $parser->getSuffix() ); 67 | $divClass = (strpos($ret, "class='fail'")) ? "fail" : "win"; // a hacky way to do this 68 | echo "

{$nameArr[0]}

$ret
"; 69 | } 70 | ?> 71 | 72 | 73 | -------------------------------------------------------------------------------- /init.php: -------------------------------------------------------------------------------- 1 | 5 | -------------------------------------------------------------------------------- /nbproject/private/private.properties: -------------------------------------------------------------------------------- 1 | copy.src.files=false 2 | copy.src.target= 3 | index.file=index.php 4 | run.as=LOCAL 5 | url=http://localhost/HumanNameParser/ 6 | -------------------------------------------------------------------------------- /nbproject/private/private.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /nbproject/project.properties: -------------------------------------------------------------------------------- 1 | include.path=${php.global.include.path} 2 | php.version=PHP_5 3 | source.encoding=UTF-8 4 | src.dir=. 5 | tags.asp=false 6 | tags.short=true 7 | test.src.dir=Tests 8 | web.root=. 9 | -------------------------------------------------------------------------------- /nbproject/project.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | org.netbeans.modules.php.project 4 | 5 | 6 | HumanNameParser 7 | 8 | 9 | 10 | --------------------------------------------------------------------------------