.*)\<\/p\>(?P<value>.*)\<\/li>/Us", $v, $v_data);
                    // NOTE(review): this excerpt opens in the middle of getSynonym(), inside
                    // a regex literal; the rest of that method is kept exactly as found.
                    foreach ($v_data['title'] as $m => $d) {
                        $data = strip_tags(preg_replace("<</a>>"," ", $v_data["value"][$m]));
                        $result[$key][$value["adj"][$k]][$d] = $data;
                    }
                }
            }
        }
        return $result;
    }

    /**
     * Fetch the Collins Advanced English-Chinese dictionary entry (disabled).
     * return string
     */

    // private function getCollins(){
    //     $data = $this -> getContent();
    //     preg_match_all("/id\=\"en\-collins\"\>(.*)\<div class\=\"source\"\>/Us",$data,$collins);
    //     return strip_tags($collins[1][0]);
    // }

    /**
     * Extract the phrase section of the dictionary page.
     *
     * The page returned by getContent() is searched for the block between
     * id="en-phrase" and the next <div class="source">. That block is split on
     * "</dd>" into entries and at most self::$num entries are processed. Each
     * entry is split on "</p>": the first piece (tags and spaces removed) is
     * the phrase, the second piece its meaning. Entries with more than three
     * pieces have their remaining pieces grouped in pairs and nested under the
     * phrase.
     *
     * @return array phrase => meaning (string), or phrase => nested array when
     *               the entry carries extra pieces; an empty array when the
     *               phrase section is not found
     */
    private function getPhrase(){
        $html = $this -> getContent();
        preg_match_all("/id=\"en\-phrase\"\>(.*)\<div class\=\"source\"\>/Us",$html,$phrase);
        $result = array();
        // No phrase section on the page: nothing to parse.
        if (!isset($phrase[1][0])) {
            return $result;
        }
        $entries = array_slice(explode("</dd>",$phrase[1][0]),0,self::$num);
        foreach ($entries as $entry) {
            $pieces = explode("</p>", $entry);
            $n = count($pieces);
            // A fragment without a second piece (such as the text after the
            // last "</dd>") has no meaning to record.
            if ($n < 2) {
                continue;
            }
            if ($n <= 3) {
                $result[str_replace(" ","",strip_tags($pieces[0]))] = strip_tags($pieces[1]);
                continue;
            }
            $head = array_slice($pieces,0,2);
            // Drop the last piece, then every piece equal to the phrase or its meaning.
            $extra = array_diff(array_slice($pieces,0,$n-1),$head);
            $key_value = trim(str_replace(" ","",strip_tags($head[0])));
            $result[$key_value] = strip_tags($head[1]);
            foreach (array_chunk($extra,2) as $pair) {
                foreach ($pair as $k => $v) {
                    $pair[$k] = strip_tags($v);
                }
                // Each extra pair wraps whatever has been collected so far.
                $result[$key_value] = array($result[$key_value],$pair);
            }
        }
        return $result;
    }

    /**
     * Convert an array to a string.
     *
     * @param array $data       the array to convert
     * @param bool  $isformdata when 0, new_stripslashes() is not applied first;
     *                          optional, defaults to 1
     * @return string the var_export() text passed through addslashes(); an
     *                empty string when $data loosely equals ''
     */
    private function array2string($data, $isformdata = 1) {
        if ($data == '') {
            return '';
        }
        $clean = $isformdata ? $this->new_stripslashes($data) : $data;
        return addslashes(var_export($clean, TRUE));
    }

    /**
     * Return the given string or array after stripslashes() processing.
     *
     * @param $string the string or array to process; arrays are walked recursively
     * @return mixed
     */
    private function new_stripslashes($string) {
        if (is_array($string)) {
            foreach ($string as $key => $val) {
                $string[$key] = $this->new_stripslashes($val);
            }
            return $string;
        }
        return stripslashes($string);
    }

}

// $word = new dict("express");
// $word ->content();
--------------------------------------------------------------------------------

├── 2.png
├── README.md
├── dictdemo.php
└── dict.class.php

/2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/widuu/baidu_dict/HEAD/2.png

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

## 百度词典查词采集器（PHP版本）

![采集样本](https://raw.github.com/widuu/baidu_dict/master/2.png)

自己写的百度词典dict.baidu.com 采集翻译用的，你自己也可以自己定制，其中包含几个文件：

1. word_data.php 13.5W单词库

2. dict.class.php 采集类

3. dictdemo.php 简单的采集案例

### 使用方法

只要把文件放到你的指定的目录下即可，然后运行dictdemo.php可以查看效果，入库的流程请自己编写

`dict.class.php`中有`array2string()`的方法，来把数组转化成字符串方便入库。

--------------------------------------------------------------------------------
/dictdemo.php:
--------------------------------------------------------------------------------
// NOTE(review): lines 1-30 of dictdemo.php are missing from this extract (the
// numbering jumps from 1 to 31). Only the tail of a new_stripslashes() helper
// survives below; the array2string() helper called further down is presumably
// defined in the lost lines as well - confirm against the original file.
$val) $string[$key] = new_stripslashes($val);
    return $string;
}

$dbname = "dict_word";
$hostname = "localhost";
$username = "root";
// SECURITY: credentials are hard-coded in a published file; move them to
// configuration outside the repository and rotate this password.
$password = "dgj99349";
// The mysql_* extension was removed in PHP 7, so the connection uses mysqli.
// A failed connection is tolerated because nothing below queries the database yet.
$conn = mysqli_connect($hostname, $username, $password, $dbname);
if ($conn) {
    mysqli_set_charset($conn, "utf8");
}
$url = "http://www.test.com/"; // URL of the site where dictdemo.php is deployed
// Pass true explicitly: without an argument the call only reads the current setting.
ignore_user_abort(true);
set_time_limit(0);
$filename = "./dict_num.txt";
include("dict.class.php");
$data = include("word_data.php");
$dict = new dict();
if (isset($_GET["num"])) {
    // ?num= selects which entry of the word list to look up on this request.
    $key = intval($_GET["num"]);
    $word = $data[$key];
    $result = $dict -> content($word);
    if (empty($result["symbol"]["en"])) {
        // Some unusual words return nothing at all; record them in a file.
        $result = file_put_contents("noword.txt", $word."\n", FILE_APPEND);
    } else {
        // Drop empty entries from each section, then serialize it for storage.
        $symbol = array2string(array_filter($result['symbol']));
        $pro = array2string(array_filter($result['pro']));
        $example = array2string(array_filter($result['example']));
        $explain = array2string(array_filter($result['explain']));
        $synonym = array2string(array_filter($result['synonym']));
        $phrase = array2string(array_filter($result['phrase']));
        // NOTE(review): this literal holds only two newlines in the extract;
        // presumably an HTML tag was stripped from it - confirm.
        echo "

";
// Debug output: print every serialized section, in the order it was built.
        foreach (array($symbol, $pro, $example, $explain, $synonym, $phrase) as $section) {
            print_r($section);
        }
echo "

";
        //mysql_query("insert into dict_word (`word`,`symbol`,`pro`,`example`,`explain`,`synonym`,`phrase`) values ('{$word}')")

        // Your own collect-and-store logic goes here.
    }
    // Remember which index to process on the next request.
    $num = $key+1;
    file_put_contents($filename,$num);
}

if (file_exists($filename)) {
    $key = file_get_contents($filename);
} else {
    // First run: create an empty progress file and release the handle at once.
    $fp = fopen("$filename", "w+");
    if ($fp) {
        fclose($fp);
    }
}

$key = empty($key) ? 0 : intval($key);

// NOTE(review): this literal is empty in the extract and the source numbering
// jumps from 85 to 88 here; presumably markup was stripped - confirm.
echo "";

--------------------------------------------------------------------------------
/dict.class.php:
--------------------------------------------------------------------------------
// NOTE(review): lines 1-27 of dict.class.php (the opening tag, the class
// declaration and whatever members precede content()) are missing from this
// extract. The docblock below was rebuilt from its surviving tail.

    /**
     * Look up a word and collect every section this class extracts.
     *
     * @param $word the word to look up; it is stored in $this->word and used
     *              by getContent() to build the query URL
     * @return array array(
     *     "symbol"  => phonetic symbols,
     *     "pro"     => pronunciation,
     *     "example" => example sentences,
     *     "explain" => concise definitions,
     *     "synonym" => synonyms and antonyms,
     *     "phrase"  => phrases
     * )
     */

    public function content($word){
        $this -> word = $word;
        // Every getter below calls getContent() on its own.
        return array(
            "symbol" => $this -> Pronounced(),  // phonetic symbols
            "pro"    => $this -> getSay(),      // pronunciation
            "example"=> $this -> getExample(),  // example sentences
            "explain"=> $this -> getExplain(),  // concise definitions
            "synonym"=> $this -> getSynonym(),  // synonyms and antonyms
            "phrase" => $this -> getPhrase()    // phrases
        );
    }


    /**
     * Fetch the Baidu dictionary page for the current word with cURL (GET).
     *
     * @return mixed the value of curl_exec(): the response body, since
     *               CURLOPT_RETURNTRANSFER is set, or false on failure
     */

    private function getContent(){
        $useragent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0";
        $ch = curl_init();
        // Encode the word so spaces and non-ASCII characters form a valid query string.
        $url = "http://dict.baidu.com/s?wd=".urlencode($this->word);
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_USERAGENT,$useragent);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
        curl_setopt($ch, CURLOPT_HTTPGET, 1);
        curl_setopt($ch, CURLOPT_AUTOREFERER,1);
        curl_setopt($ch, CURLOPT_HEADER, 0);
        curl_setopt($ch, CURLOPT_TIMEOUT, 30);
// getContent() continues here: run the prepared request and hand back the body.
        $result = curl_exec($ch);
        // Query the handle that was actually used for the request.
        if (curl_errno($ch)) {
            echo 'Errno'.curl_error($ch);
        }
        curl_close($ch);
        return $result;
    }


    /**
     * Extract the phonetic transcriptions from the dictionary page.
     *
     * Takes the first two matches of the "EN-US"> ... </b> pattern in the page
     * returned by getContent().
     *
     * @return array ('en' => first match, 'us' => second match), labelled
     *               British and American in the original comment; a missing
     *               match yields null
     */

    private function Pronounced(){
        $html = $this -> getContent();
        preg_match_all("/\"EN\-US\"\>(.*)\<\/b\>/Ui",$html,$pronounced);
        return array(
            'en' => isset($pronounced[1][0]) ? $pronounced[1][0] : null,
            'us' => isset($pronounced[1][1]) ? $pronounced[1][1] : null
        );
    }

    /**
     * Extract the pronunciation entries from the dictionary page.
     *
     * Takes the values of the first two url="..." attributes in the page
     * returned by getContent().
     *
     * @return array ('en' => first match, 'us' => second match), labelled
     *               British and American in the original comment; a missing
     *               match yields null
     */

    private function getSay(){
        $html = $this -> getContent();
        preg_match_all("/url=\"(.*)\"/Ui",$html,$pronounced);
        return array(
            'en' => isset($pronounced[1][0]) ? $pronounced[1][0] : null,
            'us' => isset($pronounced[1][1]) ? $pronounced[1][1] : null
        );
    }

    /**
     * Extract the example sentences from the dictionary page.
     *
     * Reads the "var example_data = ...];" assignment embedded in the page,
     * splits it on "[[[" and "[[", joins the quoted strings of each piece with
     * spaces, and groups the joined strings two at a time.
     *
     * @return array list of chunks holding up to two joined strings each
     */

    private function getExample(){
        $html = $this -> getContent();
        preg_match_all("/var example_data = (.*)\]\;/Us",$html,$example);
        // Fall back to an empty payload when the page carries no example data.
        $payload = isset($example[1][0]) ? $example[1][0] : "";
        $joined = "";
        foreach (explode("[[[","[[[".ltrim($payload,"[")) as $group) {
            foreach (explode("[[","[[".$group) as $row) {
                preg_match_all("/\[\"(.*)\",/Us","[".$row, $match);
                if (!empty($match[1])) {
                    // Separator first: the array-first order is rejected by PHP 8.
                    $joined .= implode(" ",$match[1])."@";
                }
            }
        }
        // "@" separates the joined strings; consecutive ones are paired up.
        return array_chunk(explode("@", trim($joined,"@")), 2);
    }

    /**
     * Extract the concise definitions.
     *
     * NOTE(review): the three patterns below look damaged by extraction. The
     * first has no closing delimiter, and the named groups read afterwards
     * ("adj", "name", "tag", "word") have lost their names in the patterns.
     * They are kept exactly as found; restore them from the original file.
     *
     * @return array ("x" => map built from the "adj"/"name" groups,
     *                "b" => map built from the "tag"/"word" groups); the
     *                original comment labels these part of speech and related
     *                forms
     **/

    private function getExplain(){
        $data = $this -> getContent();
        preg_match_all("/id\=\"en\-simple\-means\"\>(.*)\/Us",$data,$explain);
        $r_data = $explain[1][0];
        preg_match_all("/\\(?P.*)\<\/strong\>\(?P.*)\<\/span\>\<\/p\>/Us", $r_data, $a_data);
        preg_match_all("/\(?P[^\>]+)\：\(?P.*)\<\/a\>\<\/span\>/Us", $r_data, $b_data);

        // Map each "adj" capture to the "name" capture at the same index.
        $result = array();
        foreach ($a_data["adj"] as $key => $value) {
            $result[$value] = $a_data["name"][$key];
        }

        // Map each "tag" capture to its tag-stripped "word" capture.
        $word_b = array();
        foreach ($b_data["tag"] as $key => $value) {
            $word_b[$value] = strip_tags($b_data["word"][$key]);
        }

        $result_data = array("x" => $result,"b" => $word_b);

        return $result_data;
    }




    /**
     * Fetch the encyclopedia definition (disabled).
     * return string
     */

    // private function getBaike(){
    //     $data = $this -> getContent();
    //     preg_match_all("/id\=\"en\-baike\-mean\"\>(.*)<\/div>/Us",$data,$baike);
    //     return strip_tags($baike[1][0]);
    // }

    /**
     * Extract synonyms and antonyms.
     *
     * NOTE(review): the explode() delimiter below is empty and the patterns
     * have lost markup and group names ("adj", "content", "title") in
     * extraction. They are kept exactly as found; restore them from the
     * original file.
     *
     * @return array nested as [section index][adj][title] => text; the
     *               original comment describes index 0 as synonyms and
     *               index 1 as antonyms
     */

    private function getSynonym(){
        $data = $this -> getContent();
        preg_match_all("/id=\"en\-syn\-ant\"\>(.*)/Us",$data,$synonym);
        $content = $synonym[1][0];
        $data1 = explode("", $content);
        $result = array();
        $data2 = array();
        // First pass: collect the "adj" and "content" captures of every section.
        foreach ($data1 as $key => $value) {
            preg_match_all("/\(?P.*)\ \;\<\/strong\>\<\/div\>\\(?.*)\<\/ul\>/Us", $value, $r_data);
            $data2[$key]["adj"] = $r_data["adj"];
            $data2[$key]["content"] = $r_data["content"];
        }

        // Second pass: break each non-empty "content" capture into title => text.
        foreach ($data2 as $key => $value) {
            foreach ($value["content"] as $k => $v) {
                if (!empty($v)) {
                    preg_match_all("/\\(?P.*)\<\/p\>(?P<value>.*)\<\/li>/Us", $v, $v_data);
                    foreach ($v_data['title'] as $m => $d) {
                        // Turn every closing anchor tag into a space before stripping tags.
                        $data = strip_tags(preg_replace("<</a>>"," ", $v_data["value"][$m]));
                        $result[$key][$value["adj"][$k]][$d] = $data;
                    }
                }
            }
        }
        return $result;
    }

    /**
     * Fetch the Collins Advanced English-Chinese dictionary entry (disabled).
     * return string
     */

    // private function getCollins(){
    //     $data = $this -> getContent();
    //
//     preg_match_all("/id\=\"en\-collins\"\>(.*)\<div class\=\"source\"\>/Us",$data,$collins);
    //     return strip_tags($collins[1][0]);
    // }

    /**
     * Extract the phrase section of the dictionary page.
     *
     * The page returned by getContent() is searched for the block between
     * id="en-phrase" and the next <div class="source">. That block is split on
     * "</dd>" into entries and at most self::$num entries are processed. Each
     * entry is split on "</p>": the first piece (tags and spaces removed) is
     * the phrase, the second piece its meaning. Entries with more than three
     * pieces have their remaining pieces grouped in pairs and nested under the
     * phrase.
     *
     * @return array phrase => meaning (string), or phrase => nested array when
     *               the entry carries extra pieces; an empty array when the
     *               phrase section is not found
     */

    private function getPhrase(){
        $html = $this -> getContent();
        preg_match_all("/id=\"en\-phrase\"\>(.*)\<div class\=\"source\"\>/Us",$html,$phrase);
        $result = array();
        // No phrase section on the page: nothing to parse.
        if (!isset($phrase[1][0])) {
            return $result;
        }
        $entries = array_slice(explode("</dd>",$phrase[1][0]),0,self::$num);
        foreach ($entries as $entry) {
            $pieces = explode("</p>", $entry);
            $n = count($pieces);
            // A fragment without a second piece (such as the text after the
            // last "</dd>") has no meaning to record.
            if ($n < 2) {
                continue;
            }
            if ($n <= 3) {
                $result[str_replace(" ","",strip_tags($pieces[0]))] = strip_tags($pieces[1]);
                continue;
            }
            $head = array_slice($pieces,0,2);
            // Drop the last piece, then every piece equal to the phrase or its meaning.
            $extra = array_diff(array_slice($pieces,0,$n-1),$head);
            $key_value = trim(str_replace(" ","",strip_tags($head[0])));
            $result[$key_value] = strip_tags($head[1]);
            foreach (array_chunk($extra,2) as $pair) {
                foreach ($pair as $k => $v) {
                    $pair[$k] = strip_tags($v);
                }
                // Each extra pair wraps whatever has been collected so far.
                $result[$key_value] = array($result[$key_value],$pair);
            }
        }
        return $result;
    }

    /**
     * Convert an array to a string.
     *
     * @param array $data       the array to convert
     * @param bool  $isformdata when 0, new_stripslashes() is not applied first;
     *                          optional, defaults to 1
     * @return string the var_export() text passed through addslashes(); an
     *                empty string when $data loosely equals ''
     */
    private function array2string($data, $isformdata = 1) {
        if ($data == '') {
            return '';
        }
        $clean = $isformdata ? $this->new_stripslashes($data) : $data;
        return addslashes(var_export($clean, TRUE));
    }

    /**
     * Return the given string or array after stripslashes() processing.
     *
     * @param $string the string or array to process; arrays are walked recursively
     * @return mixed
     */
    private function new_stripslashes($string) {
        if (is_array($string)) {
            foreach ($string as $key => $val) {
                $string[$key] = $this->new_stripslashes($val);
            }
            return $string;
        }
        return stripslashes($string);
    }

}

// $word = new dict("express");
// $word ->content();
--------------------------------------------------------------------------------