├── 2.png ├── README.md ├── dictdemo.php └── dict.class.php /2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/widuu/baidu_dict/HEAD/2.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | ## 百度词典查词采集器(PHP版本) 3 | 4 |  5 | 6 | 自己写的百度词典dict.baidu.com 采集翻译用的,你自己也可以自己定制,其中包含几个文件: 7 | 8 | 1. word_data.php 13.5W单词库 9 | 10 | 2. dict.class.php 采集类 11 | 12 | 3. dictdemo.php 简单的采集案例 13 | 14 | ### 使用方法 15 | 16 | 只要把文件放到你的指定的目录下即可,然后运行dictdemo.php可以查看效果,入库的流程请自己编写 17 | 18 | `dict.class.php`中有`array2string()`的方法,来把数组转化成字符串方便入库。 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /dictdemo.php: -------------------------------------------------------------------------------- 1 | $val) $string[$key] = new_stripslashes($val); 31 | return $string; 32 | } 33 | 34 | $dbname = "dict_word"; 35 | $hostname = "localhost"; 36 | $username = "root"; 37 | $password = "dgj99349"; // NOTE(review): hard-coded credential; load it from configuration instead 38 | $conn = mysql_connect($hostname,$username,$password); // NOTE(review): ext/mysql was removed in PHP 7.0; port to mysqli or PDO 39 | mysql_select_db($dbname,$conn); 40 | mysql_query("set names utf8"); 41 | $url = "http://www.test.com/"; // base URL of the site where dictdemo.php is hosted 42 | ignore_user_abort(true); // must pass true: with no argument the call only reads the current setting 43 | set_time_limit(0); 44 | $filename = "./dict_num.txt"; 45 | include("dict.class.php"); 46 | $data = include("word_data.php"); 47 | $dict = new dict(); 48 | if (isset($_GET["num"])){ 49 | $key = intval($_GET["num"]); 50 | $word = $data[$key]; 51 | $result = $dict -> content($word); 52 | if(empty($result["symbol"]["en"])){ 53 | $result = file_put_contents("noword.txt", $word."\n", FILE_APPEND); // some unusual words return no data at all; log them to this file 54 | }else{ 55 | $symbol = array2string(array_filter($result['symbol'])); 56 | $pro = array2string(array_filter($result['pro'])); 57 | $example = array2string(array_filter($result['example'])); 58 | $explain = 
array2string(array_filter($result['explain'])); 59 | $synonym = array2string(array_filter($result['synonym'])); 60 | $phrase = array2string(array_filter($result['phrase'])); 61 | echo "
"; 62 | print_r($symbol); 63 | print_r($pro); 64 | print_r($example); 65 | print_r($explain); 66 | print_r($synonym); 67 | print_r($phrase); 68 | echo ""; 69 | //mysql_query("insert into dict_word (`word`,`symbol`,`pro`,`example`,`explain`,`synonym`,`phrase`) values ('{$word}')") 70 | 71 | // your own logic for storing the scraped data goes here 72 | } 73 | $num = $key+1; 74 | file_put_contents($filename,$num); 75 | } 76 | 77 | if(file_exists($filename)){ 78 | $key = file_get_contents($filename); 79 | }else{ 80 | $fp =fopen("$filename", "w+"); 81 | } 82 | 83 | $key = empty($key) ? 0 : intval($key); 84 | 85 | echo ""; 88 | 89 | 90 | 91 | -------------------------------------------------------------------------------- /dict.class.php: -------------------------------------------------------------------------------- 1 | phonetic symbols 28 | * "pro" => pronunciation 29 | * "example"=> example sentences 30 | * "explain"=> concise definitions 31 | * "synonym"=> synonyms and antonyms 32 | * "phrase" => phrase array 33 | * ) 34 | * 35 | */ 36 | 37 | public function content($word){ // NOTE(review): the getters visible in this file each re-download the page via getContent(); consider fetching once 38 | $this -> word = $word; 39 | $symbol = $this -> Pronounced(); 40 | $pro = $this->getSay(); 41 | $example = $this -> getExample(); 42 | $explain = $this -> getExplain(); 43 | $synonym = $this -> getSynonym(); 44 | $phrase = $this -> getPhrase(); 45 | $result = array( 46 | "symbol" => $symbol, // phonetic symbols 47 | "pro" => $pro, // pronunciation 48 | "example"=> $example, // example sentences 49 | "explain"=> $explain, // concise definitions 50 | "synonym"=> $synonym, // synonyms and antonyms 51 | "phrase" => $phrase // phrase array 52 | ); 53 | return $result; 54 | } 55 | 56 | 57 | /** 58 | * Fetch the dict.baidu.com result page for the current word 59 | * with a cURL GET request (redirects followed, 30 second timeout) 60 | * return string|false page body, or false when the request fails 61 | * 62 | */ 63 | 64 | private function getContent(){ 65 | $useragent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0"; 66 | $ch = curl_init(); 67 | $url = "http://dict.baidu.com/s?wd=".urlencode($this->word); // encode so spaces and non-ASCII words form a valid query string 68 | curl_setopt($ch, CURLOPT_URL, $url); 69 | curl_setopt($ch, CURLOPT_USERAGENT,$useragent); 70 | curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE); 71 | curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1); 72 | curl_setopt($ch, 
CURLOPT_HTTPGET, 1); 73 | curl_setopt($ch, CURLOPT_AUTOREFERER,1); 74 | curl_setopt($ch, CURLOPT_HEADER, 0); 75 | curl_setopt($ch, CURLOPT_TIMEOUT, 30); 76 | $result = curl_exec($ch); 77 | if (curl_errno($ch)) { // inspect the handle that was actually used for the request 78 | echo 'Errno'.curl_error($ch); 79 | } 80 | curl_close($ch); 81 | return $result; 82 | } 83 | 84 | 85 | /** 86 | * Extract the phonetic symbols of the current word from the fetched page 87 | * return array('en' => British, 'us' => American); an entry is null when the page has no match 88 | * 89 | */ 90 | 91 | private function Pronounced(){ 92 | $data = $this -> getContent(); 93 | preg_match_all("/\"EN\-US\"\>(.*)\<\/b\>/Ui",$data,$pronounced); 94 | return array( 95 | 'en' => isset($pronounced[1][0]) ? $pronounced[1][0] : null, 96 | 'us' => isset($pronounced[1][1]) ? $pronounced[1][1] : null 97 | ); 98 | } 99 | 100 | /** 101 | * Extract the pronunciation audio URLs (the url="..." attribute values) from the fetched page 102 | * return array('en' => British, 'us' => American); an entry is null when the page has no match 103 | * 104 | */ 105 | 106 | private function getSay(){ 107 | $data = $this -> getContent(); 108 | preg_match_all("/url=\"(.*)\"/Ui",$data,$pronounced); 109 | return array( 110 | 'en' => isset($pronounced[1][0]) ? $pronounced[1][0] : null, 111 | 'us' => isset($pronounced[1][1]) ? $pronounced[1][1] : null 112 | ); 113 | } 114 | 115 | /** 116 | * Extract the example sentences held in the page's "var example_data" script variable 117 | * return array() nested array: the joined token strings grouped in chunks of two (presumably sentence and translation; verify against live data) 118 | * 119 | */ 120 | 121 | private function getExample(){ 122 | $str = ""; 123 | $data = $this -> getContent(); 124 | preg_match_all("/var example_data = (.*)\]\;/Us",$data,$example); 125 | $data1 = "[[[".ltrim(isset($example[1][0]) ? $example[1][0] : "","["); 126 | $data2 = explode("[[[",$data1); 127 | $num = count(array_filter($data2)); 128 | foreach($data2 as $key => $value){ 129 | $data3 = explode("[[","[[".$value); 130 | foreach ($data3 as $k => $v) { 131 | preg_match_all("/\[\"(.*)\",/Us","[".$v, $match); 132 | if(!empty($match[1])){ 133 | $str .= implode(" ",$match[1])."@"; // separator first: the (array, separator) order was removed in PHP 8.0 134 | } 135 | } 136 | } 137 | $data4 = trim($str,"@"); 138 | $data5 = explode("@", $data4); 139 | $result = array_chunk($data5, 2); 140 | return $result; 141 | } 142 | 143 | /** 144 | * 获取简明释义 145 | * return array (x => "词性",b => "附属") 146 | * 147 | **/ 148 | 149 | private function getExplain(){ 150 | $data = $this -> getContent(); 151 | preg_match_all("/id\=\"en\-simple\-means\"\>(.*)\