├── _config.yml ├── .gitignore ├── proxy.json ├── index.php ├── README.md └── class ├── sahibinden.class.php └── simple_html_dom.php /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-cayman -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | 3 | /.idea/ 4 | _config.yml 5 | -------------------------------------------------------------------------------- /proxy.json: -------------------------------------------------------------------------------- 1 | [ 2 | "193.255.1.102:8080", 3 | "178.250.90.18:8080", 4 | "88.255.225.58:8080" 5 | ] -------------------------------------------------------------------------------- /index.php: -------------------------------------------------------------------------------- 1 | Kategori(); 18 | //echo $Sahibinden->Kategori("xml","emlak",true); //Emlak Kategorisindekiler proxy kullanara xml formatında döndürür 19 | 20 | //Alt Kategoriler 21 | //@return xml,json,array 22 | 23 | //echo $Sahibinden->Kategori("json","ozel-ders-verenler"); 24 | //echo $Sahibinden->Kategori("json","kiralik"); 25 | 26 | 27 | //Listeler 28 | //@return xml,json,array 29 | $filters = array( 30 | //"date" => "1days", //1,3,7,15,30 //1 günlük ilanlar 31 | "address_city" => array("34"), //il plaka kodu 32 | "address_town" => array("451"), //ilçe kodu 33 | //"price_currency" => "1", //1=TL, 2=USD, 3=EUR, 4=GBP //para birimi 34 | //"price_min" => "5000", //minimum fiyat 35 | // "price_max" => "12000", //maximum fiyat 36 | // "hasVideo" => "false", //videolu ilanlar 37 | //"hasPhoto" => "true", //fotoğrafı olan ilanlar 38 | //"hasMegaPhoto" => "false", // büyük fotoğrafı olan ilanlar 39 | "sorting" => "price_asc" //sıralama price_asc, price_desc, date_asc, date_desc, address_desc, address_asc 40 | ); 41 | 42 | //print_r($Sahibinden->Liste('emlak',40,$filters,"array")); // 
Kiralık Ev Kategorisinden filtrelere uygun 40 kaydı array formatında döndürür 43 | echo $Sahibinden->Liste('kiralik-daire',100,$filters,'json'); //Emlak Kategorisinden 20 Kaydı JSON formatında döndürür. 44 | 45 | //İl ve İlçe Kodları (Filtreleme için) 46 | //@return xml,json,array 47 | //echo $Sahibinden->TownCodes(NULL, "xml"); //Tüm il ve ilçeleri XML formatında döndürür 48 | //echo $Sahibinden->TownCodes(34); // İstanbul ilçelerini JSON formatında döndürür 49 | 50 | 51 | 52 | //İlan Detayı 53 | 54 | //echo $Sahibinden->Detay("/ilan/vasita-otomobil-lotus-lotus-cars-turkey-elise-20th-edition-398612300/detay","json"); 55 | 56 | 57 | //Mağaza Bilgileri 58 | $stores = array("remaxpiramit", 59 | "vatanotomobil", 60 | "blackmotors"); 61 | //echo $Sahibinden->Magaza($stores); 62 | 63 | //Mağaza Kategorilerini Alt Kategoriyle birlikte 64 | //print_r($Sahibinden->MagazaKategori("remaxpiramit",NULL,"array")); 65 | //echo $Sahibinden->MagazaKategori("remaxpiramit",NULL,"json",true);// Mağaza Kategorilerini proxy ile json formatında getirir 66 | 67 | 68 | 69 | //Mağaza İlan Listesi 70 | $filters = array( 71 | "userId" => "57127" //Birden fazla seçilemez 72 | ); 73 | //echo $Sahibinden->MagazaListe("remaxpiramit",21); 74 | 75 | //Mağaza Danışman Listesi 76 | 77 | //echo $Sahibinden->MagazaDanismanlari("remaxpiramit","json"); 78 | 79 | 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Hakkında 2 | ==================== 3 | 4 | 5 | Sahibinden.com için @tayfunerbilen 'in eskiden hazırlamış olduğu bot'u güncel hale getirdim. Ve yeni özellikler eklemeye devam edeceğim. 6 | Şuan güzel bir şekilde; kategorileri, alt kategorileri, kategori listelerini ve detayları çekmektedir. 7 | 8 | 3 formattan dilediğinizi döndürebilirsiniz. 9 | * JSON 10 | * Array 11 | * XML 12 | 13 | Kullanımına aşağıdan bakabilirsiniz. 
14 | 15 | * Default olarak json değer dönmektedir. 16 | * Proxy kullanımı istek sürenizi uzatabilir 17 | 18 | Kullanımı 19 | = 20 | 21 |

Tanımlama

22 | 23 | ```php 24 | 25 | require 'class/sahibinden.class.php'; 26 | $Sahibinden = new Sahibinden(); 27 | 28 | ``` 29 | 30 |

31 |

Ana Kategoriler

32 | @return xml,json,array 33 | 34 | ```php 35 | 36 | echo $Sahibinden->Kategori(); 37 | echo $Sahibinden->Kategori("xml","emlak",true); //Emlak kategorisindekileri proxy kullanarak xml formatında döndürür 38 | 39 | ``` 40 | 41 |

42 |

Alt Kategoriler

43 | @return xml,json,array 44 | 45 | 46 | ```php 47 | 48 | echo $Sahibinden->Kategori("json","ozel-ders-verenler"); 49 | echo $Sahibinden->Kategori("json","kiralik"); 50 | 51 | ``` 52 | 53 |

54 |

Listeler

55 | Sahibinden'de ilan içerisinde kullanılan tüm GET parametrelerini "filters" dizisine key=>value şeklinde ekleyerek filtrelemeleri yapabilirsiniz.
56 | @return xml,json,array 57 | 58 | 59 | ```php 60 | 61 | $filters = array( 62 | "date" => "1days", //1,3,7,15,30 //1 günlük ilanlar 63 | "address_city" => "34", //il plaka kodu 64 | "address_town" => "71", //ilçe kodu 65 | "price_currency" => "1", //1=TL, 2=USD, 3=EUR, 4=GBP //para birimi 66 | "price_min" => "0", //minimum fiyat 67 | "price_max" => "12000", //maximum fiyat 68 | "hasVideo" => "false", //videolu ilanlar 69 | "hasPhoto" => "true", //fotoğrafı olan ilanlar 70 | "hasMegaPhoto" => "false", // büyük fotoğrafı olan ilanlar 71 | "sorting" => "price_asc" //sıralama price_asc, price_desc, date_asc, date_desc, address_desc, address_asc 72 | ); 73 | 74 | print_r($Sahibinden->Liste('kiralik',40,$filters,"array")); // Kiralık Ev Kategorisinden filtrelere uygun 40 kaydı array formatında döndürür 75 | echo $Sahibinden->Liste('emlak'); //Emlak Kategorisinden 20 Kaydı JSON formatında döndürür. 76 | 77 | ``` 78 | 79 |

80 |

İl ve İlçe Kodları (Filtreleme için)

81 | @return xml,json,array 82 | 83 | ```php 84 | 85 | echo $Sahibinden->TownCodes(NULL, "xml"); //Tüm il ve ilçeleri XML formatında döndürür 86 | echo $Sahibinden->TownCodes(34); // İstanbul ilçelerini JSON formatında döndürür 87 | 88 | ``` 89 |

90 |

İlan Detayı

91 | @return xml,json,array 92 | 93 | ```php 94 | 95 | echo $Sahibinden->Detay("/ilan/vasita-otomobil-lotus-lotus-cars-turkey-elise-20th-edition-398612300/detay","json"); 96 | 97 | ``` 98 | 99 |

100 | 101 | Mağazalar 102 | - 103 | 104 |

Mağaza Bilgileri

105 | 106 | ```php 107 | 108 | $stores = array("remaxpiramit", 109 | "vatanotomobil", 110 | "blackmotors"); 111 | echo $Sahibinden->Magaza($stores); 112 | 113 | ``` 114 | 115 |

Mağaza Kategorileri (Alt Kategorileri ile birlikte)

116 | @return xml,json,array 117 | 118 | ```php 119 | 120 | print_r($Sahibinden->MagazaKategori("remaxpiramit",NULL,"array")); 121 | echo $Sahibinden->MagazaKategori("remaxpiramit",NULL,"json",true);// Mağaza Kategorilerini proxy ile json formatında getirir 122 | 123 | ``` 124 | 125 |

126 |

Mağaza İlan Listesi

127 | @return xml,json,array 128 | 129 | ```php 130 | 131 | $filters = array( 132 | "userId" => "57127" 133 | ); 134 | echo $Sahibinden->MagazaListe("remaxpiramit",20,$filters); 135 | 136 | ``` 137 | 138 |

139 |

Mağaza Danışman Listesi

140 | @return xml,json,array 141 | 142 | ```php 143 | 144 | echo $Sahibinden->MagazaDanismanlari("remaxpiramit","json"); 145 | 146 | ``` 147 | 148 | 149 | 150 | 151 | -------------------------------------------------------------------------------- /class/sahibinden.class.php: -------------------------------------------------------------------------------- 1 | data = array(); 30 | 31 | if (empty($url)) { 32 | if ($proxy == true) { 33 | $open = $this->Curl($this->baseUrl, true); 34 | } else { 35 | $open = $this->Curl($this->baseUrl); 36 | } 37 | if (!empty(@$open["error"])) { 38 | if (!str_get_html($open)->find("div.errorPage404")) { 39 | 40 | 41 | $items = str_get_html($open)->find("ul.categories-left-menu", 0)->find("li a[title]"); 42 | if (count($items) > 0) { 43 | foreach ($items as $element) { 44 | $this->data[] = array("title" => trim($element->plaintext), 45 | "uri" => trim($element->href), 46 | "url" => $this->baseUrl . trim($element->href) 47 | ); 48 | 49 | } 50 | } else { 51 | $this->data[] = array("error" => true, "url" => $this->baseUrl, "message" => "Sonuç Bulunamadı."); 52 | 53 | } 54 | 55 | } else { 56 | $this->data[] = array("error" => true, "url" => $url, "message" => "Sayfa Bulunamadı."); 57 | } 58 | } else { 59 | $this->data[] = array("error" => true, "url" => $url, "message" => $open["error"]); 60 | } 61 | 62 | } else { 63 | $url = $this->baseUrl . '/kategori/' . $url; 64 | if ($proxy == true) { 65 | $open = $this->Curl($url, true); 66 | } else { 67 | $open = $this->Curl($url); 68 | } 69 | if (!empty(@$open["error"])) { 70 | if (!str_get_html($open)->find("div.errorPage404")) { 71 | 72 | 73 | $items = str_get_html($open)->find("ul.categoryList", 0)->find("li a"); 74 | if (count($items) > 0) { 75 | foreach ($items as $element) { 76 | $this->data[] = array("title" => trim($element->plaintext), 77 | "uri" => trim($element->href), 78 | "url" => $this->baseUrl . 
trim($element->href) 79 | ); 80 | } 81 | } else { 82 | $this->data[] = array("error" => true, "url" => $url, "message" => "Sonuç Bulunamadı."); 83 | 84 | } 85 | } else { 86 | $this->data[] = array("error" => true, "url" => $url, "message" => "Sayfa Bulunamadı."); 87 | } 88 | } else { 89 | $this->data[] = array("error" => true, "url" => $url, "message" => $open["error"]); 90 | } 91 | 92 | } 93 | 94 | 95 | return $this->ReturnWithTypes($type); 96 | 97 | } 98 | 99 | /** 100 | * Kategoriye ait ilanları listeler. 101 | * 102 | * @param string $kategoriLink 103 | * @param int $itemCount 104 | * @param array $filters 105 | * @param string $type 106 | * @param $proxy false|true 107 | * @return json,array,xml 108 | */ 109 | public function Liste($kategoriLink, $itemCount = 20, $filters = NULL, $type = "json", $proxy = false) 110 | { 111 | $this->data = array(); 112 | $filterText = ""; 113 | if (is_array($filters)) { 114 | foreach ($filters as $key => $val) { 115 | if (is_array($filters[$key])) { 116 | foreach ($filters[$key] as $v) { 117 | $filterText .= "&" . $key . "=" . $v; 118 | } 119 | } else { 120 | $filterText .= "&" . $key . "=" . $val; 121 | } 122 | 123 | } 124 | } 125 | 126 | 127 | if ($itemCount > 20) { 128 | $pageCount = ceil($itemCount / 20); 129 | } else { 130 | $pageCount = 1; 131 | } 132 | 133 | for ($p = 0; $p <= $pageCount - 1; $p++) { 134 | $page = $p * 20; 135 | 136 | $pageFilter = '?pagingOffset=' . $page; 137 | $url = $this->baseUrl . "/" . $kategoriLink . $pageFilter . 
$filterText; 138 | if ($proxy == true) { 139 | $open = $this->Curl($url, true); 140 | } else { 141 | $open = $this->Curl($url); 142 | } 143 | if (!empty(@$open["error"])) { 144 | if (!str_get_html($open)->find("div.errorPage404")) { 145 | 146 | 147 | $links = str_get_html($open)->find("td.searchResultsLargeThumbnail a"); 148 | $images = str_get_html($open)->find("td.searchResultsLargeThumbnail a img"); 149 | $prices = @str_get_html($open)->find("td.searchResultsPriceValue div"); 150 | $dates = str_get_html($open)->find("td.searchResultsDateValue"); 151 | $addresses = str_get_html($open)->find("td.searchResultsLocationValue"); 152 | $resultText = str_get_html($open)->find("div.infoSearchResults div.result-text", 0)->plaintext; 153 | $resultCount = str_get_html($open)->find("div.infoSearchResults div.result-text span", 1)->plaintext; 154 | 155 | foreach ($links as $link) { 156 | $linkArray[] = array("link" => $this->baseUrl . trim($link->href)); 157 | $uriArray[] = array("uri" => trim($link->href)); 158 | } 159 | foreach ($images as $image) { 160 | $thumbArray[] = array("thumb" => trim($image->src)); 161 | $imageArray[] = array("image" => str_replace("thmb_", "", trim($image->src))); 162 | $titleArray[] = array("title" => trim(explode("#", $image->alt)[0])); 163 | $idArray[] = array("id" => trim(explode("#", $image->alt)[1])); 164 | } 165 | foreach (@$prices as $price) { 166 | $priceArray[] = array("price" => trim($price->plaintext)); 167 | } 168 | foreach ($dates as $date) { 169 | $dateArray[] = array("date" => str_replace("
", "", str_replace("", "", str_replace("", "", trim($date->plaintext))))); 170 | } 171 | foreach ($addresses as $address) { 172 | $addressArray[] = array("address" => str_replace("
", "", trim($address->plaintext))); 173 | } 174 | 175 | 176 | } else { 177 | $this->data[] = array("error" => true, "url" => $url, "message" => "Sayfa Bulunamadı."); 178 | } 179 | } else { 180 | $this->data[] = array("error" => true, "url" => $url, "message" => $open["error"]); 181 | } 182 | 183 | 184 | } 185 | 186 | if (count(@$linkArray) > 0) { 187 | $this->data["properties"] = array("count" => $itemCount, 188 | "resultText" => str_replace('"', "'", $resultText), 189 | "resultCount" => intval(str_replace(".", "", str_replace(' ilan ', "", $resultCount))), 190 | "filters" => $filters, 191 | "url" => str_replace("pagingOffset=" . $page, "", $url)); 192 | if(count($linkArray) < ($itemCount-1)){ 193 | for ($i = 0; $i <= count($linkArray)- 1; $i++) { 194 | $this->data["results"][] = @array_merge($idArray[$i], $linkArray[$i], $uriArray[$i], $titleArray[$i], $thumbArray[$i], $imageArray[$i], $priceArray[$i], $dateArray[$i], $addressArray[$i]); 195 | } 196 | }else{ 197 | for ($i = 0; $i <= $itemCount - 1; $i++) { 198 | $this->data["results"][] = @array_merge($idArray[$i], $linkArray[$i], $uriArray[$i], $titleArray[$i], $thumbArray[$i], $imageArray[$i], $priceArray[$i], $dateArray[$i], $addressArray[$i]); 199 | } 200 | } 201 | 202 | 203 | } else { 204 | $this->data[] = array("error" => true, "url" => $url, "message" => "Sonuç Bulunamadı."); 205 | } 206 | 207 | return $this->ReturnWithTypes($type); 208 | 209 | } 210 | 211 | 212 | /** 213 | * İlan detaylarını listeler. 214 | * 215 | * @param null $url 216 | * @param json $type 217 | * @param $proxy false|true 218 | * @return JSON,XML,Array 219 | */ 220 | public function Detay($uri = NULL, $type = "json", $proxy = false) 221 | { 222 | $this->data = array(); 223 | $url = $this->baseUrl . 
$uri; 224 | if ($uri != NULL) { 225 | if ($proxy == true) { 226 | $open = $this->Curl($url, true); 227 | } else { 228 | $open = $this->Curl($url); 229 | } 230 | 231 | if (!empty(@$open["error"])) { 232 | if (!str_get_html($open)->find("div.errorPage404")) { 233 | 234 | $title = str_get_html($open)->find("div.classifiedDetailTitle h1", 0); 235 | $this->data = array( 236 | "url" => $url, 237 | "title" => $title->plaintext, 238 | "breadCrumb" => $this->getDetailBreadcrumb($open), 239 | "address" => $this->getDetailAddress($open), 240 | "price" => $this->getDetailPrice($open), 241 | "seller" => $this->getDetailSeller($open), 242 | "coordinates" => $this->getDetailCoordinates($open), 243 | "info" => $this->getDetailInfo($open), 244 | "properties" => $this->getDetailProperties($open), 245 | "description" => $this->getDetailDescription($open), 246 | "media" => $this->getDetailMedia($open) 247 | ); 248 | 249 | 250 | } else { 251 | $this->data[] = array("error" => true, "url" => $url, "message" => "Sayfa Bulunamadı."); 252 | } 253 | } else { 254 | $this->data[] = array("error" => true, "url" => $url, "message" => $open["error"]); 255 | } 256 | 257 | 258 | } else { 259 | $this->data[] = array("error" => true, "url" => $url, "message" => "Sonuç Bulunamadı."); 260 | } 261 | 262 | 263 | return $this->ReturnWithTypes($type); 264 | 265 | } 266 | 267 | 268 | /** 269 | * Mağazaya bilgilerini getirir 270 | * 271 | * @param string $storeName 272 | * @param array ,json,xml $type 273 | * @param $proxy false|true 274 | * @return json,array,xml 275 | * @return default = json 276 | */ 277 | public function Magaza($storeName, $type = "json", $proxy = false) 278 | { 279 | $this->data = array(); 280 | if (is_array($storeName)) { 281 | foreach ($storeName as $sn) { 282 | if (!empty($sn)) { 283 | $url = "https://" . $sn . 
$this->storeEndUrl; 284 | if ($proxy == true) { 285 | $open = $this->Curl($url, true); 286 | } else { 287 | $open = $this->Curl($url); 288 | } 289 | if (!empty(@$open["error"])) { 290 | if (!str_get_html($open)->find("div.errorPage404")) { 291 | $storeImage = str_get_html($open)->find("div.information-area div", 0)->find("a img", 0); 292 | $storePhone = str_get_html($open)->find("div.information-area div", 0)->find("p", 0); 293 | $storeCover = str_get_html($open)->find("div.theme", 0)->find("img", 0); 294 | $storeIlanSayisi = str_get_html($open)->find("div.classified-count", 0)->find("strong", 0); 295 | $storeAbout = str_get_html($open)->find("div.about", 0)->find("h2", 0); 296 | $this->data[] = array( 297 | "store_name" => $sn, 298 | "title" => trim($storeImage->alt), 299 | "about" => trim($storeAbout->plaintext), 300 | "profile-img" => $storeImage->src, 301 | "cover-img" => $storeCover->src, 302 | "phone" => trim(@$storePhone->plaintext), 303 | "ad-count" => intval(trim($storeIlanSayisi->plaintext)), 304 | ); 305 | } else { 306 | $this->data[] = array("error" => true, "store_name" => $sn, "message" => "Sayfa Bulunamadı."); 307 | } 308 | } else { 309 | $this->data[] = array("error" => true, "store_name" => $sn, "message" => $open["error"]); 310 | } 311 | } else { 312 | $this->data[] = array("error" => true, "store_name" => $sn, "message" => "Mağaza adı boş olamaz"); 313 | } 314 | 315 | } 316 | 317 | 318 | } else { 319 | $this->data = array("error" => true, "store_name" => $storeName, "message" => "Mağaza ad(lar)ı dizi olarak giriniz."); 320 | } 321 | 322 | return $this->ReturnWithTypes($type); 323 | 324 | } 325 | 326 | 327 | /** 328 | * Mağazaya ait ana ve alt Kategorileri listelemek için kullanılır 329 | * 330 | * @param string $storeName 331 | * @param null $kategori 332 | * @param array ,json,xml $type 333 | * @param $proxy false|true 334 | * @return json,array,xml 335 | * @return default = json 336 | */ 337 | public function MagazaKategori($storeName, 
$kategori = NULL, $type = "json", $proxy = false) 338 | { 339 | if (!empty($storeName)) { 340 | if ($kategori == NULL) { 341 | $url = "https://" . $storeName . $this->storeEndUrl; 342 | } else { 343 | $url = "https://" . $storeName . $this->storeEndUrl . "/" . $kategori; 344 | } 345 | 346 | 347 | if ($proxy == true) { 348 | $open = $this->Curl($url, true); 349 | } else { 350 | $open = $this->Curl($url); 351 | } 352 | if (!empty(@$open["error"])) { 353 | if (!str_get_html($open)->find("div.errorPage404")) { 354 | 355 | for ($x = 0; $x <= 10; $x++) { 356 | $ul = str_get_html($open)->find("div.categories ul li.level" . $x); 357 | foreach ($ul as $u) { 358 | $categories = $u->find("a"); 359 | foreach ($categories as $c) { 360 | $uri = explode("?", str_replace("/", "", $c->href)); 361 | $cats = array( 362 | "title" => trim($c->plaintext), 363 | "uri" => $uri[0], 364 | "is_current_category" => $uri[0] == $kategori ? true : false, 365 | "url" => "https://" . $storeName . $this->storeEndUrl . "/" . 
$uri[0], 366 | "sub_categories" => NULL 367 | ); 368 | } 369 | $level = str_replace("level", "", $u->class); 370 | if ($level == 0) { 371 | $this->data[] = $cats; 372 | } else if ($level == 1) { 373 | $this->data[$x - $level]["sub_categories"][] = $cats; 374 | } else if ($level == 2) { 375 | $this->data[$x - $level]["sub_categories"][$x - $level]["sub_categories"][] = $cats; 376 | } else if ($level == 3) { 377 | $this->data[$x - $level]["sub_categories"][$x - $level]["sub_categories"][$x - $level]["sub_categories"][] = $cats; 378 | } else if ($level == 4) { 379 | $this->data[$x - $level]["sub_categories"][$x - $level]["sub_categories"][$x - $level]["sub_categories"][$x - $level]["sub_categories"][] = $cats; 380 | } 381 | 382 | 383 | } 384 | } 385 | 386 | 387 | } else { 388 | $this->data[] = array("error" => true, "store_name" => $storeName, "message" => "Sayfa Bulunamadı."); 389 | } 390 | } else { 391 | $this->data[] = array("error" => true, "store_name" => $storeName, "message" => $open["error"]); 392 | } 393 | 394 | } else { 395 | $this->data[] = array("error" => true, "store_name" => $storeName, "message" => "Mağaza adı giriniz."); 396 | } 397 | 398 | return $this->ReturnWithTypes($type); 399 | 400 | } 401 | 402 | 403 | /** 404 | * Mağazaya ait ilanları listeler. 405 | * 406 | * @param string $storeName 407 | * @param int $itemCount 408 | * @param array $filters 409 | * @param string $type 410 | * @param $proxy false|true 411 | * @return JSON,XML,Array 412 | */ 413 | public function MagazaListe($storeName, $itemCount = 20, $filters = NULL, $type = "json", $proxy = false) 414 | { 415 | 416 | $this->data = array(); 417 | $filterText = ""; 418 | if (is_array($filters)) { 419 | foreach ($filters as $key => $val) { 420 | if (is_array($filters[$key])) { 421 | foreach ($filters[$key] as $v) { 422 | $filterText .= "&" . $key . "=" . $v; 423 | } 424 | } else { 425 | $filterText .= "&" . $key . "=" . 
$val; 426 | } 427 | 428 | } 429 | } 430 | 431 | 432 | if ($itemCount > 20) { 433 | $pageCount = ceil($itemCount / 20); 434 | } else { 435 | $pageCount = 1; 436 | } 437 | $ic = 0; 438 | for ($p = 0; $p <= $pageCount - 1; $p++) { 439 | $page = $p * 20; 440 | $pageFilter = '?pagingOffset=' . $page; 441 | $url = "https://" . $storeName . $this->storeEndUrl . $pageFilter . $filterText; 442 | if ($proxy == true) { 443 | $open = $this->Curl($url, true); 444 | } else { 445 | $open = $this->Curl($url); 446 | } 447 | 448 | $columns = str_get_html($open)->find("div.classified-list table thead th"); 449 | $tr = str_get_html($open)->find("div.classified-list table tbody tr"); 450 | $colCount = count($columns); 451 | if (!empty(@$open["error"])) { 452 | if (!str_get_html($open)->find("div.errorPage404")) { 453 | 454 | if (count($tr) > 0) { 455 | 456 | for ($j = 1; $j <= count($tr) - 1; $j++) { 457 | if ($ic == $itemCount) { 458 | continue; 459 | } else { 460 | $d = array(); 461 | 462 | $href = str_get_html($open)->find("div.classified-list table tbody tr", $j)->find("td", 0)->find("a", 0); 463 | $img = str_get_html($open)->find("div.classified-list table tbody tr", $j)->find("td", 0)->find("a", 0)->find("img", 0); 464 | $baslik = explode("#", $img->alt); 465 | $d["id"] = $baslik[1]; 466 | $d["title"] = trim($baslik[0]); 467 | $d["link"] = $href->href; 468 | $d["image"] = $img->src; 469 | 470 | $imgExp = explode("/", $img->src); 471 | $thmb = "thmb_" . 
end($imgExp); 472 | array_pop($imgExp); 473 | array_push($imgExp, $thmb); 474 | $thumb = implode("/", $imgExp); 475 | $d["thumb"] = $thumb; 476 | 477 | for ($x = 0; $x <= $colCount - 1; $x++) { 478 | $row = str_get_html($open)->find("div.classified-list table tbody tr", $j)->find("td", $x); 479 | if (!empty(trim($columns[$x]->plaintext))) { 480 | $title = $this->turkishChars(strtolower(trim($columns[$x]->plaintext))); 481 | $d[$title] = trim($row->plaintext); 482 | } 483 | } 484 | 485 | 486 | $this->data[] = $d; 487 | $ic++; 488 | } 489 | } 490 | } else { 491 | $this->data[] = array("error" => true, "url" => $url, "message" => "Sonuç Bulunamadı."); 492 | } 493 | 494 | } else { 495 | $this->data[] = array("error" => true, "store_name" => $storeName, "message" => "Sayfa Bulunamadı."); 496 | } 497 | } else { 498 | $this->data[] = array("error" => true, "store_name" => $storeName, "message" => $open["error"]); 499 | } 500 | 501 | 502 | } 503 | 504 | 505 | return $this->ReturnWithTypes($type); 506 | 507 | } 508 | 509 | 510 | /** 511 | * Belirtilen mağazanın danışman listesini döndürür 512 | * 513 | * @param $store_name string 514 | * @param $type string 515 | * @param $proxy false|true 516 | * @return JSON,XML,Array 517 | */ 518 | public function MagazaDanismanlari($storeName, $type = "json", $proxy = false) 519 | { 520 | 521 | $this->data = array(); 522 | if (!empty($storeName)) { 523 | $url = "https://" . $storeName . 
$this->storeEndUrl; 524 | if ($proxy == true) { 525 | $open = $this->Curl($url, true); 526 | } else { 527 | $open = $this->Curl($url); 528 | } 529 | 530 | if (!empty(@$open["error"])) { 531 | if (!str_get_html($open)->find("div.errorPage404")) { 532 | 533 | 534 | $agentsLink = str_get_html($open)->find("div.oc-select-list ul li a"); 535 | $agentsName = str_get_html($open)->find("div.oc-select-list ul li a p"); 536 | $agentsImg = str_get_html($open)->find("div.oc-select-list ul li a img"); 537 | $agentsPhone = str_get_html($open)->find("div.oc-select-list ul li a span"); 538 | 539 | for ($a = 0; $a <= count($agentsLink) - 1; $a++) { 540 | $agentID = explode("userId=", $agentsLink[$a]->href); 541 | 542 | $this->data[] = array( 543 | "name" => trim($agentsName[$a]->plaintext), 544 | "userId" => $agentID[1], 545 | "image_200" => $agentsImg[$a]->src, 546 | "image_400" => str_replace("p200", "p400", $agentsImg[$a]->src), 547 | "phone" => trim($agentsPhone[$a]->plaintext) 548 | ); 549 | } 550 | 551 | } else { 552 | $this->data[] = array("error" => true, "store_name" => $storeName, "message" => "Sayfa Bulunamadı."); 553 | } 554 | } else { 555 | $this->data[] = array("error" => true, "store_name" => $storeName, "message" => $open["error"]); 556 | } 557 | 558 | } else { 559 | $this->data = array("error" => true, "store_name" => $storeName, "message" => "Mağaza adı bulunamadı."); 560 | } 561 | 562 | return $this->ReturnWithTypes($type); 563 | 564 | } 565 | 566 | /** 567 | * Sahibinden.com Filtrelemelerine uygun il içe isimleri ve kodlarını döndürür 568 | * 569 | * @param $il //Plaka Kodu 570 | * @param $type //Dönecek veri formatı 571 | * @param $proxy false|true 572 | * @return JSON,XML,Array 573 | */ 574 | public function TownCodes($il = NULL, $type = "json") 575 | { 576 | /* ilce.html den gelen veri ilce.json a bu şekilde aktarıldı. 
577 | $data = file_get_html("ilce.html")->find("li"); 578 | foreach ($data as $e) { 579 | $this->data[$e->attr["data-parentid"]][] = array( 580 | "il-id" => $e->attr["data-parentid"], 581 | "il-adi" => $e->attr["data-parentlabel"], 582 | "ilce-id" => $e->attr["data-id"], 583 | "ilce-adi" => $e->plaintext); 584 | } 585 | */ 586 | $ilceJson = json_decode(file_get_contents("ilce.json"), true); 587 | if ($type == "json") { 588 | if ($il != NULL) { 589 | return json_encode($ilceJson[$il]); 590 | } else { 591 | return json_encode($ilceJson); 592 | } 593 | } else if ($type == "array") { 594 | if ($il != NULL) { 595 | return $ilceJson[$il]; 596 | } else { 597 | return $ilceJson; 598 | } 599 | } else if ($type == "xml") { 600 | if ($il != NULL) { 601 | $xml = new SimpleXMLElement(''); 602 | $this->array_to_xml($ilceJson[$il], $xml); 603 | return $xml->asXML(); 604 | } else { 605 | $xml = new SimpleXMLElement(''); 606 | $this->array_to_xml($ilceJson, $xml); 607 | return $xml->asXML(); 608 | } 609 | } 610 | 611 | 612 | } 613 | 614 | 615 | /** 616 | * Detay methodu için ilanın video ve fotoğraflarını getirir 617 | * 618 | * @param $html //Detay sayfası html 619 | * @return Array 620 | */ 621 | private function getDetailMedia($html) 622 | { 623 | $images = str_get_html($html)->find("ul.classifiedDetailThumbListPages img"); 624 | $movies = str_get_html($html)->find("source#mp4"); 625 | 626 | $thumbArray = array(); 627 | $imageArray = array(); 628 | $megaArray = array(); 629 | $movieArray = array(); 630 | if (count($images) > 0) { 631 | foreach ($images as $img) { 632 | $thumbArray[] = $img->src; 633 | $imageArray[] = str_replace("thmb_", "", $img->src); 634 | $megaArray[] = str_replace("thmb_", "x16_", $img->src); 635 | 636 | } 637 | } 638 | if (count($movies) > 0) { 639 | foreach ($movies as $movie) { 640 | $movieArray[] = $movie->src; 641 | } 642 | } 643 | 644 | 645 | $return = array("thumb_images" => $thumbArray, 646 | "standart_images" => $imageArray, 647 | "mega_images" => 
$megaArray, 648 | "movies" => $movieArray); 649 | 650 | return $return; 651 | 652 | 653 | } 654 | 655 | /** 656 | * Detay methodu için ilanın özelliklerini getirir 657 | * 658 | * @param $html //Detay sayfası html 659 | * @return Array 660 | */ 661 | private function getDetailProperties($html) 662 | { 663 | $propertyTitles = str_get_html($html)->find("div#classified-detail", 0)->find("div.uiBox", 1)->find("div.classifiedDescription", 0)->find("h3"); 664 | $propertyCount = str_get_html($html)->find("div#classified-detail", 0)->find("div.uiBox", 1)->find("div.classifiedDescription", 0)->find("ul"); 665 | $propertyArray = array(); 666 | if (count($propertyCount) > 0) { 667 | 668 | for ($p = 0; $p <= count($propertyCount) - 1; $p++) { 669 | $propertyDetails = str_get_html($html)->find("div#classified-detail", 0)->find("div.uiBox", 1)->find("div.classifiedDescription", 0)->find("ul", $p)->find("li.selected"); 670 | $ppDetails = array(); 671 | if (count($propertyDetails) > 0) { 672 | for ($d = 0; $d <= count($propertyDetails) - 1; $d++) { 673 | $ppDetails[] = trim($propertyDetails[$d]->plaintext); 674 | } 675 | } 676 | 677 | $propertyArray[] = array(trim($propertyTitles[$p]->plaintext) => $ppDetails); 678 | } 679 | return $propertyArray; 680 | } else { 681 | return $propertyArray; 682 | 683 | } 684 | 685 | 686 | } 687 | 688 | /** 689 | * Detay methodu için ilanın breadcrumb'ını getirir 690 | * 691 | * @param $html //Detay sayfası html 692 | * @return Array 693 | */ 694 | private function getDetailBreadcrumb($html) 695 | { 696 | $breadCrumb = str_get_html($html)->find("div.classifiedBreadCrumb ul li"); 697 | $breadArray = array(); 698 | if (count($breadCrumb) > 0) { 699 | foreach ($breadCrumb as $bc) { 700 | $breadArray[] = trim($bc->plaintext); 701 | } 702 | } 703 | 704 | return $breadArray; 705 | 706 | 707 | } 708 | 709 | 710 | /** 711 | * Detay methodu için ilanın adresini getirir 712 | * 713 | * @param $html //Detay sayfası html 714 | * @return Array 715 | */ 716 | 
private function getDetailAddress($html)
{
    // The address is the run of links inside the info box heading: city, town, district.
    $dom = str_get_html($html);
    $heading = $dom ? $dom->find("div.classifiedInfo h2", 0) : null;
    $links = $heading ? $heading->find("a") : array();

    $address = array();
    foreach (array("city", "town", "district") as $position => $label) {
        // Missing parts become empty strings instead of raising on a null node.
        $address[$label] = isset($links[$position]) ? trim($links[$position]->plaintext) : "";
    }

    return $address;
}


/**
 * Returns the map coordinates of a listing for the Detay method.
 *
 * @param $html // detail page HTML
 * @return Array keys: latitude, longitude (empty strings when the page has no map)
 */
private function getDetailCoordinates($html)
{
    $dom = str_get_html($html);
    $map = $dom ? $dom->find("div#gmap", 0) : null;
    $attributes = $map ? $map->attr : array();

    return array(
        "latitude" => isset($attributes["data-lat"]) ? trim($attributes["data-lat"]) : "",
        "longitude" => isset($attributes["data-lon"]) ? trim($attributes["data-lon"]) : "",
    );
}


/**
 * Returns the price text of a listing for the Detay method.
 *
 * @param $html // detail page HTML
 * @return string price text, or "" when the price heading is missing
 */
private function getDetailPrice($html)
{
    $dom = str_get_html($html);
    $price = $dom ? $dom->find("div.classifiedInfo h3", 0) : null;
    if (!$price) {
        return "";
    }

    $text = $price->plaintext;

    // The heading may embed a link; its label is removed from the price text.
    $link = $dom->find("div.classifiedInfo h3 a", 0);
    if ($link && $link->plaintext !== "") {
        $text = str_replace($link->plaintext, "", $text);
    }

    return trim($text);
}

/**
 * Returns the seller information of a listing for the Detay method.
 *
 * @param $html // detail page HTML
 * @return Array keys: name, store_link, image, phones (phones is null when none are listed)
 */
private function getDetailSeller($html)
{
    $empty = array("name" => "", "store_link" => null, "image" => null, "phones" => null);

    $dom = str_get_html($html);
    if (!$dom) {
        return $empty;
    }

    $sellerName = $dom->find("div.classifiedUserContent h5", 0);
    $sellerStore = $dom->find("a.userClassifieds", 0);
    $sellerImg = $dom->find("div.classifiedUserContent a img", 0);
    $phoneTitles = $dom->find("ul#phoneInfoPart li strong");
    $phoneTexts = $dom->find("ul#phoneInfoPart li span.pretty-phone-part");

    // Titles and numbers are matched by position; a title without a number gets "".
    $phones = array();
    foreach ($phoneTitles as $index => $title) {
        $phones[] = array(
            "title" => trim($title->plaintext),
            "text" => isset($phoneTexts[$index]) ? trim($phoneTexts[$index]->plaintext) : "",
        );
    }

    return array(
        "name" => $sellerName ? trim($sellerName->plaintext) : "",
        "store_link" => $sellerStore ? $sellerStore->href : null,
        "image" => $sellerImg ? $sellerImg->src : null,
        // Kept as null (not an empty list) when no phone is listed, as before.
        "phones" => count($phones) > 0 ? $phones : null,
    );
}

/**
 * Returns the description of a listing for the Detay method.
 *
 * @param $html // detail page HTML
 * @return Array keys: text (plain text), base64 (base64-encoded inner HTML)
 */
private function getDetailDescription($html)
{
    $dom = str_get_html($html);
    $description = $dom ? $dom->find("div.classifiedDescription", 0) : null;
    if (!$description) {
        return array("text" => "", "base64" => "");
    }

    return array(
        "text" => trim($description->plaintext),
        "base64" => base64_encode($description->innertext),
    );
}

/**
 * Returns the title/value rows of the info list of a listing for the Detay method.
 *
 * @param $html // detail page HTML
 * @return Array list of arrays with keys title and text
 */
private function getDetailInfo($html)
{
    $infoArray = array();

    $dom = str_get_html($html);
    if (!$dom) {
        return $infoArray;
    }

    $titles = $dom->find("ul.classifiedInfoList li strong");
    $texts = $dom->find("ul.classifiedInfoList li span");

    // Rows are matched by position; a title without a value gets "".
    foreach ($titles as $index => $title) {
        $infoArray[] = array(
            "title" => trim($title->plaintext),
            "text" => isset($texts[$index]) ? trim($texts[$index]->plaintext) : "",
        );
    }

    return $infoArray;
}

/**
 * Returns $this->data in the requested output format.
 *
 * @param $type // output format: "json" (also used when empty), "array" or "xml"
 * @return JSON,XML,Array null for any other value of $type
 */
private function ReturnWithTypes($type = "json")
{
    if ($type == "json" or empty($type)) {
        return json_encode($this->data);
    }

    if ($type == "array") {
        return $this->data;
    }

    if ($type == "xml") {
        // SimpleXMLElement needs a well-formed document to start from; an empty
        // string throws. NOTE(review): the root element name is a choice made
        // here - confirm it against what consumers of the XML output expect.
        $xml = new SimpleXMLElement('<?xml version="1.0" encoding="UTF-8"?><root></root>');
        $this->array_to_xml($this->data, $xml);
        return $xml->asXML();
    }

    return null;
}


/**
 * Builds a URL slug: Turkish letters become ASCII, punctuation is dropped or
 * turned into "-", whitespace becomes "-", and the result is lower-cased.
 *
 * @param $s
 * @return string
 */
private function turkishChars($s)
{
    $tr = array('ş', 'Ş', 'ı', 'I', 'İ', 'ğ', 'Ğ', 'ü', 'Ü', 'ö', 'Ö', 'Ç', 'ç', '(', ')', '/', ':', ',', '&', '"', "“", "”");
    // One entry shorter than $tr on purpose or by accident: str_replace() uses ""
    // for search items that have no replacement, so the last search item is removed.
    $eng = array('s', 's', 'i', 'i', 'i', 'g', 'g', 'u', 'u', 'o', 'o', 'c', 'c', '', '', '-', '-', '', "", "", "");
    $s = str_replace($tr, $eng, $s);
    $s = strtolower($s);
    // Every "&" was already removed above, so this pattern cannot match any more.
    $s = preg_replace('/&amp;amp;amp;amp;amp;amp;amp;amp;.+?;/', '', $s);
    $s = preg_replace('/\s+/', '-', $s);
    $s = preg_replace('|-+|', '-', $s);
    $s = preg_replace('/#/', '', $s);
    $s = trim($s, '-');
    return $s;
}

/**
 * Collapses runs of whitespace into a single space and trims both ends.
 *
 * @param $string
 * @return string
 */
private function replaceSpace($string)
{
    return trim(preg_replace("/\s+/", " ", $string));
}

/**
 * Fetches a URL with cURL, optionally through a random proxy from proxy.json.
 *
 * @param $url
 * @param bool $proxy // true to route the request through a proxy from proxy.json
 * @return mixed response body with newlines and tabs removed, or array("error" => ...) on failure
 */
private function Curl($url, $proxy = false)
{
    $options = array(
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_HEADER => false,
        CURLOPT_ENCODING => "",
        CURLOPT_AUTOREFERER => true,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_CONNECTTIMEOUT => 30,
        CURLOPT_TIMEOUT => 30,
        CURLOPT_MAXREDIRS => 10,
        // NOTE(review): certificate verification is disabled; left unchanged here.
        CURLOPT_SSL_VERIFYPEER => false,
    );

    if ($proxy == true) {
        $proxyList = json_decode(file_get_contents("proxy.json"), true);
        // The option has to be stored under the CURLOPT_PROXY key itself;
        // array_push() appended a nested array under an unrelated integer key,
        // so the proxy was never applied. An unreadable or empty list means no proxy.
        if (is_array($proxyList) && count($proxyList) > 0) {
            $options[CURLOPT_PROXY] = $proxyList[array_rand($proxyList)];
        }
    }

    $ch = curl_init($url);
    curl_setopt_array($ch, $options);
    $content = curl_exec($ch);
    $err = curl_errno($ch);
    $errmsg = curl_error($ch);
    curl_close($ch);

    if (!empty($errmsg)) {
        return array("error" => "code: " . $err . " message:" . $errmsg);
    }

    // Flatten the markup to one line; "" rather than NULL as the replacement,
    // because passing null to str_replace() is deprecated.
    return str_replace(array("\n", "\r", "\t"), "", $content);
}

/**
 * Recursively appends a PHP array to a SimpleXMLElement.
 *
 * Numeric keys are written as <item> elements, for nested arrays and for
 * scalar values alike, because a number is not a valid XML element name.
 *
 * @param $array // data to convert
 * @param $xml_user_info // SimpleXMLElement the children are added to
 * @return void
 */
private function array_to_xml($array, &$xml_user_info)
{
    foreach ($array as $key => $value) {
        $name = is_numeric($key) ? "item" : "$key";

        if (is_array($value)) {
            $subnode = $xml_user_info->addChild($name);
            $this->array_to_xml($value, $subnode);
        } else {
            // addChild() does not escape "&", so the value is escaped first.
            $xml_user_info->addChild($name, htmlspecialchars("$value"));
        }
    }
}


}

--------------------------------------------------------------------------------
/class/simple_html_dom.php:
--------------------------------------------------------------------------------
size is the "real" number of bytes the dom was created from.
 * but for most purposes, it's a really good estimation.
 * Paperg - Added the forceTagsClosed to the dom constructor. Forcing tags closed is great for malformed html, but it CAN lead to parsing errors.
 * Allow the user to tell us how much they trust the html.
 * Paperg add the text and plaintext to the selectors for the find syntax. plaintext implies text in the innertext of a node. text implies that the tag is a text node.
 * This allows for us to find tags based on the text they contain.
23 | * Create find_ancestor_tag to see if a tag is - at any level - inside of another specific tag. 24 | * Paperg: added parse_charset so that we know about the character set of the source document. 25 | * NOTE: If the user's system has a routine called get_last_retrieve_url_contents_content_type available, we will assume it's returning the content-type header from the 26 | * last transfer or curl_exec, and we will parse that and use it in preference to any other method of charset detection. 27 | * 28 | * Found infinite loop in the case of broken html in restore_noise. Rewrote to protect from that. 29 | * PaperG (John Schlick) Added get_display_size for "IMG" tags. 30 | * 31 | * Licensed under The MIT License 32 | * Redistributions of files must retain the above copyright notice. 33 | * 34 | * @author S.C. Chen 35 | * @author John Schlick 36 | * @author Rus Carroll 37 | * @version 1.5 ($Rev: 210 $) 38 | * @package PlaceLocalInclude 39 | * @subpackage simple_html_dom 40 | */ 41 | 42 | /** 43 | * All of the Defines for the classes below. 44 | * @author S.C.
Chen
 */
define('HDOM_TYPE_ELEMENT', 1);
define('HDOM_TYPE_COMMENT', 2);
define('HDOM_TYPE_TEXT', 3);
define('HDOM_TYPE_ENDTAG', 4);
define('HDOM_TYPE_ROOT', 5);
define('HDOM_TYPE_UNKNOWN', 6);
define('HDOM_QUOTE_DOUBLE', 0);
define('HDOM_QUOTE_SINGLE', 1);
define('HDOM_QUOTE_NO', 3);
define('HDOM_INFO_BEGIN', 0);
define('HDOM_INFO_END', 1);
define('HDOM_INFO_QUOTE', 2);
define('HDOM_INFO_SPACE', 3);
define('HDOM_INFO_TEXT', 4);
define('HDOM_INFO_INNER', 5);
define('HDOM_INFO_OUTER', 6);
define('HDOM_INFO_ENDSPACE',7);
define('DEFAULT_TARGET_CHARSET', 'UTF-8');
define('DEFAULT_BR_TEXT', "\r\n");
define('DEFAULT_SPAN_TEXT', " ");
define('MAX_FILE_SIZE', 600000);
// helper functions
// -----------------------------------------------------------------------------
// get html dom from file
// Returns the loaded dom, or false when nothing could be read or the content is
// larger than MAX_FILE_SIZE.
// $offset now defaults to 0 and negative values are treated as 0: since PHP 7.1
// file_get_contents() interprets a negative offset as "count from the end of the
// stream", so the old default of -1 no longer means "from the start".
// $maxLen is accepted for signature compatibility but is not used.
function file_get_html($url, $use_include_path = false, $context=null, $offset = 0, $maxLen=-1, $lowercase = true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    // We DO force the tags to be terminated.
    $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);
    if ($offset < 0)
    {
        // Callers that still pass the historical -1 get the whole document.
        $offset = 0;
    }
    // For sourceforge users: uncomment the next line and comment the retrieve_url_contents line 2 lines down if it is not already done.
    $contents = file_get_contents($url, $use_include_path, $context, $offset);
    // Paperg - use our own mechanism for getting the contents as we want to control the timeout.
    //$contents = retrieve_url_contents($url);
    if (empty($contents) || strlen($contents) > MAX_FILE_SIZE)
    {
        return false;
    }
    // The second parameter can force the selectors to all be lowercase.
    $dom->load($contents, $lowercase, $stripRN);
    return $dom;
}

// get html dom from string
// Returns the loaded dom, or false when $str is empty or larger than MAX_FILE_SIZE.
function str_get_html($str, $lowercase=true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);
    if (empty($str) || strlen($str) > MAX_FILE_SIZE)
    {
        $dom->clear();
        return false;
    }
    $dom->load($str, $lowercase, $stripRN);
    return $dom;
}

// dump html dom tree
// Forwards $show_attr and $deep to the node's dump(); the node itself used to be
// passed as $show_attr and both arguments were ignored.
function dump_html_tree($node, $show_attr=true, $deep=0)
{
    $node->dump($show_attr, $deep);
}


/**
 * simple html dom node
 * PaperG - added ability for "find" routine to lowercase the value of the selector.
 * PaperG - added $tag_start to track the start position of the tag in the total byte index
 *
 * @package PlaceLocalInclude
 */
class simple_html_dom_node
{
    public $nodetype = HDOM_TYPE_TEXT;
    public $tag = 'text';
    public $attr = array();
    public $children = array();
    public $nodes = array();
    public $parent = null;
    // The "info" array - see HDOM_INFO_... for what each element contains.
    public $_ = array();
    public $tag_start = 0;
    private $dom = null;

    // Registers the new node in the owning dom's node list.
    function __construct($dom)
    {
        $this->dom = $dom;
        $dom->nodes[] = $this;
    }

    function __destruct()
    {
        $this->clear();
    }

    function __toString()
    {
        return $this->outertext();
    }

    // clean up memory due to php5 circular references memory leak...
function clear()
{
    $this->dom = null;
    $this->nodes = null;
    $this->parent = null;
    $this->children = null;
}

// dump node's tree
function dump($show_attr=true, $deep=0)
{
    $lead = str_repeat(' ', $deep);

    echo $lead.$this->tag;
    if ($show_attr && count($this->attr)>0)
    {
        echo '(';
        foreach ($this->attr as $k=>$v)
            echo "[$k]=>\"".$this->$k.'", ';
        echo ')';
    }
    echo "\n";

    if ($this->nodes)
    {
        foreach ($this->nodes as $c)
        {
            $c->dump($show_attr, $deep+1);
        }
    }
}


// Debugging function to dump a single dom node with a bunch of information about it.
// Echoes the description when $echo is true, otherwise returns it as a string.
function dump_node($echo=true)
{

    $string = $this->tag;
    if (count($this->attr)>0)
    {
        $string .= '(';
        foreach ($this->attr as $k=>$v)
        {
            $string .= "[$k]=>\"".$this->$k.'", ';
        }
        $string .= ')';
    }
    if (count($this->_)>0)
    {
        $string .= ' $_ (';
        foreach ($this->_ as $k=>$v)
        {
            if (is_array($v))
            {
                $string .= "[$k]=>(";
                foreach ($v as $k2=>$v2)
                {
                    $string .= "[$k2]=>\"".$v2.'", ';
                }
                $string .= ")";
            } else {
                $string .= "[$k]=>\"".$v.'", ';
            }
        }
        $string .= ")";
    }

    if (isset($this->text))
    {
        $string .= " text: (" . $this->text . ")";
    }

    $string .= " HDOM_INNER_INFO: '";
    // Read the info array of this node; the previous code looked at an undefined
    // local $node, so this branch could never be taken.
    if (isset($this->_[HDOM_INFO_INNER]))
    {
        $string .= $this->_[HDOM_INFO_INNER] . "'";
    }
    else
    {
        $string .= ' NULL ';
    }

    $string .= " children: " . count($this->children);
    $string .= " nodes: " . count($this->nodes);
    $string .= " tag_start: " . $this->tag_start;
    $string .= "\n";

    if ($echo)
    {
        echo $string;
        return;
    }
    else
    {
        return $string;
    }
}

// returns the parent of node
// If a node is passed in, it will reset the parent of the current node to that one.
function parent($parent=null)
{
    // I am SURE that this doesn't work properly.
    // It fails to unset the current node from it's current parents nodes or children list first.
    if ($parent !== null)
    {
        $this->parent = $parent;
        $this->parent->nodes[] = $this;
        $this->parent->children[] = $this;
    }

    return $this->parent;
}

// verify that node has children
function has_child()
{
    return !empty($this->children);
}

// returns children of node
// With the default index the whole list is returned; otherwise the child at $idx or null.
function children($idx=-1)
{
    if ($idx===-1)
    {
        return $this->children;
    }
    if (isset($this->children[$idx]))
    {
        return $this->children[$idx];
    }
    return null;
}

// returns the first child of node
function first_child()
{
    if (count($this->children)>0)
    {
        return $this->children[0];
    }
    return null;
}

// returns the last child of node
function last_child()
{
    if (($count=count($this->children))>0)
    {
        return $this->children[$count-1];
    }
    return null;
}

// returns the next sibling of node
function next_sibling()
{
    if ($this->parent===null)
    {
        return null;
    }

    $idx = 0;
    $count = count($this->parent->children);
    while ($idx<$count && $this!==$this->parent->children[$idx])
    {
        ++$idx;
    }
    if (++$idx>=$count)
    {
        return null;
    }
    return $this->parent->children[$idx];
}

// returns the previous sibling of node
function prev_sibling()
{
    if ($this->parent===null) return null;
    $idx = 0;
    $count = count($this->parent->children);
    while ($idx<$count && $this!==$this->parent->children[$idx])
        ++$idx;
    if (--$idx<0) return null;
    return $this->parent->children[$idx];
}

// function to locate a specific ancestor tag in the path to the root.
// Returns the matching node (possibly this node itself) or null when none matches.
function find_ancestor_tag($tag)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

    // Start by including ourselves in the comparison.
    $returnDom = $this;

    while (!is_null($returnDom))
    {
        if (is_object($debug_object)) { $debug_object->debug_log(2, "Current tag is: " . $returnDom->tag); }

        if ($returnDom->tag == $tag)
        {
            break;
        }
        $returnDom = $returnDom->parent;
    }
    return $returnDom;
}

// get dom node's inner html
function innertext()
{
    if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER];
    if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);

    $ret = '';
    foreach ($this->nodes as $n)
        $ret .= $n->outertext();
    return $ret;
}

// get dom node's outer text (with tag)
function outertext()
{
    global $debug_object;
    if (is_object($debug_object))
    {
        $text = '';
        if ($this->tag == 'text')
        {
            if (!empty($this->text))
            {
                $text = " with text: " . $this->text;
            }
        }
        $debug_object->debug_log(1, 'Innertext of tag: ' . $this->tag . $text);
    }

    if ($this->tag==='root') return $this->innertext();

    // trigger callback
    if ($this->dom && $this->dom->callback!==null)
    {
        call_user_func_array($this->dom->callback, array($this));
    }

    if (isset($this->_[HDOM_INFO_OUTER])) return $this->_[HDOM_INFO_OUTER];
    if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);

    // render begin tag
    if ($this->dom && $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]])
    {
        $ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();
    } else {
        $ret = "";
    }

    // render inner text
    if (isset($this->_[HDOM_INFO_INNER]))
    {
        // If it's a br tag... don't return the HDOM_INNER_INFO that we may or may not have added.
        if ($this->tag != "br")
        {
            $ret .= $this->_[HDOM_INFO_INNER];
        }
    } else {
        if ($this->nodes)
        {
            foreach ($this->nodes as $n)
            {
                $ret .= $this->convert_text($n->outertext());
            }
        }
    }

    // render end tag
    // The closing-tag literal had lost its markup in this copy of the file and
    // no longer parsed; it is rebuilt from the node's tag name.
    if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END]!=0)
        $ret .= '</'.$this->tag.'>';
    return $ret;
}

// get dom node's plain text
function text()
{
    if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER];
    switch ($this->nodetype)
    {
        case HDOM_TYPE_TEXT: return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
        case HDOM_TYPE_COMMENT: return '';
        case HDOM_TYPE_UNKNOWN: return '';
    }
    if (strcasecmp($this->tag, 'script')===0) return '';
    if (strcasecmp($this->tag, 'style')===0) return '';

    $ret = '';
    // In rare cases, (always node type 1 or HDOM_TYPE_ELEMENT - observed for some span tags, and some p tags) $this->nodes is set to NULL.
    // NOTE: This indicates that there is a problem where it's set to NULL without a clear happening.
    // WHY is this happening?
    if (!is_null($this->nodes))
    {
        foreach ($this->nodes as $n)
        {
            $ret .= $this->convert_text($n->text());
        }

        // If this node is a span... add a space at the end of it so multiple spans don't run into each other. This is plaintext after all.
        if ($this->tag == "span")
        {
            $ret .= $this->dom->default_span_text;
        }


    }
    return $ret;
}

// get dom node's inner html with the CDATA wrapper removed
// NOTE(review): the two marker strings were stripped from this copy of the file;
// they are restored here as the CDATA open/close markers - confirm against the
// simple_html_dom 1.5 release.
function xmltext()
{
    $ret = $this->innertext();
    $ret = str_ireplace('<![CDATA[', '', $ret);
    $ret = str_replace(']]>', '', $ret);
    return $ret;
}

// build node's text with tag
function makeup()
{
    // text, comment, unknown
    if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);

    $ret = '<'.$this->tag;
    $i = -1;

    foreach ($this->attr as $key=>$val)
    {
        ++$i;

        // skip removed attribute
        if ($val===null || $val===false)
            continue;

        $ret .= $this->_[HDOM_INFO_SPACE][$i][0];
        //no value attr: nowrap, checked selected...
        if ($val===true)
            $ret .= $key;
        else {
            switch ($this->_[HDOM_INFO_QUOTE][$i])
            {
                case HDOM_QUOTE_DOUBLE: $quote = '"'; break;
                case HDOM_QUOTE_SINGLE: $quote = '\''; break;
                default: $quote = '';
            }
            $ret .= $key.$this->_[HDOM_INFO_SPACE][$i][1].'='.$this->_[HDOM_INFO_SPACE][$i][2].$quote.$val.$quote;
        }
    }
    $ret = $this->dom->restore_noise($ret);
    return $ret . $this->_[HDOM_INFO_ENDSPACE] . '>';
}

// find elements by css selector
//PaperG - added ability for find to lowercase the value of the selector.
function find($selector, $idx=null, $lowercase=false)
{
    // Each comma-separated group of the selector is resolved on its own; the
    // matches of all groups are then merged, keyed by node index.
    $groups = $this->parse_selector($selector);
    if (count($groups)===0)
    {
        return array();
    }

    $matched = array();

    foreach ($groups as $chain)
    {
        // The change on the below line was documented on the sourceforge code tracker id 2788009
        if (count($chain)===0)
        {
            return array();
        }
        if (!isset($this->_[HDOM_INFO_BEGIN]))
        {
            return array();
        }

        // Start from this node and narrow down one descendant level at a time (no recursion).
        $current = array($this->_[HDOM_INFO_BEGIN]=>1);
        foreach ($chain as $step)
        {
            $next = array();
            foreach ($current as $nodeKey=>$unused)
            {
                $start = ($nodeKey===-1) ? $this->dom->root : $this->dom->nodes[$nodeKey];
                //PaperG - Pass this optional parameter on to the seek function.
                $start->seek($step, $next, $lowercase);
            }
            $current = $next;
        }

        foreach ($current as $nodeKey=>$unused)
        {
            $matched[$nodeKey] = 1;
        }
    }

    // Results are returned in node-index order.
    ksort($matched);

    $found = array();
    foreach (array_keys($matched) as $nodeKey)
    {
        $found[] = $this->dom->nodes[$nodeKey];
    }

    // return nth-element or array
    if (is_null($idx))
    {
        return $found;
    }
    if ($idx<0)
    {
        // A negative index counts from the end of the result list.
        $idx = count($found) + $idx;
    }
    return isset($found[$idx]) ? $found[$idx] : null;
}

// seek for given conditions
// PaperG - added parameter to allow for case insensitive testing of the value of a selector.
// Collects into $ret (keyed by node index) the nodes below this one that satisfy a
// single parsed selector step: array(tag, key, val, exp, no_key).
protected function seek($selector, &$ret, $lowercase=false)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

    list($tag, $key, $val, $exp, $no_key) = $selector;

    // xpath index
    if ($tag && $key && is_numeric($key))
    {
        $count = 0;
        foreach ($this->children as $c)
        {
            if ($tag==='*' || $tag===$c->tag) {
                if (++$count==$key) {
                    $ret[$c->_[HDOM_INFO_BEGIN]] = 1;
                    return;
                }
            }
        }
        return;
    }

    // Work out the index at which this node's subtree ends; when the node has no
    // end info of its own, climb the ancestors until one does.
    $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
    if ($end==0) {
        $parent = $this->parent;
        // NOTE(review): the null test comes after the property access, and the line
        // after the loop reads $parent even when the loop ended on null - confirm
        // whether a node without any ancestor end info can reach this point.
        while (!isset($parent->_[HDOM_INFO_END]) && $parent!==null) {
            $end -= 1;
            $parent = $parent->parent;
        }
        $end += $parent->_[HDOM_INFO_END];
    }

    for ($i=$this->_[HDOM_INFO_BEGIN]+1; $i<$end; ++$i) {
        $node = $this->dom->nodes[$i];

        $pass = true;

        // A bare "*" step matches direct children only.
        if ($tag==='*' && !$key) {
            if (in_array($node, $this->children, true))
                $ret[$i] = 1;
            continue;
        }

        // compare tag
        if ($tag && $tag!=$node->tag && $tag!=='*') {$pass=false;}
        // compare key
        if ($pass && $key) {
            if ($no_key) {
                if (isset($node->attr[$key])) $pass=false;
            } else {
                if (($key != "plaintext") && !isset($node->attr[$key])) $pass=false;
            }
        }
        // compare value
        if ($pass && $key && $val && $val!=='*') {
            // If they have told us that this is a "plaintext" search then we want the plaintext of the node - right?
            if ($key == "plaintext") {
                // $node->plaintext actually returns $node->text();
                $nodeKeyValue = $node->text();
            } else {
                // this is a normal search, we want the value of that attribute of the tag.
                $nodeKeyValue = $node->attr[$key];
            }
            if (is_object($debug_object)) {$debug_object->debug_log(2, "testing node: " . $node->tag . " for attribute: " . $key . $exp . $val . " where nodes value is: " . $nodeKeyValue);}

            //PaperG - If lowercase is set, do a case insensitive test of the value of the selector.
            if ($lowercase) {
                $check = $this->match($exp, strtolower($val), strtolower($nodeKeyValue));
            } else {
                $check = $this->match($exp, $val, $nodeKeyValue);
            }
            if (is_object($debug_object)) {$debug_object->debug_log(2, "after match: " . ($check ? "true" : "false"));}

            // handle multiple class
            if (!$check && strcasecmp($key, 'class')===0) {
                foreach (explode(' ',$node->attr[$key]) as $k) {
                    // Without this, there were cases where leading, trailing, or double spaces lead to our comparing blanks - bad form.
                    if (!empty($k)) {
                        if ($lowercase) {
                            $check = $this->match($exp, strtolower($val), strtolower($k));
                        } else {
                            $check = $this->match($exp, $val, $k);
                        }
                        if ($check) break;
                    }
                }
            }
            if (!$check) $pass = false;
        }
        if ($pass) $ret[$i] = 1;
        unset($node);
    }
    // It's passed by reference so this is actually what this function returns.
    if (is_object($debug_object)) {$debug_object->debug_log(1, "EXIT - ret: ", $ret);}
}

// Tests $value against $pattern using the selector operator $exp
// (=, !=, ^=, $=, *=); any other operator yields false.
protected function match($exp, $pattern, $value) {
    global $debug_object;
    if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}

    switch ($exp) {
        case '=':
            return ($value===$pattern);
        case '!=':
            return ($value!==$pattern);
        case '^=':
            return preg_match("/^".preg_quote($pattern,'/')."/", $value);
        case '$=':
            return preg_match("/".preg_quote($pattern,'/')."$/", $value);
        case '*=':
            // A pattern that starts with "/" is used as a complete regular expression;
            // anything else is wrapped, unquoted, in a case-insensitive one.
            if ($pattern[0]=='/') {
                return preg_match($pattern, $value);
            }
            return preg_match("/".$pattern."/i", $value);
    }
    return false;
}

// Parses a selector string into a list of groups (split on ","), each group being
// a list of steps of the form array(tag, key, val, exp, no_key).
protected function parse_selector($selector_string) {
    global $debug_object;
    if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}

    // pattern of CSS selectors, modified from mootools
    // Paperg: Add the colon to the attribute, so that it properly finds <tag attr:ibute="something" > like google does.
    // Note: if you try to look at this attribute, you MUST use getAttribute since $dom->x:y will fail the php syntax check.
    // Notice the \[ starting the attribute? and the @? following? This implies that an attribute can begin with an @ sign that is not captured.
    // This implies that an html attribute specifier may start with an @ sign that is NOT captured by the expression.
    // further study is required to determine if this should be documented or removed.
    // $pattern = "/([\w-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is";
    $pattern = "/([\w-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-:]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is";
    preg_match_all($pattern, trim($selector_string).' ', $matches, PREG_SET_ORDER);
    if (is_object($debug_object)) {$debug_object->debug_log(2, "Matches Array: ", $matches);}

    $selectors = array();
    $result = array();
    //print_r($matches);

    foreach ($matches as $m) {
        $m[0] = trim($m[0]);
        if ($m[0]==='' || $m[0]==='/' || $m[0]==='//') continue;
        // for browser generated xpath
        if ($m[1]==='tbody') continue;

        list($tag, $key, $val, $exp, $no_key) = array($m[1], null, null, '=', false);
        if (!empty($m[2])) {$key='id'; $val=$m[2];}
        if (!empty($m[3])) {$key='class'; $val=$m[3];}
        if (!empty($m[4])) {$key=$m[4];}
        if (!empty($m[5])) {$exp=$m[5];}
        if (!empty($m[6])) {$val=$m[6];}

        // convert to lowercase
        if ($this->dom->lowercase) {$tag=strtolower($tag); $key=strtolower($key);}
        //elements that do NOT have the specified attribute
        if (isset($key[0]) && $key[0]==='!') {$key=substr($key, 1); $no_key=true;}

        $result[] = array($tag, $key, $val, $exp, $no_key);
        // A comma closes the current group and starts a new one.
        if (trim($m[7])===',') {
            $selectors[] = $result;
            $result = array();
        }
    }
    if (count($result)>0)
        $selectors[] = $result;
    return $selectors;
}

// Magic read access: an existing attribute is returned charset-converted; the
// names outertext, innertext, plaintext and xmltext map to the matching methods;
// anything else yields whether the attribute key exists.
function __get($name)
{
    if (isset($this->attr[$name]))
    {
        return $this->convert_text($this->attr[$name]);
    }
    switch ($name)
    {
        case 'outertext': return $this->outertext();
        case 'innertext': return $this->innertext();
        case 'plaintext': return $this->text();
        case 'xmltext': return $this->xmltext();
        default: return array_key_exists($name, $this->attr);
    }
}

// Magic write access: outertext and innertext overwrite the cached markup of the
// node; any other name sets an attribute, recording default spacing and quoting
// info for attributes that did not exist before.
function __set($name, $value)
{
    global $debug_object;
    if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}

    switch ($name)
    {
        case 'outertext': return $this->_[HDOM_INFO_OUTER] = $value;
        case 'innertext':
            if (isset($this->_[HDOM_INFO_TEXT])) return $this->_[HDOM_INFO_TEXT] = $value;
            return $this->_[HDOM_INFO_INNER] = $value;
    }
    if (!isset($this->attr[$name]))
    {
        $this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
        $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
    }
    $this->attr[$name] = $value;
}

// Magic isset: the three text pseudo-properties always exist; otherwise reports
// whether the attribute key is present.
function __isset($name)
{
    switch ($name)
    {
        case 'outertext': return true;
        case 'innertext': return true;
        case 'plaintext': return true;
    }
    //no value attr: nowrap, checked selected...
    return (array_key_exists($name, $this->attr)) ? true : isset($this->attr[$name]);
}

// Magic unset: removes the attribute when it is set.
function __unset($name) {
    if (isset($this->attr[$name]))
        unset($this->attr[$name]);
}

// PaperG - Function to convert the text from one character set to another if the two sets are not the same.
function convert_text($text)
{
    global $debug_object;
    if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}

    $converted_text = $text;

    $sourceCharset = "";
    $targetCharset = "";

    if ($this->dom)
    {
        $sourceCharset = strtoupper($this->dom->_charset);
        $targetCharset = strtoupper($this->dom->_target_charset);
    }
    if (is_object($debug_object)) {$debug_object->debug_log(3, "source charset: " . $sourceCharset . " target charaset: " . $targetCharset);}

    if (!empty($sourceCharset) && !empty($targetCharset) && (strcasecmp($sourceCharset, $targetCharset) != 0))
    {
        // Check if the reported encoding could have been incorrect and the text is actually already UTF-8
        if ((strcasecmp($targetCharset, 'UTF-8') == 0) && ($this->is_utf8($text)))
        {
            $converted_text = $text;
        }
        else
        {
            $converted_text = iconv($sourceCharset, $targetCharset, $text);
        }
    }

    // Lets make sure that we don't have that silly BOM issue with any of the utf-8 text we output.
    if ($targetCharset == 'UTF-8')
    {
        if (substr($converted_text, 0, 3) == "\xef\xbb\xbf")
        {
            $converted_text = substr($converted_text, 3);
        }
        if (substr($converted_text, -3) == "\xef\xbb\xbf")
        {
            $converted_text = substr($converted_text, 0, -3);
        }
    }

    return $converted_text;
}

/**
 * Returns true if $string is valid UTF-8 and false otherwise.
828 | * 829 | * @param mixed $str String to be tested 830 | * @return boolean 831 | */ 832 | static function is_utf8($str) 833 | { 834 | $c=0; $b=0; 835 | $bits=0; 836 | $len=strlen($str); 837 | for($i=0; $i<$len; $i++) 838 | { 839 | $c=ord($str[$i]); 840 | if($c > 128) 841 | { 842 | if(($c >= 254)) return false; 843 | elseif($c >= 252) $bits=6; 844 | elseif($c >= 248) $bits=5; 845 | elseif($c >= 240) $bits=4; 846 | elseif($c >= 224) $bits=3; 847 | elseif($c >= 192) $bits=2; 848 | else return false; 849 | if(($i+$bits) > $len) return false; 850 | while($bits > 1) 851 | { 852 | $i++; 853 | $b=ord($str[$i]); 854 | if($b < 128 || $b > 191) return false; 855 | $bits--; 856 | } 857 | } 858 | } 859 | return true; 860 | } 861 | /* 862 | function is_utf8($string) 863 | { 864 | //this is buggy 865 | return (utf8_encode(utf8_decode($string)) == $string); 866 | } 867 | */ 868 | 869 | /** 870 | * Function to try a few tricks to determine the displayed size of an img on the page. 871 | * NOTE: This will ONLY work on an IMG tag. Returns FALSE on all other tag types. 872 | * 873 | * @author John Schlick 874 | * @version April 19 2012 875 | * @return array an array containing the 'height' and 'width' of the image on the page or -1 if we can't figure it out. 876 | */ 877 | function get_display_size() 878 | { 879 | global $debug_object; 880 | 881 | $width = -1; 882 | $height = -1; 883 | 884 | if ($this->tag !== 'img') 885 | { 886 | return false; 887 | } 888 | 889 | // See if there is aheight or width attribute in the tag itself. 890 | if (isset($this->attr['width'])) 891 | { 892 | $width = $this->attr['width']; 893 | } 894 | 895 | if (isset($this->attr['height'])) 896 | { 897 | $height = $this->attr['height']; 898 | } 899 | 900 | // Now look for an inline style. 901 | if (isset($this->attr['style'])) 902 | { 903 | // Thanks to user gnarf from stackoverflow for this regular expression. 
904 | $attributes = array(); 905 | preg_match_all("/([\w-]+)\s*:\s*([^;]+)\s*;?/", $this->attr['style'], $matches, PREG_SET_ORDER); 906 | foreach ($matches as $match) { 907 | $attributes[$match[1]] = $match[2]; 908 | } 909 | 910 | // If there is a width in the style attributes: 911 | if (isset($attributes['width']) && $width == -1) 912 | { 913 | // check that the last two characters are px (pixels) 914 | if (strtolower(substr($attributes['width'], -2)) == 'px') 915 | { 916 | $proposed_width = substr($attributes['width'], 0, -2); 917 | // Now make sure that it's an integer and not something stupid. 918 | if (filter_var($proposed_width, FILTER_VALIDATE_INT)) 919 | { 920 | $width = $proposed_width; 921 | } 922 | } 923 | } 924 | 925 | // If there is a width in the style attributes: 926 | if (isset($attributes['height']) && $height == -1) 927 | { 928 | // check that the last two characters are px (pixels) 929 | if (strtolower(substr($attributes['height'], -2)) == 'px') 930 | { 931 | $proposed_height = substr($attributes['height'], 0, -2); 932 | // Now make sure that it's an integer and not something stupid. 933 | if (filter_var($proposed_height, FILTER_VALIDATE_INT)) 934 | { 935 | $height = $proposed_height; 936 | } 937 | } 938 | } 939 | 940 | } 941 | 942 | // Future enhancement: 943 | // Look in the tag to see if there is a class or id specified that has a height or width attribute to it. 944 | 945 | // Far future enhancement 946 | // Look at all the parent tags of this image to see if they specify a class or id that has an img selector that specifies a height or width 947 | // Note that in this case, the class or id will have the img subselector for it to apply to the image. 948 | 949 | // ridiculously far future development 950 | // If the class or id is specified in a SEPARATE css file thats not on the page, go get it and do what we were just doing for the ones on the page. 
951 | 952 | $result = array('height' => $height, 953 | 'width' => $width); 954 | return $result; 955 | } 956 | 957 | // camel naming conventions 958 | function getAllAttributes() {return $this->attr;} 959 | function getAttribute($name) {return $this->__get($name);} 960 | function setAttribute($name, $value) {$this->__set($name, $value);} 961 | function hasAttribute($name) {return $this->__isset($name);} 962 | function removeAttribute($name) {$this->__set($name, null);} 963 | function getElementById($id) {return $this->find("#$id", 0);} 964 | function getElementsById($id, $idx=null) {return $this->find("#$id", $idx);} 965 | function getElementByTagName($name) {return $this->find($name, 0);} 966 | function getElementsByTagName($name, $idx=null) {return $this->find($name, $idx);} 967 | function parentNode() {return $this->parent();} 968 | function childNodes($idx=-1) {return $this->children($idx);} 969 | function firstChild() {return $this->first_child();} 970 | function lastChild() {return $this->last_child();} 971 | function nextSibling() {return $this->next_sibling();} 972 | function previousSibling() {return $this->prev_sibling();} 973 | function hasChildNodes() {return $this->has_child();} 974 | function nodeName() {return $this->tag;} 975 | function appendChild($node) {$node->parent($this); return $node;} 976 | 977 | } 978 | 979 | /** 980 | * simple html dom parser 981 | * Paperg - in the find routine: allow us to specify that we want case insensitive testing of the value of the selector. 982 | * Paperg - change $size from protected to public so we can easily access it 983 | * Paperg - added ForceTagsClosed in the constructor which tells us whether we trust the html or not. Default is to NOT trust it. 
984 | * 985 | * @package PlaceLocalInclude 986 | */ 987 | class simple_html_dom 988 | { 989 | public $root = null; 990 | public $nodes = array(); 991 | public $callback = null; 992 | public $lowercase = false; 993 | // Used to keep track of how large the text was when we started. 994 | public $original_size; 995 | public $size; 996 | protected $pos; 997 | protected $doc; 998 | protected $char; 999 | protected $cursor; 1000 | protected $parent; 1001 | protected $noise = array(); 1002 | protected $token_blank = " \t\r\n"; 1003 | protected $token_equal = ' =/>'; 1004 | protected $token_slash = " />\r\n\t"; 1005 | protected $token_attr = ' >'; 1006 | // Note that this is referenced by a child node, and so it needs to be public for that node to see this information. 1007 | public $_charset = ''; 1008 | public $_target_charset = ''; 1009 | protected $default_br_text = ""; 1010 | public $default_span_text = ""; 1011 | 1012 | // use isset instead of in_array, performance boost about 30%... 1013 | protected $self_closing_tags = array('img'=>1, 'br'=>1, 'input'=>1, 'meta'=>1, 'link'=>1, 'hr'=>1, 'base'=>1, 'embed'=>1, 'spacer'=>1); 1014 | protected $block_tags = array('root'=>1, 'body'=>1, 'form'=>1, 'div'=>1, 'span'=>1, 'table'=>1); 1015 | // Known sourceforge issue #2977341 1016 | // B tags that are not closed cause us to return everything to the end of the document. 
1017 | protected $optional_closing_tags = array( 1018 | 'tr'=>array('tr'=>1, 'td'=>1, 'th'=>1), 1019 | 'th'=>array('th'=>1), 1020 | 'td'=>array('td'=>1), 1021 | 'li'=>array('li'=>1), 1022 | 'dt'=>array('dt'=>1, 'dd'=>1), 1023 | 'dd'=>array('dd'=>1, 'dt'=>1), 1024 | 'dl'=>array('dd'=>1, 'dt'=>1), 1025 | 'p'=>array('p'=>1), 1026 | 'nobr'=>array('nobr'=>1), 1027 | 'b'=>array('b'=>1), 1028 | 'option'=>array('option'=>1), 1029 | ); 1030 | 1031 | function __construct($str=null, $lowercase=true, $forceTagsClosed=true, $target_charset=DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT) 1032 | { 1033 | if ($str) 1034 | { 1035 | if (preg_match("/^http:\/\//i",$str) || is_file($str)) 1036 | { 1037 | $this->load_file($str); 1038 | } 1039 | else 1040 | { 1041 | $this->load($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText); 1042 | } 1043 | } 1044 | // Forcing tags to be closed implies that we don't trust the html, but it can lead to parsing errors if we SHOULD trust the html. 1045 | if (!$forceTagsClosed) { 1046 | $this->optional_closing_array=array(); 1047 | } 1048 | $this->_target_charset = $target_charset; 1049 | } 1050 | 1051 | function __destruct() 1052 | { 1053 | $this->clear(); 1054 | } 1055 | 1056 | // load html from string 1057 | function load($str, $lowercase=true, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT) 1058 | { 1059 | global $debug_object; 1060 | 1061 | // prepare 1062 | $this->prepare($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText); 1063 | // strip out cdata 1064 | $this->remove_noise("''is", true); 1065 | // strip out comments 1066 | $this->remove_noise("''is"); 1067 | // Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037 1068 | // Script tags removal now preceeds style tag removal. 1069 | // strip out