├── _config.yml
├── .gitignore
├── proxy.json
├── index.php
├── README.md
└── class
├── sahibinden.class.php
└── simple_html_dom.php
/_config.yml:
--------------------------------------------------------------------------------
1 | theme: jekyll-theme-cayman
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 |
2 |
3 | /.idea/
4 | _config.yml
5 |
--------------------------------------------------------------------------------
/proxy.json:
--------------------------------------------------------------------------------
1 | [
2 | "193.255.1.102:8080",
3 | "178.250.90.18:8080",
4 | "88.255.225.58:8080"
5 | ]
--------------------------------------------------------------------------------
/index.php:
--------------------------------------------------------------------------------
1 | Kategori();
18 | //echo $Sahibinden->Kategori("xml","emlak",true); //Emlak kategorisindekileri proxy kullanarak XML formatında döndürür
19 |
20 | //Alt Kategoriler
21 | //@return xml,json,array
22 |
23 | //echo $Sahibinden->Kategori("json","ozel-ders-verenler");
24 | //echo $Sahibinden->Kategori("json","kiralik");
25 |
26 |
27 | //Listeler
28 | //@return xml,json,array
29 | $filters = array(
30 | //"date" => "1days", //1,3,7,15,30 //1 günlük ilanlar
31 | "address_city" => array("34"), //il plaka kodu
32 | "address_town" => array("451"), //ilçe kodu
33 | //"price_currency" => "1", //1=TL, 2=USD, 3=EUR, 4=GBP //para birimi
34 | //"price_min" => "5000", //minimum fiyat
35 | // "price_max" => "12000", //maximum fiyat
36 | // "hasVideo" => "false", //videolu ilanlar
37 | //"hasPhoto" => "true", //fotoğrafı olan ilanlar
38 | //"hasMegaPhoto" => "false", // büyük fotoğrafı olan ilanlar
39 | "sorting" => "price_asc" //sıralama price_asc, price_desc, date_asc, date_desc, address_desc, address_asc
40 | );
41 |
42 | //print_r($Sahibinden->Liste('emlak',40,$filters,"array")); // Kiralık Ev Kategorisinden filtrelere uygun 40 kaydı array formatında döndürür
43 | echo $Sahibinden->Liste('kiralik-daire',100,$filters,'json'); //Emlak Kategorisinden 20 Kaydı JSON formatında döndürür.
44 |
45 | //İl ve İlçe Kodları (Filtreleme için)
46 | //@return xml,json,array
47 | //echo $Sahibinden->TownCodes(NULL, "xml"); //Tüm il ve ilçeleri XML formatında döndürür
48 | //echo $Sahibinden->TownCodes(34); // İstanbul ilçelerini JSON formatında döndürür
49 |
50 |
51 |
52 | //İlan Detayı
53 |
54 | //echo $Sahibinden->Detay("/ilan/vasita-otomobil-lotus-lotus-cars-turkey-elise-20th-edition-398612300/detay","json");
55 |
56 |
57 | //Mağaza Bilgileri
58 | $stores = array("remaxpiramit",
59 | "vatanotomobil",
60 | "blackmotors");
61 | //echo $Sahibinden->Magaza($stores);
62 |
63 | //Mağaza Kategorilerini Alt Kategoriyle birlikte
64 | //print_r($Sahibinden->MagazaKategori("remaxpiramit",NULL,"array"));
65 | //echo $Sahibinden->MagazaKategori("remaxpiramit",NULL,"json",true);// Mağaza Kategorilerini proxy ile json formatında getirir
66 |
67 |
68 |
69 | //Mağaza İlan Listesi
70 | $filters = array(
71 | "userId" => "57127" //Birden fazla seçilemez
72 | );
73 | //echo $Sahibinden->MagazaListe("remaxpiramit",21);
74 |
75 | //Mağaza Danışman Listesi
76 |
77 | //echo $Sahibinden->MagazaDanismanlari("remaxpiramit","json");
78 |
79 |
80 |
81 |
82 |
83 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Hakkında
2 | ====================
3 |
4 |
5 | Sahibinden.com için @tayfunerbilen'in eskiden hazırlamış olduğu botu güncel hale getirdim ve yeni özellikler eklemeye devam edeceğim.
6 | Şu an kategorileri, alt kategorileri, kategori listelerini ve ilan detaylarını sorunsuz bir şekilde çekmektedir.
7 |
8 | 3 formattan dilediğinizi döndürebilirsiniz.
9 | * JSON
10 | * Array
11 | * XML
12 |
13 | Kullanımına aşağıdan bakabilirsiniz.
14 |
15 | * Default olarak json değer dönmektedir.
16 | * Proxy kullanımı istek sürenizi uzatabilir
17 |
18 | Kullanımı
19 | =
20 |
21 |
Tanımlama
22 |
23 | ```php
24 |
25 | require 'class/sahibinden.class.php';
26 | $Sahibinden = new Sahibinden();
27 |
28 | ```
29 |
30 |
31 | Ana Kategoriler
32 | @return xml,json,array
33 |
34 | ```php
35 |
36 | echo $Sahibinden->Kategori();
37 | echo $Sahibinden->Kategori("xml","emlak",true); //Emlak kategorisindekileri proxy kullanarak XML formatında döndürür
38 |
39 | ```
40 |
41 |
42 | Alt Kategoriler
43 | @return xml,json,array
44 |
45 |
46 | ```php
47 |
48 | echo $Sahibinden->Kategori("json","ozel-ders-verenler");
49 | echo $Sahibinden->Kategori("json","kiralik");
50 |
51 | ```
52 |
53 |
54 | Listeler
55 | Sahibinden'de ilan listelerinde kullanılan tüm GET parametrelerini "filters" dizisine key => value şeklinde ekleyerek filtrelemeleri yapabilirsiniz.
56 | @return xml,json,array
57 |
58 |
59 | ```php
60 |
61 | $filters = array(
62 | "date" => "1days", //1,3,7,15,30 //1 günlük ilanlar
63 | "address_city" => "34", //il plaka kodu
64 | "address_town" => "71", //ilçe kodu
65 | "price_currency" => "1", //1=TL, 2=USD, 3=EUR, 4=GBP //para birimi
66 | "price_min" => "0", //minimum fiyat
67 | "price_max" => "12000", //maximum fiyat
68 | "hasVideo" => "false", //videolu ilanlar
69 | "hasPhoto" => "true", //fotoğrafı olan ilanlar
70 | "hasMegaPhoto" => "false", // büyük fotoğrafı olan ilanlar
71 | "sorting" => "price_asc" //sıralama price_asc, price_desc, date_asc, date_desc, address_desc, address_asc
72 | );
73 |
74 | print_r($Sahibinden->Liste('kiralik',40,$filters,"array")); // Kiralık Ev Kategorisinden filtrelere uygun 40 kaydı array formatında döndürür
75 | echo $Sahibinden->Liste('emlak'); //Emlak Kategorisinden 20 Kaydı JSON formatında döndürür.
76 |
77 | ```
78 |
79 |
80 | İl ve İlçe Kodları (Filtreleme için)
81 | @return xml,json,array
82 |
83 | ```php
84 |
85 | echo $Sahibinden->TownCodes(NULL, "xml"); //Tüm il ve ilçeleri XML formatında döndürür
86 | echo $Sahibinden->TownCodes(34); // İstanbul ilçelerini JSON formatında döndürür
87 |
88 | ```
89 |
90 | İlan Detayı
91 | @return xml,json,array
92 |
93 | ```php
94 |
95 | echo $Sahibinden->Detay("/ilan/vasita-otomobil-lotus-lotus-cars-turkey-elise-20th-edition-398612300/detay","json");
96 |
97 | ```
98 |
99 |
100 |
101 | Mağazalar
102 | -
103 |
104 | Mağaza Bilgileri
105 |
106 | ```php
107 |
108 | $stores = array("remaxpiramit",
109 | "vatanotomobil",
110 | "blackmotors");
111 | echo $Sahibinden->Magaza($stores);
112 |
113 | ```
114 |
115 | Mağaza Kategorileri (Alt Kategorileri ile birlikte)
116 | @return xml,json,array
117 |
118 | ```php
119 |
120 | print_r($Sahibinden->MagazaKategori("remaxpiramit",NULL,"array"));
121 | echo $Sahibinden->MagazaKategori("remaxpiramit",NULL,"json",true);// Mağaza Kategorilerini proxy ile json formatında getirir
122 |
123 | ```
124 |
125 |
126 | Mağaza İlan Listesi
127 | @return xml,json,array
128 |
129 | ```php
130 |
131 | $filters = array(
132 | "userId" => "57127"
133 | );
134 | echo $Sahibinden->MagazaListe("remaxpiramit",20,$filters);
135 |
136 | ```
137 |
138 |
139 | Mağaza Danışman Listesi
140 | @return xml,json,array
141 |
142 | ```php
143 |
144 | echo $Sahibinden->MagazaDanismanlari("remaxpiramit","json");
145 |
146 | ```
147 |
148 |
149 |
150 |
151 |
--------------------------------------------------------------------------------
/class/sahibinden.class.php:
--------------------------------------------------------------------------------
1 | data = array();
30 |
31 | if (empty($url)) {
32 | if ($proxy == true) {
33 | $open = $this->Curl($this->baseUrl, true);
34 | } else {
35 | $open = $this->Curl($this->baseUrl);
36 | }
37 | if (!empty(@$open["error"])) {
38 | if (!str_get_html($open)->find("div.errorPage404")) {
39 |
40 |
41 | $items = str_get_html($open)->find("ul.categories-left-menu", 0)->find("li a[title]");
42 | if (count($items) > 0) {
43 | foreach ($items as $element) {
44 | $this->data[] = array("title" => trim($element->plaintext),
45 | "uri" => trim($element->href),
46 | "url" => $this->baseUrl . trim($element->href)
47 | );
48 |
49 | }
50 | } else {
51 | $this->data[] = array("error" => true, "url" => $this->baseUrl, "message" => "Sonuç Bulunamadı.");
52 |
53 | }
54 |
55 | } else {
56 | $this->data[] = array("error" => true, "url" => $url, "message" => "Sayfa Bulunamadı.");
57 | }
58 | } else {
59 | $this->data[] = array("error" => true, "url" => $url, "message" => $open["error"]);
60 | }
61 |
62 | } else {
63 | $url = $this->baseUrl . '/kategori/' . $url;
64 | if ($proxy == true) {
65 | $open = $this->Curl($url, true);
66 | } else {
67 | $open = $this->Curl($url);
68 | }
69 | if (!empty(@$open["error"])) {
70 | if (!str_get_html($open)->find("div.errorPage404")) {
71 |
72 |
73 | $items = str_get_html($open)->find("ul.categoryList", 0)->find("li a");
74 | if (count($items) > 0) {
75 | foreach ($items as $element) {
76 | $this->data[] = array("title" => trim($element->plaintext),
77 | "uri" => trim($element->href),
78 | "url" => $this->baseUrl . trim($element->href)
79 | );
80 | }
81 | } else {
82 | $this->data[] = array("error" => true, "url" => $url, "message" => "Sonuç Bulunamadı.");
83 |
84 | }
85 | } else {
86 | $this->data[] = array("error" => true, "url" => $url, "message" => "Sayfa Bulunamadı.");
87 | }
88 | } else {
89 | $this->data[] = array("error" => true, "url" => $url, "message" => $open["error"]);
90 | }
91 |
92 | }
93 |
94 |
95 | return $this->ReturnWithTypes($type);
96 |
97 | }
98 |
/**
 * Lists the classifieds of a category, requesting as many result pages as needed.
 *
 * Pages are requested in steps of 20 through the "pagingOffset" query parameter.
 * The scraped columns (links, images, prices, dates, addresses) are collected
 * across all pages and then combined into one row per classified.
 *
 * @param string     $kategoriLink Category path appended to the base URL.
 * @param int        $itemCount    Maximum number of classifieds to return.
 * @param array|null $filters      Query filters (key => value); an array value repeats the key once per element.
 * @param string     $type         Output format handed to ReturnWithTypes().
 * @param bool       $proxy        When true, Curl() is called with its second argument set to true.
 * @return mixed Whatever ReturnWithTypes($type) produces for the collected data.
 */
public function Liste($kategoriLink, $itemCount = 20, $filters = NULL, $type = "json", $proxy = false)
{
    $this->data = array();

    // Build the filter part of the query string. Keys and values are URL-encoded
    // so that characters such as "&", "=" or spaces cannot corrupt the request.
    $filterText = "";
    if (is_array($filters)) {
        foreach ($filters as $key => $val) {
            $values = is_array($val) ? $val : array($val);
            foreach ($values as $v) {
                $filterText .= "&" . urlencode((string)$key) . "=" . urlencode((string)$v);
            }
        }
    }

    $pageCount = ($itemCount > 20) ? (int)ceil($itemCount / 20) : 1;

    // Column accumulators are initialised up front; previously they only came
    // into existence when a page was parsed, so count() received null otherwise.
    $columns = array(
        "id" => array(),
        "link" => array(),
        "uri" => array(),
        "title" => array(),
        "thumb" => array(),
        "image" => array(),
        "price" => array(),
        "date" => array(),
        "address" => array(),
    );
    $resultText = "";
    $resultCount = "";
    $page = 0;
    $url = "";

    for ($p = 0; $p < $pageCount; $p++) {
        $page = $p * 20;
        $url = $this->baseUrl . "/" . $kategoriLink . '?pagingOffset=' . $page . $filterText;
        $open = ($proxy == true) ? $this->Curl($url, true) : $this->Curl($url);

        // NOTE(review): the response is parsed only when its "error" entry is
        // non-empty, which reads as inverted. Curl() is not visible here, so the
        // original condition is kept as is - confirm against Curl()'s return value.
        if (!empty($open["error"])) {
            // Parse the document once per page instead of once per selector.
            $dom = str_get_html($open);
            if ($dom && !$dom->find("div.errorPage404")) {
                foreach ($dom->find("td.searchResultsLargeThumbnail a") as $link) {
                    $href = trim($link->href);
                    $columns["link"][] = $this->baseUrl . $href;
                    $columns["uri"][] = $href;
                }
                foreach ($dom->find("td.searchResultsLargeThumbnail a img") as $image) {
                    $src = trim($image->src);
                    // The alt text is split on "#": first part is used as title, second as id.
                    $altParts = explode("#", $image->alt);
                    $columns["thumb"][] = $src;
                    $columns["image"][] = str_replace("thmb_", "", $src);
                    $columns["title"][] = trim($altParts[0]);
                    $columns["id"][] = isset($altParts[1]) ? trim($altParts[1]) : "";
                }
                foreach ($dom->find("td.searchResultsPriceValue div") as $price) {
                    $columns["price"][] = trim($price->plaintext);
                }
                foreach ($dom->find("td.searchResultsDateValue") as $date) {
                    $columns["date"][] = str_replace("\n", "", trim($date->plaintext));
                }
                foreach ($dom->find("td.searchResultsLocationValue") as $address) {
                    $columns["address"][] = str_replace("\n", "", trim($address->plaintext));
                }

                // Either summary node may be absent; keep the previous value then.
                $textNode = $dom->find("div.infoSearchResults div.result-text", 0);
                if ($textNode) {
                    $resultText = $textNode->plaintext;
                }
                $countNode = $dom->find("div.infoSearchResults div.result-text span", 1);
                if ($countNode) {
                    $resultCount = $countNode->plaintext;
                }
            } else {
                $this->data[] = array("error" => true, "url" => $url, "message" => "Sayfa Bulunamadı.");
            }
        } else {
            $this->data[] = array("error" => true, "url" => $url, "message" => $open["error"]);
        }
    }

    $found = count($columns["link"]);
    if ($found > 0) {
        $this->data["properties"] = array("count" => $itemCount,
            "resultText" => str_replace('"', "'", $resultText),
            "resultCount" => intval(str_replace(".", "", str_replace(' ilan ', "", $resultCount))),
            "filters" => $filters,
            "url" => str_replace("pagingOffset=" . $page, "", $url));

        // Never read past the rows that were actually found. The previous bound
        // overshot by one row when exactly $itemCount - 1 links were scraped.
        $limit = min($found, $itemCount);
        for ($i = 0; $i < $limit; $i++) {
            $row = array();
            foreach ($columns as $name => $values) {
                if (isset($values[$i])) {
                    $row[$name] = $values[$i];
                }
            }
            $this->data["results"][] = $row;
        }
    } else {
        $this->data[] = array("error" => true, "url" => $url, "message" => "Sonuç Bulunamadı.");
    }

    return $this->ReturnWithTypes($type);
}
210 |
211 |
/**
 * Collects the details of a single classified.
 *
 * @param string|null $uri   Path of the classified, appended to the base URL.
 * @param string      $type  Output format handed to ReturnWithTypes().
 * @param bool        $proxy When true, Curl() is called with its second argument set to true.
 * @return mixed Whatever ReturnWithTypes($type) produces.
 */
public function Detay($uri = NULL, $type = "json", $proxy = false)
{
    $this->data = array();
    $url = $this->baseUrl . $uri;

    // No path given: report "no result" without issuing a request.
    if ($uri == NULL) {
        $this->data[] = array("error" => true, "url" => $url, "message" => "Sonuç Bulunamadı.");
        return $this->ReturnWithTypes($type);
    }

    $open = ($proxy == true) ? $this->Curl($url, true) : $this->Curl($url);

    // An empty "error" entry in the Curl() result leads to an error record;
    // a non-empty one leads to parsing (condition kept exactly as it was).
    if (empty(@$open["error"])) {
        $this->data[] = array("error" => true, "url" => $url, "message" => $open["error"]);
        return $this->ReturnWithTypes($type);
    }

    if (str_get_html($open)->find("div.errorPage404")) {
        $this->data[] = array("error" => true, "url" => $url, "message" => "Sayfa Bulunamadı.");
        return $this->ReturnWithTypes($type);
    }

    $heading = str_get_html($open)->find("div.classifiedDetailTitle h1", 0);
    $this->data = array(
        "url" => $url,
        "title" => $heading->plaintext,
        "breadCrumb" => $this->getDetailBreadcrumb($open),
        "address" => $this->getDetailAddress($open),
        "price" => $this->getDetailPrice($open),
        "seller" => $this->getDetailSeller($open),
        "coordinates" => $this->getDetailCoordinates($open),
        "info" => $this->getDetailInfo($open),
        "properties" => $this->getDetailProperties($open),
        "description" => $this->getDetailDescription($open),
        "media" => $this->getDetailMedia($open)
    );

    return $this->ReturnWithTypes($type);
}
266 |
267 |
/**
 * Fetches the profile information of one or more stores.
 *
 * @param array  $storeName Store names; each is used as the host prefix of the store URL.
 * @param string $type      Output format handed to ReturnWithTypes() (default "json").
 * @param bool   $proxy     When true, Curl() is called with its second argument set to true.
 * @return mixed Whatever ReturnWithTypes($type) produces.
 */
public function Magaza($storeName, $type = "json", $proxy = false)
{
    $this->data = array();

    // Anything but an array is rejected with a single error record.
    if (!is_array($storeName)) {
        $this->data = array("error" => true, "store_name" => $storeName, "message" => "Mağaza ad(lar)ı dizi olarak giriniz.");
        return $this->ReturnWithTypes($type);
    }

    foreach ($storeName as $store) {
        if (empty($store)) {
            $this->data[] = array("error" => true, "store_name" => $store, "message" => "Mağaza adı boş olamaz");
            continue;
        }

        $address = "https://" . $store . $this->storeEndUrl;
        $response = ($proxy == true) ? $this->Curl($address, true) : $this->Curl($address);

        // An empty "error" entry leads to an error record (condition kept as it was).
        if (empty(@$response["error"])) {
            $this->data[] = array("error" => true, "store_name" => $store, "message" => $response["error"]);
            continue;
        }

        $dom = str_get_html($response);
        if ($dom->find("div.errorPage404")) {
            $this->data[] = array("error" => true, "store_name" => $store, "message" => "Sayfa Bulunamadı.");
            continue;
        }

        $logo = $dom->find("div.information-area div", 0)->find("a img", 0);
        $phone = $dom->find("div.information-area div", 0)->find("p", 0);
        $cover = $dom->find("div.theme", 0)->find("img", 0);
        $adCount = $dom->find("div.classified-count", 0)->find("strong", 0);
        $about = $dom->find("div.about", 0)->find("h2", 0);

        $this->data[] = array(
            "store_name" => $store,
            "title" => trim($logo->alt),
            "about" => trim($about->plaintext),
            "profile-img" => $logo->src,
            "cover-img" => $cover->src,
            "phone" => trim(@$phone->plaintext),
            "ad-count" => intval(trim($adCount->plaintext)),
        );
    }

    return $this->ReturnWithTypes($type);
}
325 |
326 |
/**
 * Lists the main and sub categories of a store.
 *
 * @param string      $storeName Store name, used as the host prefix of the store URL.
 * @param string|null $kategori  Optional category path appended to the store URL.
 * @param string      $type      Output format handed to ReturnWithTypes() (default "json").
 * @param bool        $proxy     When true, Curl() is called with its second argument set to true.
 * @return mixed Whatever ReturnWithTypes($type) produces.
 */
public function MagazaKategori($storeName, $kategori = NULL, $type = "json", $proxy = false)
{
    // Start from an empty result set like every other public method does;
    // without this, categories from a previous call leaked into the output.
    $this->data = array();

    if (empty($storeName)) {
        $this->data[] = array("error" => true, "store_name" => $storeName, "message" => "Mağaza adı giriniz.");
        return $this->ReturnWithTypes($type);
    }

    $url = "https://" . $storeName . $this->storeEndUrl;
    if ($kategori != NULL) {
        $url .= "/" . $kategori;
    }

    $open = ($proxy == true) ? $this->Curl($url, true) : $this->Curl($url);

    // NOTE(review): the response is parsed only when its "error" entry is
    // non-empty, which reads as inverted. Curl() is not visible here, so the
    // original condition is kept - confirm against Curl()'s return value.
    if (!empty($open["error"])) {
        // Parse once; the same document serves all eleven level selectors.
        $dom = str_get_html($open);
        if ($dom && !$dom->find("div.errorPage404")) {

            for ($x = 0; $x <= 10; $x++) {
                foreach ($dom->find("div.categories ul li.level" . $x) as $u) {
                    // Reset per list item so an item without anchors cannot
                    // re-use the category built for the previous item.
                    $cats = NULL;
                    foreach ($u->find("a") as $c) {
                        $uri = explode("?", str_replace("/", "", $c->href));
                        // When an item holds several anchors the last one wins.
                        $cats = array(
                            "title" => trim($c->plaintext),
                            "uri" => $uri[0],
                            "is_current_category" => $uri[0] == $kategori ? true : false,
                            "url" => "https://" . $storeName . $this->storeEndUrl . "/" . $uri[0],
                            "sub_categories" => NULL
                        );
                    }
                    if ($cats === NULL) {
                        continue;
                    }

                    // NOTE(review): $level is derived from the class attribute and
                    // is expected to equal $x, which makes every "$x - $level"
                    // index 0 - verify that attaching to the first entry is intended.
                    // Levels above 4 are selected but not stored anywhere.
                    $level = str_replace("level", "", $u->class);
                    if ($level == 0) {
                        $this->data[] = $cats;
                    } else if ($level == 1) {
                        $this->data[$x - $level]["sub_categories"][] = $cats;
                    } else if ($level == 2) {
                        $this->data[$x - $level]["sub_categories"][$x - $level]["sub_categories"][] = $cats;
                    } else if ($level == 3) {
                        $this->data[$x - $level]["sub_categories"][$x - $level]["sub_categories"][$x - $level]["sub_categories"][] = $cats;
                    } else if ($level == 4) {
                        $this->data[$x - $level]["sub_categories"][$x - $level]["sub_categories"][$x - $level]["sub_categories"][$x - $level]["sub_categories"][] = $cats;
                    }
                }
            }

        } else {
            $this->data[] = array("error" => true, "store_name" => $storeName, "message" => "Sayfa Bulunamadı.");
        }
    } else {
        $this->data[] = array("error" => true, "store_name" => $storeName, "message" => $open["error"]);
    }

    return $this->ReturnWithTypes($type);
}
401 |
402 |
/**
 * Lists the classifieds published by a store.
 *
 * Pages are requested in steps of 20 through the "pagingOffset" query parameter.
 * Each table row becomes one record with id, title, link, image and thumb, plus
 * one entry per non-empty table heading (key produced by turkishChars()).
 *
 * @param string     $storeName Store name, used as the host prefix of the store URL.
 * @param int        $itemCount Maximum number of classifieds to return.
 * @param array|null $filters   Query filters (key => value); an array value repeats the key once per element.
 * @param string     $type      Output format handed to ReturnWithTypes().
 * @param bool       $proxy     When true, Curl() is called with its second argument set to true.
 * @return mixed Whatever ReturnWithTypes($type) produces.
 */
public function MagazaListe($storeName, $itemCount = 20, $filters = NULL, $type = "json", $proxy = false)
{
    $this->data = array();

    // Keys and values are URL-encoded so they cannot corrupt the query string.
    $filterText = "";
    if (is_array($filters)) {
        foreach ($filters as $key => $val) {
            $values = is_array($val) ? $val : array($val);
            foreach ($values as $v) {
                $filterText .= "&" . urlencode((string)$key) . "=" . urlencode((string)$v);
            }
        }
    }

    $pageCount = ($itemCount > 20) ? (int)ceil($itemCount / 20) : 1;
    $ic = 0; // number of classifieds collected so far

    for ($p = 0; $p < $pageCount; $p++) {
        $page = $p * 20;
        $url = "https://" . $storeName . $this->storeEndUrl . '?pagingOffset=' . $page . $filterText;
        $open = ($proxy == true) ? $this->Curl($url, true) : $this->Curl($url);

        // NOTE(review): the response is parsed only when its "error" entry is
        // non-empty, which reads as inverted. Curl() is not visible here, so the
        // original condition is kept - confirm against Curl()'s return value.
        if (empty($open["error"])) {
            $this->data[] = array("error" => true, "store_name" => $storeName, "message" => $open["error"]);
            continue;
        }

        // Parse once per page. The document used to be parsed before the error
        // check and then again for every row and every cell.
        $dom = str_get_html($open);
        if (!$dom || $dom->find("div.errorPage404")) {
            $this->data[] = array("error" => true, "store_name" => $storeName, "message" => "Sayfa Bulunamadı.");
            continue;
        }

        $columns = $dom->find("div.classified-list table thead th");
        $tr = $dom->find("div.classified-list table tbody tr");
        $colCount = count($columns);
        $rowCount = count($tr);

        if ($rowCount == 0) {
            $this->data[] = array("error" => true, "url" => $url, "message" => "Sonuç Bulunamadı.");
            continue;
        }

        // The first body row (index 0) is skipped, as in the original implementation.
        for ($j = 1; $j < $rowCount; $j++) {
            if ($ic == $itemCount) {
                break;
            }

            $row = $tr[$j];
            $firstCell = $row->find("td", 0);
            $href = $firstCell ? $firstCell->find("a", 0) : NULL;
            $img = $href ? $href->find("img", 0) : NULL;
            if (!$href || !$img) {
                // A row without the expected link and image cannot be mapped to a classified.
                continue;
            }

            $d = array();
            // The alt text is split on "#": first part is used as title, second as id.
            $baslik = explode("#", $img->alt);
            $d["id"] = isset($baslik[1]) ? $baslik[1] : NULL;
            $d["title"] = trim($baslik[0]);
            $d["link"] = $href->href;
            $d["image"] = $img->src;

            // Thumbnail URL: same path with "thmb_" prefixed to the file name.
            $imgExp = explode("/", $img->src);
            $last = count($imgExp) - 1;
            $imgExp[$last] = "thmb_" . $imgExp[$last];
            $d["thumb"] = implode("/", $imgExp);

            for ($x = 0; $x < $colCount; $x++) {
                $heading = trim($columns[$x]->plaintext);
                if (!empty($heading)) {
                    $cell = $row->find("td", $x);
                    $title = $this->turkishChars(strtolower($heading));
                    $d[$title] = $cell ? trim($cell->plaintext) : "";
                }
            }

            $this->data[] = $d;
            $ic++;
        }
    }

    return $this->ReturnWithTypes($type);
}
508 |
509 |
/**
 * Returns the agent (consultant) list of the given store.
 *
 * @param string $storeName Store name, used as the host prefix of the store URL.
 * @param string $type      Output format handed to ReturnWithTypes().
 * @param bool   $proxy     When true, Curl() is called with its second argument set to true.
 * @return mixed Whatever ReturnWithTypes($type) produces.
 */
public function MagazaDanismanlari($storeName, $type = "json", $proxy = false)
{
    $this->data = array();

    if (empty($storeName)) {
        $this->data = array("error" => true, "store_name" => $storeName, "message" => "Mağaza adı bulunamadı.");
        return $this->ReturnWithTypes($type);
    }

    $address = "https://" . $storeName . $this->storeEndUrl;
    $response = ($proxy == true) ? $this->Curl($address, true) : $this->Curl($address);

    // An empty "error" entry leads to an error record (condition kept as it was).
    if (empty(@$response["error"])) {
        $this->data[] = array("error" => true, "store_name" => $storeName, "message" => $response["error"]);
        return $this->ReturnWithTypes($type);
    }

    $dom = str_get_html($response);
    if ($dom->find("div.errorPage404")) {
        $this->data[] = array("error" => true, "store_name" => $storeName, "message" => "Sayfa Bulunamadı.");
        return $this->ReturnWithTypes($type);
    }

    // Four parallel node lists; entries are paired by position.
    $anchors = $dom->find("div.oc-select-list ul li a");
    $names = $dom->find("div.oc-select-list ul li a p");
    $photos = $dom->find("div.oc-select-list ul li a img");
    $phones = $dom->find("div.oc-select-list ul li a span");

    $total = count($anchors);
    $index = 0;
    while ($index < $total) {
        // The user id is whatever follows "userId=" in the anchor's href.
        $idParts = explode("userId=", $anchors[$index]->href);

        $this->data[] = array(
            "name" => trim($names[$index]->plaintext),
            "userId" => $idParts[1],
            "image_200" => $photos[$index]->src,
            "image_400" => str_replace("p200", "p400", $photos[$index]->src),
            "phone" => trim($phones[$index]->plaintext)
        );
        $index++;
    }

    return $this->ReturnWithTypes($type);
}
565 |
/**
 * Returns province/district names and codes usable as filter values.
 *
 * The data is read from "ilce.json" (relative path) on every call.
 *
 * @param int|string|null $il   Province key in ilce.json (e.g. a plate code); NULL returns all provinces.
 * @param string          $type Output format: "json", "array" or "xml".
 * @return string|array|null JSON string, decoded array or XML string; NULL for an unknown $type.
 */
public function TownCodes($il = NULL, $type = "json")
{
    $ilceJson = json_decode(file_get_contents("ilce.json"), true);

    // Pick the requested slice once; an unknown province yields NULL instead of
    // raising an undefined-index notice.
    if ($il != NULL) {
        $selected = (is_array($ilceJson) && isset($ilceJson[$il])) ? $ilceJson[$il] : NULL;
    } else {
        $selected = $ilceJson;
    }

    if ($type == "json") {
        return json_encode($selected);
    } else if ($type == "array") {
        return $selected;
    } else if ($type == "xml") {
        // SimpleXMLElement needs a well-formed document with a root element;
        // constructing it from an empty string throws an exception.
        $xml = new SimpleXMLElement('<?xml version="1.0" encoding="UTF-8"?><root></root>');
        $this->array_to_xml(is_array($selected) ? $selected : array(), $xml);
        return $xml->asXML();
    }

    return NULL;
}
613 |
614 |
/**
 * Collects the photo and video URLs of a classified for Detay().
 *
 * @param string $html HTML of the detail page.
 * @return array Lists under "thumb_images", "standart_images", "mega_images" and "movies".
 */
private function getDetailMedia($html)
{
    $dom = str_get_html($html);

    $media = array(
        "thumb_images" => array(),
        "standart_images" => array(),
        "mega_images" => array(),
        "movies" => array()
    );

    foreach ($dom->find("ul.classifiedDetailThumbListPages img") as $picture) {
        $source = $picture->src;
        $media["thumb_images"][] = $source;
        // Standard size drops the "thmb_" marker, mega size swaps it for "x16_".
        $media["standart_images"][] = str_replace("thmb_", "", $source);
        $media["mega_images"][] = str_replace("thmb_", "x16_", $source);
    }

    foreach ($dom->find("source#mp4") as $clip) {
        $media["movies"][] = $clip->src;
    }

    return $media;
}
654 |
/**
 * Collects the selected properties of a classified for Detay().
 *
 * Looks inside the second "div.uiBox" of "div#classified-detail" and pairs each
 * "ul" in its first "div.classifiedDescription" with the "h3" at the same position.
 *
 * @param string $html HTML of the detail page.
 * @return array List of single-entry arrays: heading text => list of selected item texts.
 */
private function getDetailProperties($html)
{
    $propertyArray = array();

    // Walk the chain step by step: any missing container simply means the
    // classified has no property box, which must not end in a call on null.
    $dom = str_get_html($html);
    $detail = $dom ? $dom->find("div#classified-detail", 0) : NULL;
    $box = $detail ? $detail->find("div.uiBox", 1) : NULL;
    $description = $box ? $box->find("div.classifiedDescription", 0) : NULL;
    if (!$description) {
        return $propertyArray;
    }

    $propertyTitles = $description->find("h3");
    // The document is parsed once; the lists are reused instead of re-querying per index.
    foreach ($description->find("ul") as $p => $list) {
        $ppDetails = array();
        foreach ($list->find("li.selected") as $item) {
            $ppDetails[] = trim($item->plaintext);
        }

        // A list without a matching heading keeps the empty-string key it had before.
        $heading = isset($propertyTitles[$p]) ? trim($propertyTitles[$p]->plaintext) : "";
        $propertyArray[] = array($heading => $ppDetails);
    }

    return $propertyArray;
}
687 |
/**
 * Collects the breadcrumb trail of a classified for Detay().
 *
 * @param string $html HTML of the detail page.
 * @return array Trimmed text of every breadcrumb item, in document order.
 */
private function getDetailBreadcrumb($html)
{
    $crumbs = array();

    $nodes = str_get_html($html)->find("div.classifiedBreadCrumb ul li");
    for ($i = 0, $total = count($nodes); $i < $total; $i++) {
        $crumbs[] = trim($nodes[$i]->plaintext);
    }

    return $crumbs;
}
708 |
709 |
/**
 * Collects the address of a classified for Detay().
 *
 * The first three anchors inside the first "div.classifiedInfo h2" are read as
 * city, town and district, in that order.
 *
 * @param string $html HTML of the detail page.
 * @return array Keys "city", "town" and "district"; a missing part is an empty string.
 */
private function getDetailAddress($html)
{
    $dom = str_get_html($html);
    $heading = $dom ? $dom->find("div.classifiedInfo h2", 0) : NULL;
    // Without the heading there are no anchors; avoid calling find() on null.
    $address = $heading ? $heading->find("a") : array();

    $return = array();
    foreach (array("city", "town", "district") as $position => $key) {
        $return[$key] = isset($address[$position]) ? trim($address[$position]->plaintext) : "";
    }

    return $return;
}
730 |
731 |
/**
 * Collects the map coordinates of a classified for Detay().
 *
 * @param string $html HTML of the detail page.
 * @return array Keys "latitude" and "longitude", read from the data-lat / data-lon attributes of "div#gmap".
 */
private function getDetailCoordinates($html)
{
    $mapNode = str_get_html($html)->find("div#gmap", 0);

    // Errors are suppressed because the map element or its attributes may be absent.
    $latitude = trim(@$mapNode->attr["data-lat"]);
    $longitude = trim(@$mapNode->attr["data-lon"]);

    return array(
        "latitude" => $latitude,
        "longitude" => $longitude
    );
}
751 |
752 |
/**
 * Collects the price of a classified for Detay().
 *
 * Takes the text of the first "div.classifiedInfo h3" and removes the text of
 * the anchor nested inside it, when there is one.
 *
 * @param string $html HTML of the detail page.
 * @return string Trimmed price text; empty string when the price heading is missing.
 */
private function getDetailPrice($html)
{
    $dom = str_get_html($html);
    $price = $dom ? $dom->find("div.classifiedInfo h3", 0) : NULL;
    if (!$price) {
        return "";
    }

    $text = $price->plaintext;

    // The nested anchor is optional; only strip its text when it exists and is
    // non-empty, so str_replace() never receives a null search value.
    $priceTrim = $dom->find("div.classifiedInfo h3 a", 0);
    if ($priceTrim && $priceTrim->plaintext !== "") {
        $text = str_replace($priceTrim->plaintext, "", $text);
    }

    return trim($text);
}
769 |
/**
 * Collects the seller information of a classified for Detay().
 *
 * @param string $html HTML of the detail page.
 * @return array Keys "name", "store_link", "image" and "phones"; "phones" is a list of
 *               title/text pairs, or NULL when no phone entry was found.
 */
private function getDetailSeller($html)
{
    // Parse once and query the same document for every field.
    $dom = str_get_html($html);

    $sellerName = $dom->find("div.classifiedUserContent h5", 0);
    $sellerStore = $dom->find("a.userClassifieds", 0);
    $sellerImg = $dom->find("div.classifiedUserContent a img", 0);
    $sellerPhoneFields = $dom->find("ul#phoneInfoPart li strong");
    $sellerPhoneText = $dom->find("ul#phoneInfoPart li span.pretty-phone-part");

    // Stays NULL when there is no phone entry, matching the previous output.
    $phoneArray = NULL;
    foreach ($sellerPhoneFields as $f => $field) {
        $phoneArray[] = array(
            "title" => trim($field->plaintext),
            // Labels and numbers are paired by position; a label without a number gets "".
            "text" => isset($sellerPhoneText[$f]) ? trim($sellerPhoneText[$f]->plaintext) : ""
        );
    }

    // Every node is optional, including the store link, which used to be
    // dereferenced without any guard.
    return array(
        "name" => $sellerName ? trim($sellerName->plaintext) : "",
        "store_link" => $sellerStore ? $sellerStore->href : NULL,
        "image" => $sellerImg ? $sellerImg->src : NULL,
        "phones" => $phoneArray
    );
}
801 |
/**
 * Collects the description of a classified for Detay().
 *
 * @param string $html HTML of the detail page.
 * @return array "text" holds the trimmed plain text, "base64" the base64-encoded inner HTML.
 */
private function getDetailDescription($html)
{
    $node = str_get_html($html)->find("div.classifiedDescription", 0);

    $plain = trim($node->plaintext);
    $encoded = base64_encode($node->innertext);

    return array(
        "text" => $plain,
        "base64" => $encoded
    );
}
818 |
/**
 * Returns the attribute list (title/value pairs) of the listing for the
 * detail method.
 *
 * @param string $html Detail page HTML
 * @return array List of array("title" => ..., "text" => ...) built from the
 *               strong/span pairs of ul.classifiedInfoList; empty when the page
 *               cannot be parsed or has no such list
 */
private function getDetailInfo($html)
{
    $infoArray = array();

    // Parse once; the original parsed the document separately per selector.
    $dom = str_get_html($html);
    if (!$dom) {
        return $infoArray;
    }

    $infoListFields = $dom->find("ul.classifiedInfoList li strong");
    $infoListTexts = $dom->find("ul.classifiedInfoList li span");

    // The two result lists are paired by position; a missing value becomes
    // an empty string instead of an undefined-offset error.
    foreach ($infoListFields as $index => $field) {
        $infoArray[] = array(
            "title" => trim($field->plaintext),
            "text" => isset($infoListTexts[$index]) ? trim($infoListTexts[$index]->plaintext) : ""
        );
    }

    return $infoArray;
}
841 |
/**
 * Serialises $this->data in the requested format.
 *
 * @param string $type Output format: "json" (also used when empty), "array" or "xml"
 * @return string|array|null JSON string, the raw data array, an XML string,
 *                           or null for an unrecognised type
 */
private function ReturnWithTypes($type = "json")
{
    if ($type == "json" || empty($type)) {
        return json_encode($this->data);
    }

    if ($type == "array") {
        return $this->data;
    }

    if ($type == "xml") {
        // SimpleXMLElement cannot be built from an empty string (it throws),
        // so a document with a root element is required.
        // NOTE(review): the root-element literal is empty in this copy of the
        // file; "root" is a placeholder - confirm the intended element name.
        $xml = new SimpleXMLElement('<?xml version="1.0" encoding="UTF-8"?><root/>');
        $this->array_to_xml($this->data, $xml);
        return $xml->asXML();
    }

    // Unknown format: same result as the previous implicit fall-through.
    return null;
}
861 |
862 |
/**
 * Builds a slug: Turkish letters are mapped to ASCII, a few punctuation
 * characters are dropped or turned into "-", the text is lower-cased and
 * whitespace runs become a single "-".
 *
 * @param string $s Text to convert
 * @return string Slug without leading or trailing "-"
 */
private function turkishChars($s)
{
    // One explicit map keeps every search string next to its replacement.
    // The previous pair of parallel arrays had different lengths and only
    // worked because str_replace() pads missing replacements with "".
    $map = array(
        'ş' => 's', 'Ş' => 's', 'ı' => 'i', 'I' => 'i', 'İ' => 'i',
        'ğ' => 'g', 'Ğ' => 'g', 'ü' => 'u', 'Ü' => 'u', 'ö' => 'o',
        'Ö' => 'o', 'Ç' => 'c', 'ç' => 'c',
        '(' => '', ')' => '', '/' => '-', ':' => '-', ',' => '',
        '&' => '', '"' => '', "“" => '', "”" => ''
    );
    $s = str_replace(array_keys($map), array_values($map), $s);
    $s = strtolower($s);
    // The former entity-stripping regex was removed: every "&" is already
    // deleted above, so a pattern starting with "&" could never match.
    $s = preg_replace('/\s+/', '-', $s);
    $s = preg_replace('|-+|', '-', $s);
    $s = preg_replace('/#/', '', $s);
    return trim($s, '-');
}
882 |
/**
 * Normalises whitespace: every run of whitespace characters becomes a single
 * space and leading/trailing whitespace is removed.
 *
 * @param string $string Text to clean
 * @return string Cleaned text
 */
private function replaceSpace($string)
{
    return trim(preg_replace("/\s+/", " ", $string));
}
895 |
/**
 * Fetches a URL with cURL and returns the body with line breaks and tabs removed.
 *
 * @param string $url   Address to request
 * @param bool   $proxy When true, a random entry of proxy.json (resolved
 *                      against the current working directory) is used as proxy
 * @return string|array Response body without "\n", "\r" and "\t", or
 *                      array("error" => "code: N message:...") on failure
 */
private function Curl($url, $proxy = false)
{
    $options = array(
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_HEADER => false,
        CURLOPT_ENCODING => "",
        CURLOPT_AUTOREFERER => true,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_CONNECTTIMEOUT => 30,
        CURLOPT_TIMEOUT => 30,
        CURLOPT_MAXREDIRS => 10,
        // NOTE(review): certificate verification is disabled here; left
        // unchanged, but confirm this is intentional.
        CURLOPT_SSL_VERIFYPEER => false,
    );

    if ($proxy == true) {
        $proxyList = is_readable("proxy.json")
            ? json_decode(file_get_contents("proxy.json"), true)
            : null;
        if (!is_array($proxyList) || count($proxyList) === 0) {
            // Report the problem in the same shape as a transfer error
            // instead of silently requesting without the proxy.
            return array("error" => "code: 0 message:proxy list (proxy.json) is missing, unreadable or empty");
        }
        $proxyList = array_values($proxyList);
        // The option must be set under its own key. array_push() appended a
        // nested array under an unrelated numeric key, so no proxy was ever
        // configured and curl_setopt_array() received an invalid option.
        $options[CURLOPT_PROXY] = $proxyList[rand(0, count($proxyList) - 1)];
    }

    $ch = curl_init($url);
    curl_setopt_array($ch, $options);
    $content = curl_exec($ch);
    $err = curl_errno($ch);
    $errmsg = curl_error($ch);
    curl_close($ch);

    if (!empty($errmsg)) {
        return array("error" => "code: " . $err . " message:" . $errmsg);
    }

    // Flatten the markup onto one line. "" replaces the former NULL
    // argument, which str_replace() no longer accepts without a deprecation.
    return str_replace(array("\n", "\r", "\t"), "", (string) $content);
}
941 |
/**
 * Appends a PHP array to a SimpleXMLElement, recursing into nested arrays.
 *
 * String keys become element names; numeric keys become "item" elements,
 * because a number is not a valid XML element name.
 *
 * @param array            $array         Data to convert
 * @param SimpleXMLElement $xml_user_info Element that receives the children (modified in place)
 * @return void
 */
private function array_to_xml($array, &$xml_user_info)
{
    foreach ($array as $key => $value) {
        // Previously only array values under numeric keys were renamed;
        // scalar values produced elements such as <0>, i.e. invalid XML.
        $name = is_numeric($key) ? "item" : "$key";

        if (is_array($value)) {
            $subnode = $xml_user_info->addChild($name);
            $this->array_to_xml($value, $subnode);
        } else {
            // The value is escaped before insertion, as in the original code.
            $xml_user_info->addChild($name, htmlspecialchars("$value"));
        }
    }
}
964 |
965 |
966 | }
967 |
--------------------------------------------------------------------------------
/class/simple_html_dom.php:
--------------------------------------------------------------------------------
1 | size is the "real" number of bytes the dom was created from.
18 | * but for most purposes, it's a really good estimation.
19 | * Paperg - Added the forceTagsClosed to the dom constructor. Forcing tags closed is great for malformed html, but it CAN lead to parsing errors.
20 | * Allow the user to tell us how much they trust the html.
21 | * Paperg add the text and plaintext to the selectors for the find syntax. plaintext implies text in the innertext of a node. text implies that the tag is a text node.
22 | * This allows for us to find tags based on the text they contain.
23 | * Create find_ancestor_tag to see if a tag is - at any level - inside of another specific tag.
24 | * Paperg: added parse_charset so that we know about the character set of the source document.
25 | * NOTE: If the user's system has a routine called get_last_retrieve_url_contents_content_type available, we will assume it's returning the content-type header from the
26 | * last transfer or curl_exec, and we will parse that and use it in preference to any other method of charset detection.
27 | *
28 | * Found infinite loop in the case of broken html in restore_noise. Rewrote to protect from that.
29 | * PaperG (John Schlick) Added get_display_size for "IMG" tags.
30 | *
31 | * Licensed under The MIT License
32 | * Redistributions of files must retain the above copyright notice.
33 | *
34 | * @author S.C. Chen
35 | * @author John Schlick
36 | * @author Rus Carroll
37 | * @version 1.5 ($Rev: 210 $)
38 | * @package PlaceLocalInclude
39 | * @subpackage simple_html_dom
40 | */
41 |
/**
 * All of the Defines for the classes below.
 * @author S.C. Chen
 */
// Node type identifiers; simple_html_dom_node::$nodetype holds one of these.
define('HDOM_TYPE_ELEMENT', 1);
define('HDOM_TYPE_COMMENT', 2);
define('HDOM_TYPE_TEXT', 3);
define('HDOM_TYPE_ENDTAG', 4);
define('HDOM_TYPE_ROOT', 5);
define('HDOM_TYPE_UNKNOWN', 6);
// Attribute quoting styles; makeup() reads them from the HDOM_INFO_QUOTE entry.
define('HDOM_QUOTE_DOUBLE', 0);
define('HDOM_QUOTE_SINGLE', 1);
define('HDOM_QUOTE_NO', 3);
// Keys of the per-node "info" array (simple_html_dom_node::$_).
define('HDOM_INFO_BEGIN', 0);
define('HDOM_INFO_END', 1);
define('HDOM_INFO_QUOTE', 2);
define('HDOM_INFO_SPACE', 3);
define('HDOM_INFO_TEXT', 4);
define('HDOM_INFO_INNER', 5);
define('HDOM_INFO_OUTER', 6);
define('HDOM_INFO_ENDSPACE',7);
// Default argument values of the file_get_html() / str_get_html() helpers.
define('DEFAULT_TARGET_CHARSET', 'UTF-8');
define('DEFAULT_BR_TEXT', "\r\n");
define('DEFAULT_SPAN_TEXT', " ");
// Largest input, measured with strlen(), that the helpers will load.
define('MAX_FILE_SIZE', 600000);
67 | // helper functions
68 | // -----------------------------------------------------------------------------
/**
 * Get an html dom from a file or URL.
 *
 * @param string   $url              File name or URL handed to file_get_contents()
 * @param bool     $use_include_path Passed through to file_get_contents()
 * @param resource $context          Stream context, or null
 * @param int      $offset           Read offset; negative values (including the
 *                                   default -1) are treated as "from the start"
 * @param int      $maxLen           Accepted for signature compatibility; not used
 * @param bool     $lowercase        Passed to the dom constructor and to load()
 * @param bool     $forceTagsClosed  Passed to the dom constructor
 * @param string   $target_charset   Passed to the dom constructor
 * @param bool     $stripRN          Passed to the dom constructor and to load()
 * @param string   $defaultBRText    Passed to the dom constructor
 * @param string   $defaultSpanText  Passed to the dom constructor
 * @return simple_html_dom|false The loaded dom, or false when nothing could be
 *                               read or the content exceeds MAX_FILE_SIZE
 */
function file_get_html($url, $use_include_path = false, $context=null, $offset = -1, $maxLen=-1, $lowercase = true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    // We DO force the tags to be terminated.
    $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);

    // Since PHP 7.1 a negative offset means "seek from the end of the
    // stream", so passing the historical default of -1 makes the read fail.
    // Clamp it to 0 to keep the intended "read from the beginning".
    $start = ($offset < 0) ? 0 : $offset;
    $contents = file_get_contents($url, $use_include_path, $context, $start);

    // empty() also covers the false returned by a failed read.
    if (empty($contents) || strlen($contents) > MAX_FILE_SIZE)
    {
        return false;
    }
    // The second parameter can force the selectors to all be lowercase.
    $dom->load($contents, $lowercase, $stripRN);
    return $dom;
}
87 |
/**
 * Get an html dom from a string.
 *
 * Returns false, after clearing the freshly created dom, when the string is
 * empty or longer than MAX_FILE_SIZE bytes; otherwise returns the loaded dom.
 * The remaining arguments are handed to the simple_html_dom constructor.
 */
function str_get_html($str, $lowercase=true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);

    $loadable = !empty($str) && strlen($str) <= MAX_FILE_SIZE;
    if ($loadable)
    {
        $dom->load($str, $lowercase, $stripRN);
        return $dom;
    }

    $dom->clear();
    return false;
}
100 |
/**
 * Dump the tree below a dom node by delegating to the node's dump() method.
 *
 * @param object $node      Node to dump
 * @param bool   $show_attr Whether attributes are printed
 * @param int    $deep      Starting indentation depth
 */
function dump_html_tree($node, $show_attr=true, $deep=0)
{
    // Forward the caller's options. The node itself used to be passed as the
    // first argument, so $show_attr and $deep were silently ignored.
    $node->dump($show_attr, $deep);
}
106 |
107 |
108 | /**
109 | * simple html dom node
110 | * PaperG - added ability for "find" routine to lowercase the value of the selector.
111 | * PaperG - added $tag_start to track the start position of the tag in the total byte index
112 | *
113 | * @package PlaceLocalInclude
114 | */
115 | class simple_html_dom_node
116 | {
// One of the HDOM_TYPE_* constants; a new node starts out as a text node.
public $nodetype = HDOM_TYPE_TEXT;
// Tag name; 'text' for text nodes.
public $tag = 'text';
// Attribute name => value map.
public $attr = array();
// Child nodes as returned by children(), first_child() and last_child().
public $children = array();
// Nodes rendered by innertext(), outertext() and text().
public $nodes = array();
// Parent node, or null.
public $parent = null;
// The "info" array - see HDOM_INFO_... for what each element contains.
public $_ = array();
// Start position of the tag in the total byte index.
public $tag_start = 0;
// Owning dom object, set by the constructor and released by clear().
private $dom = null;
127 |
// Registers the new node in the owning dom's node list and keeps a
// back-reference to that dom.
function __construct($dom)
{
    $dom->nodes[] = $this;
    $this->dom = $dom;
}
133 |
// Releases the node's references (via clear()) when the object is destroyed.
function __destruct()
{
    $this->clear();
}
138 |
// Casting a node to string yields its outer text, i.e. the result of outertext().
function __toString()
{
    return $this->outertext();
}
143 |
// Clean up memory: drop every reference this node holds (children, parent,
// node list, owning dom) to work around the php5 circular references memory leak.
function clear()
{
    $this->children = null;
    $this->parent = null;
    $this->nodes = null;
    $this->dom = null;
}
152 |
// Dump the node's tree: prints the tag (indented by $deep), optionally the
// attributes, then recurses into every entry of $this->nodes one level deeper.
function dump($show_attr=true, $deep=0)
{
    echo str_repeat(' ', $deep).$this->tag;

    $hasAttributes = count($this->attr) > 0;
    if ($show_attr && $hasAttributes)
    {
        echo '(';
        foreach (array_keys($this->attr) as $name)
        {
            // $this->$name goes through __get(), like the attribute access it prints.
            echo "[$name]=>\"".$this->$name.'", ';
        }
        echo ')';
    }
    echo "\n";

    if (!$this->nodes)
    {
        return;
    }
    foreach ($this->nodes as $child)
    {
        $child->dump($show_attr, $deep+1);
    }
}
176 |
177 |
/**
 * Debugging function to dump a single dom node with a bunch of information about it.
 *
 * The line contains the tag, the attributes, the contents of the info array,
 * the inner-info entry, the child/node counts and the tag start position.
 *
 * @param bool $echo When true the line is echoed and nothing is returned;
 *                   when false the line is returned instead
 * @return string|null
 */
function dump_node($echo=true)
{
    $string = $this->tag;
    if (count($this->attr)>0)
    {
        $string .= '(';
        foreach ($this->attr as $k=>$v)
        {
            $string .= "[$k]=>\"".$this->$k.'", ';
        }
        $string .= ')';
    }
    if (count($this->_)>0)
    {
        $string .= ' $_ (';
        foreach ($this->_ as $k=>$v)
        {
            if (is_array($v))
            {
                $string .= "[$k]=>(";
                foreach ($v as $k2=>$v2)
                {
                    $string .= "[$k2]=>\"".$v2.'", ';
                }
                $string .= ")";
            } else {
                $string .= "[$k]=>\"".$v.'", ';
            }
        }
        $string .= ")";
    }

    if (isset($this->text))
    {
        $string .= " text: (" . $this->text . ")";
    }

    $string .= " HDOM_INNER_INFO: '";
    // Read the info array of this node. The code used to reference an
    // undefined $node variable, so this branch could never be taken.
    if (isset($this->_[HDOM_INFO_INNER]))
    {
        $string .= $this->_[HDOM_INFO_INNER] . "'";
    }
    else
    {
        $string .= ' NULL ';
    }

    $string .= " children: " . count($this->children);
    $string .= " nodes: " . count($this->nodes);
    $string .= " tag_start: " . $this->tag_start;
    $string .= "\n";

    if ($echo)
    {
        echo $string;
        return;
    }
    else
    {
        return $string;
    }
}
242 |
// Returns the parent of the node.
// When a node is passed in, it becomes the new parent and this node is
// appended to the new parent's nodes and children lists.
// As the original author warned, this does not detach the node from the
// lists of its previous parent first.
function parent($parent=null)
{
    if ($parent === null)
    {
        return $this->parent;
    }

    $this->parent = $parent;
    $parent->nodes[] = $this;
    $parent->children[] = $this;

    return $parent;
}
258 |
// Tells whether the node has at least one child.
function has_child()
{
    return (bool) $this->children;
}
264 |
// Returns the children of the node: the whole list when $idx is -1,
// otherwise the child at that index, or null when there is none.
function children($idx=-1)
{
    if ($idx !== -1)
    {
        return isset($this->children[$idx]) ? $this->children[$idx] : null;
    }
    return $this->children;
}
278 |
// Returns the first child of the node, or null when it has no children.
function first_child()
{
    return (count($this->children) > 0) ? $this->children[0] : null;
}
288 |
// Returns the last child of the node, or null when it has no children.
function last_child()
{
    $total = count($this->children);
    return ($total > 0) ? $this->children[$total - 1] : null;
}
298 |
// Returns the next sibling of the node: the entry following this node in the
// parent's children list, or null when there is no parent or no later entry.
function next_sibling()
{
    if ($this->parent === null)
    {
        return null;
    }

    $siblings = $this->parent->children;
    $count = count($siblings);

    // Locate this node; $pos ends up equal to $count when it is not listed.
    for ($pos = 0; $pos < $count; ++$pos)
    {
        if ($this === $siblings[$pos])
        {
            break;
        }
    }

    $next = $pos + 1;
    return ($next >= $count) ? null : $siblings[$next];
}
319 |
// Returns the previous sibling of the node: the entry preceding this node in
// the parent's children list, or null when there is no parent or this node
// comes first.
function prev_sibling()
{
    if ($this->parent === null)
    {
        return null;
    }

    $siblings = $this->parent->children;
    $count = count($siblings);

    // Locate this node; $pos ends up equal to $count when it is not listed.
    for ($pos = 0; $pos < $count; ++$pos)
    {
        if ($this === $siblings[$pos])
        {
            break;
        }
    }

    $prev = $pos - 1;
    return ($prev < 0) ? null : $siblings[$prev];
}
331 |
// Locates a specific ancestor tag on the path to the root.
// The search starts with this node itself and follows the parent links;
// it returns the first node whose tag equals $tag, or null when none does.
function find_ancestor_tag($tag)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

    for ($candidate = $this; !is_null($candidate); $candidate = $candidate->parent)
    {
        if (is_object($debug_object)) { $debug_object->debug_log(2, "Current tag is: " . $candidate->tag); }

        if ($candidate->tag == $tag)
        {
            return $candidate;
        }
    }
    return null;
}
353 |
// Get the dom node's inner html.
// An inner-text info entry wins; otherwise a text info entry is returned with
// its noise restored; otherwise the outer text of every node in $this->nodes
// is joined together.
function innertext()
{
    if (isset($this->_[HDOM_INFO_INNER]))
    {
        return $this->_[HDOM_INFO_INNER];
    }
    if (isset($this->_[HDOM_INFO_TEXT]))
    {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }

    $pieces = array();
    foreach ($this->nodes as $n)
    {
        $pieces[] = $n->outertext();
    }
    return implode('', $pieces);
}
365 |
/**
 * Get the dom node's outer text (with tag).
 *
 * The root node returns its inner text. Otherwise the dom callback (if any)
 * is triggered, then a stored outer-text or text info entry is returned when
 * present; failing that the result is assembled from the begin tag, the
 * inner content and, when the node has an end position, the closing tag.
 *
 * @return string
 */
function outertext()
{
    global $debug_object;
    if (is_object($debug_object))
    {
        $text = '';
        if ($this->tag == 'text')
        {
            if (!empty($this->text))
            {
                $text = " with text: " . $this->text;
            }
        }
        $debug_object->debug_log(1, 'Innertext of tag: ' . $this->tag . $text);
    }

    if ($this->tag==='root') return $this->innertext();

    // trigger callback
    if ($this->dom && $this->dom->callback!==null)
    {
        call_user_func_array($this->dom->callback, array($this));
    }

    if (isset($this->_[HDOM_INFO_OUTER])) return $this->_[HDOM_INFO_OUTER];
    if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);

    // render begin tag
    if ($this->dom && $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]])
    {
        $ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();
    } else {
        $ret = "";
    }

    // render inner text
    if (isset($this->_[HDOM_INFO_INNER]))
    {
        // If it's a br tag... don't return the HDOM_INNER_INFO that we may or may not have added.
        if ($this->tag != "br")
        {
            $ret .= $this->_[HDOM_INFO_INNER];
        }
    } else {
        if ($this->nodes)
        {
            foreach ($this->nodes as $n)
            {
                $ret .= $this->convert_text($n->outertext());
            }
        }
    }

    // render end tag
    // The closing tag needs its "</" prefix; without it the markup ended in
    // "tag>" and the rendered html was broken.
    if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END]!=0)
        $ret .= '</'.$this->tag.'>';
    return $ret;
}
425 |
// Get the dom node's plain text.
// A stored inner-text entry is returned as is; text nodes return their
// restored text; comments, unknown nodes, script and style tags yield ''.
// Everything else is the converted text of all nodes in $this->nodes.
function text()
{
    if (isset($this->_[HDOM_INFO_INNER]))
    {
        return $this->_[HDOM_INFO_INNER];
    }

    switch ($this->nodetype)
    {
        case HDOM_TYPE_TEXT:
            return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
        case HDOM_TYPE_COMMENT:
        case HDOM_TYPE_UNKNOWN:
            return '';
    }

    if (strcasecmp($this->tag, 'script')===0 || strcasecmp($this->tag, 'style')===0)
    {
        return '';
    }

    // In rare cases (always element nodes - observed for some span and p tags)
    // $this->nodes is NULL without clear() having run; the cause is unknown,
    // so such a node simply contributes no text.
    if (is_null($this->nodes))
    {
        return '';
    }

    $plain = '';
    foreach ($this->nodes as $n)
    {
        $plain .= $this->convert_text($n->text());
    }

    // A span gets the default span text appended so that adjacent spans do not
    // run into each other. This is plaintext after all.
    if ($this->tag == "span")
    {
        $plain .= $this->dom->default_span_text;
    }
    return $plain;
}
460 |
/**
 * Returns the node's inner text with CDATA wrappers removed.
 *
 * @return string
 */
function xmltext()
{
    $ret = $this->innertext();
    // Strip the CDATA opening and closing markers. The two calls had
    // collapsed into a single replace of '' by '', which changes nothing.
    $ret = str_ireplace('<![CDATA[', '', $ret);
    $ret = str_replace(']]>', '', $ret);
    return $ret;
}
468 |
/**
 * Build the node's text with tag, i.e. the opening tag including attributes.
 *
 * Nodes carrying a text info entry return that text (noise restored) instead.
 * Attribute spacing and quoting are taken from the HDOM_INFO_SPACE and
 * HDOM_INFO_QUOTE info entries, addressed by the attribute's position.
 *
 * @return string
 */
function makeup()
{
    // text, comment, unknown
    if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);

    $ret = '<'.$this->tag;
    // Position of the current attribute; incremented before the "removed"
    // check so skipped attributes still consume their info slot.
    $i = -1;

    foreach ($this->attr as $key=>$val)
    {
        ++$i;

        // skip removed attribute
        if ($val===null || $val===false)
            continue;

        $ret .= $this->_[HDOM_INFO_SPACE][$i][0];
        //no value attr: nowrap, checked selected...
        if ($val===true)
            $ret .= $key;
        else {
            switch ($this->_[HDOM_INFO_QUOTE][$i])
            {
                case HDOM_QUOTE_DOUBLE: $quote = '"'; break;
                case HDOM_QUOTE_SINGLE: $quote = '\''; break;
                default: $quote = '';
            }
            // key, space before '=', '=', space after '=', then the quoted value
            $ret .= $key.$this->_[HDOM_INFO_SPACE][$i][1].'='.$this->_[HDOM_INFO_SPACE][$i][2].$quote.$val.$quote;
        }
    }
    $ret = $this->dom->restore_noise($ret);
    return $ret . $this->_[HDOM_INFO_ENDSPACE] . '>';
}
503 |
/**
 * Find elements by css selector.
 * PaperG - added ability for find to lowercase the value of the selector.
 *
 * @param string   $selector  Selector string; comma separated groups are searched
 *                            one after another and their results merged
 * @param int|null $idx       null returns every match as an array; otherwise the
 *                            match at that index (negative counts from the end)
 * @param bool     $lowercase Passed on to seek()
 * @return array|simple_html_dom_node|null
 */
function find($selector, $idx=null, $lowercase=false)
{
    $selectors = $this->parse_selector($selector);
    if (($count=count($selectors))===0) return array();
    // Matched node indexes (into $this->dom->nodes) collected as array keys.
    $found_keys = array();

    // find each selector
    for ($c=0; $c<$count; ++$c)
    {
        // The change on the below line was documented on the sourceforge code tracker id 2788009
        // used to be: if (($levle=count($selectors[0]))===0) return array();
        if (($levle=count($selectors[$c]))===0) return array();
        if (!isset($this->_[HDOM_INFO_BEGIN])) return array();

        // The search starts from this node.
        $head = array($this->_[HDOM_INFO_BEGIN]=>1);

        // handle descendant selectors, no recursive!
        for ($l=0; $l<$levle; ++$l)
        {
            $ret = array();
            foreach ($head as $k=>$v)
            {
                $n = ($k===-1) ? $this->dom->root : $this->dom->nodes[$k];
                //PaperG - Pass this optional parameter on to the seek function.
                $n->seek($selectors[$c][$l], $ret, $lowercase);
            }
            // The matches of this level are the starting points of the next one.
            $head = $ret;
        }

        foreach ($head as $k=>$v)
        {
            if (!isset($found_keys[$k]))
            {
                $found_keys[$k] = 1;
            }
        }
    }

    // sort keys
    ksort($found_keys);

    $found = array();
    foreach ($found_keys as $k=>$v)
        $found[] = $this->dom->nodes[$k];

    // return nth-element or array
    if (is_null($idx)) return $found;
    else if ($idx<0) $idx = count($found) + $idx;
    return (isset($found[$idx])) ? $found[$idx] : null;
}
556 |
/**
 * Seek for given conditions.
 * PaperG - added parameter to allow for case insensitive testing of the value of a selector.
 *
 * @param array $selector  One parsed selector step: array($tag, $key, $val, $exp, $no_key)
 * @param array $ret       Receives the index of every matching node as a key (by reference)
 * @param bool  $lowercase When true, values are compared case-insensitively
 */
protected function seek($selector, &$ret, $lowercase=false)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

    list($tag, $key, $val, $exp, $no_key) = $selector;

    // xpath index
    if ($tag && $key && is_numeric($key))
    {
        // A numeric key selects the $key-th direct child with a matching tag.
        $count = 0;
        foreach ($this->children as $c)
        {
            if ($tag==='*' || $tag===$c->tag) {
                if (++$count==$key) {
                    $ret[$c->_[HDOM_INFO_BEGIN]] = 1;
                    return;
                }
            }
        }
        return;
    }

    // Work out the index at which the scan below stops. Without an own end
    // position, the end of the nearest ancestor that has one is used,
    // reduced by one for every ancestor skipped on the way.
    $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
    if ($end==0) {
        $parent = $this->parent;
        while (!isset($parent->_[HDOM_INFO_END]) && $parent!==null) {
            $end -= 1;
            $parent = $parent->parent;
        }
        $end += $parent->_[HDOM_INFO_END];
    }

    for ($i=$this->_[HDOM_INFO_BEGIN]+1; $i<$end; ++$i) {
        $node = $this->dom->nodes[$i];

        $pass = true;

        // A bare '*' matches direct children only.
        if ($tag==='*' && !$key) {
            if (in_array($node, $this->children, true))
                $ret[$i] = 1;
            continue;
        }

        // compare tag
        if ($tag && $tag!=$node->tag && $tag!=='*') {$pass=false;}
        // compare key
        if ($pass && $key) {
            if ($no_key) {
                if (isset($node->attr[$key])) $pass=false;
            } else {
                if (($key != "plaintext") && !isset($node->attr[$key])) $pass=false;
            }
        }
        // compare value
        if ($pass && $key && $val && $val!=='*') {
            // If they have told us that this is a "plaintext" search then we want the plaintext of the node - right?
            if ($key == "plaintext") {
                // $node->plaintext actually returns $node->text();
                $nodeKeyValue = $node->text();
            } else {
                // this is a normal search, we want the value of that attribute of the tag.
                $nodeKeyValue = $node->attr[$key];
            }
            if (is_object($debug_object)) {$debug_object->debug_log(2, "testing node: " . $node->tag . " for attribute: " . $key . $exp . $val . " where nodes value is: " . $nodeKeyValue);}

            //PaperG - If lowercase is set, do a case insensitive test of the value of the selector.
            if ($lowercase) {
                $check = $this->match($exp, strtolower($val), strtolower($nodeKeyValue));
            } else {
                $check = $this->match($exp, $val, $nodeKeyValue);
            }
            if (is_object($debug_object)) {$debug_object->debug_log(2, "after match: " . ($check ? "true" : "false"));}

            // handle multiple class
            if (!$check && strcasecmp($key, 'class')===0) {
                foreach (explode(' ',$node->attr[$key]) as $k) {
                    // Without this, there were cases where leading, trailing, or double spaces lead to our comparing blanks - bad form.
                    if (!empty($k)) {
                        if ($lowercase) {
                            $check = $this->match($exp, strtolower($val), strtolower($k));
                        } else {
                            $check = $this->match($exp, $val, $k);
                        }
                        if ($check) break;
                    }
                }
            }
            if (!$check) $pass = false;
        }
        if ($pass) $ret[$i] = 1;
        unset($node);
    }
    // It's passed by reference so this is actually what this function returns.
    if (is_object($debug_object)) {$debug_object->debug_log(1, "EXIT - ret: ", $ret);}
}
655 |
// Compares a node value against a selector value using the selector's
// operator: '=' and '!=' are identity checks, '^=' / '$=' test for a prefix /
// suffix, '*=' runs the pattern as a regular expression (used verbatim when it
// starts with '/', otherwise wrapped as a case-insensitive expression).
// Any other operator yields false.
protected function match($exp, $pattern, $value) {
    global $debug_object;
    if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}

    if ($exp == '=') {
        return ($value===$pattern);
    }
    if ($exp == '!=') {
        return ($value!==$pattern);
    }
    if ($exp == '^=') {
        return preg_match("/^".preg_quote($pattern,'/')."/", $value);
    }
    if ($exp == '$=') {
        return preg_match("/".preg_quote($pattern,'/')."$/", $value);
    }
    if ($exp == '*=') {
        $regex = ($pattern[0]=='/') ? $pattern : "/".$pattern."/i";
        return preg_match($regex, $value);
    }
    return false;
}
677 |
/**
 * Parses a css selector string.
 *
 * @param string $selector_string Selector, e.g. "div#id a.class, span[title=x]"
 * @return array One entry per comma separated group; each group is a list of
 *               steps of the form array($tag, $key, $val, $exp, $no_key)
 */
protected function parse_selector($selector_string) {
    global $debug_object;
    if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}

    // pattern of CSS selectors, modified from mootools
    // Paperg: Add the colon to the attribute, so that it properly finds like google does.
    // Note: if you try to look at this attribute, you MUST use getAttribute since $dom->x:y will fail the php syntax check.
    // The "@?" after the opening bracket means an attribute specifier may start with an @ sign that is NOT captured.
    // Further study is required to determine if this should be documented or removed.
    //
    // The hyphens inside the character classes are escaped: PCRE2 (PHP 7.3+)
    // rejects "\w-:" as an invalid range, which made preg_match_all() fail
    // and every find() come back empty.
    $pattern = "/([\w\-:\*]*)(?:\#([\w\-]+)|\.([\w\-]+))?(?:\[@?(!?[\w\-:]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is";
    preg_match_all($pattern, trim($selector_string).' ', $matches, PREG_SET_ORDER);
    if (is_object($debug_object)) {$debug_object->debug_log(2, "Matches Array: ", $matches);}

    $selectors = array();
    $result = array();

    foreach ($matches as $m) {
        $m[0] = trim($m[0]);
        if ($m[0]==='' || $m[0]==='/' || $m[0]==='//') continue;
        // for browser generated xpath
        if ($m[1]==='tbody') continue;

        list($tag, $key, $val, $exp, $no_key) = array($m[1], null, null, '=', false);
        if (!empty($m[2])) {$key='id'; $val=$m[2];}
        if (!empty($m[3])) {$key='class'; $val=$m[3];}
        if (!empty($m[4])) {$key=$m[4];}
        if (!empty($m[5])) {$exp=$m[5];}
        if (!empty($m[6])) {$val=$m[6];}

        // convert to lowercase
        // The cast keeps the previous result ('' for a missing key) without
        // handing null to strtolower(), which is deprecated since PHP 8.1.
        if ($this->dom->lowercase) {$tag=strtolower($tag); $key=strtolower((string)$key);}
        //elements that do NOT have the specified attribute
        if (isset($key[0]) && $key[0]==='!') {$key=substr($key, 1); $no_key=true;}

        $result[] = array($tag, $key, $val, $exp, $no_key);
        // A comma closes the current group.
        if (trim($m[7])===',') {
            $selectors[] = $result;
            $result = array();
        }
    }
    if (count($result)>0)
        $selectors[] = $result;
    return $selectors;
}
725 |
// Magic read access.
// A set attribute is returned charset-converted; the virtual properties
// outertext, innertext, plaintext and xmltext map to their methods; anything
// else reports whether an attribute of that name exists at all.
function __get($name)
{
    if (isset($this->attr[$name]))
    {
        return $this->convert_text($this->attr[$name]);
    }

    if ($name == 'outertext')
    {
        return $this->outertext();
    }
    if ($name == 'innertext')
    {
        return $this->innertext();
    }
    if ($name == 'plaintext')
    {
        return $this->text();
    }
    if ($name == 'xmltext')
    {
        return $this->xmltext();
    }
    return array_key_exists($name, $this->attr);
}
741 |
// Magic write access.
// outertext and innertext are stored in the info array (innertext replaces the
// text entry when the node has one); every other name is stored as an
// attribute, and a new attribute also gets default spacing and double quotes.
function __set($name, $value)
{
    global $debug_object;
    if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}

    if ($name == 'outertext')
    {
        return $this->_[HDOM_INFO_OUTER] = $value;
    }
    if ($name == 'innertext')
    {
        if (isset($this->_[HDOM_INFO_TEXT]))
        {
            return $this->_[HDOM_INFO_TEXT] = $value;
        }
        return $this->_[HDOM_INFO_INNER] = $value;
    }

    $isNewAttribute = !isset($this->attr[$name]);
    if ($isNewAttribute)
    {
        $this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
        $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
    }
    $this->attr[$name] = $value;
}
761 |
// Magic isset(): the virtual properties outertext, innertext and plaintext
// always exist; otherwise the answer is whether the attribute is present,
// which includes no-value attributes such as nowrap, checked or selected.
function __isset($name)
{
    if (in_array($name, array('outertext', 'innertext', 'plaintext')))
    {
        return true;
    }
    return array_key_exists($name, $this->attr) || isset($this->attr[$name]);
}
773 |
// Magic unset(): removes the attribute when it is set (non-null); anything
// else is left untouched.
function __unset($name) {
    if (!isset($this->attr[$name]))
    {
        return;
    }
    unset($this->attr[$name]);
}
778 |
/**
 * PaperG - Function to convert the text from one character set to another if the two sets are not the same.
 *
 * The source and target charsets are read from the owning dom; without a dom
 * both stay empty and the text is returned unchanged.
 *
 * @param string $text Text to convert
 * @return string Converted text; for a UTF-8 target a leading or trailing BOM is removed
 */
function convert_text($text)
{
    global $debug_object;
    if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}

    $converted_text = $text;

    $sourceCharset = "";
    $targetCharset = "";

    if ($this->dom)
    {
        $sourceCharset = strtoupper($this->dom->_charset);
        $targetCharset = strtoupper($this->dom->_target_charset);
    }
    if (is_object($debug_object)) {$debug_object->debug_log(3, "source charset: " . $sourceCharset . " target charaset: " . $targetCharset);}

    // Only convert when both charsets are known and actually differ.
    if (!empty($sourceCharset) && !empty($targetCharset) && (strcasecmp($sourceCharset, $targetCharset) != 0))
    {
        // Check if the reported encoding could have been incorrect and the text is actually already UTF-8
        if ((strcasecmp($targetCharset, 'UTF-8') == 0) && ($this->is_utf8($text)))
        {
            $converted_text = $text;
        }
        else
        {
            $converted_text = iconv($sourceCharset, $targetCharset, $text);
        }
    }

    // Lets make sure that we don't have that silly BOM issue with any of the utf-8 text we output.
    if ($targetCharset == 'UTF-8')
    {
        if (substr($converted_text, 0, 3) == "\xef\xbb\xbf")
        {
            $converted_text = substr($converted_text, 3);
        }
        if (substr($converted_text, -3) == "\xef\xbb\xbf")
        {
            $converted_text = substr($converted_text, 0, -3);
        }
    }

    return $converted_text;
}
825 |
/**
 * Returns true if $string is valid UTF-8 and false otherwise.
 *
 * The check is structural: every lead byte must be followed by the number of
 * continuation bytes (0x80-0xBF) it announces. Lead bytes announcing up to six
 * byte sequences are accepted, as before.
 *
 * @param mixed $str String to be tested
 * @return boolean
 */
static function is_utf8($str)
{
    $c=0; $b=0;
    $bits=0;
    $len=strlen($str);
    for($i=0; $i<$len; $i++)
    {
        $c=ord($str[$i]);
        // Every byte with the high bit set needs checking. The comparison
        // used to be "> 128", which let a stray 0x80 byte pass as valid.
        if($c > 127)
        {
            if(($c >= 254)) return false;
            elseif($c >= 252) $bits=6;
            elseif($c >= 248) $bits=5;
            elseif($c >= 240) $bits=4;
            elseif($c >= 224) $bits=3;
            elseif($c >= 192) $bits=2;
            else return false; // continuation byte where a lead byte is expected
            // The announced sequence must fit into the remaining bytes.
            if(($i+$bits) > $len) return false;
            while($bits > 1)
            {
                $i++;
                $b=ord($str[$i]);
                if($b < 128 || $b > 191) return false;
                $bits--;
            }
        }
    }
    return true;
}
861 | /*
862 | function is_utf8($string)
863 | {
864 | //this is buggy
865 | return (utf8_encode(utf8_decode($string)) == $string);
866 | }
867 | */
868 |
/**
 * Function to try a few tricks to determine the displayed size of an img on the page.
 * NOTE: This will ONLY work on an IMG tag. Returns FALSE on all other tag types.
 *
 * The tag's own width/height attributes are returned as-is when present.
 * For a dimension that is still unknown, the inline style attribute is
 * consulted and only whole-number pixel values (e.g. "120px") are accepted.
 *
 * @author John Schlick
 * @version April 19 2012
 * @return array|false an array containing the 'height' and 'width' of the image on the page
 *                     (each is -1 if we can't figure it out), or false when this node is not an img.
 */
function get_display_size()
{
    if ($this->tag !== 'img')
    {
        return false;
    }

    // -1 marks a dimension that could not be determined.
    $result = array('height' => -1, 'width' => -1);

    // See if there is a height or width attribute in the tag itself.
    foreach (array('width', 'height') as $dimension)
    {
        if (isset($this->attr[$dimension]))
        {
            $result[$dimension] = $this->attr[$dimension];
        }
    }

    // Now look for an inline style.
    if (isset($this->attr['style']))
    {
        // Thanks to user gnarf from stackoverflow for this regular expression.
        $declarations = array();
        preg_match_all("/([\w-]+)\s*:\s*([^;]+)\s*;?/", $this->attr['style'], $matches, PREG_SET_ORDER);
        foreach ($matches as $match)
        {
            // CSS property names are case-insensitive, and the value capture
            // keeps any whitespace before the ';', so normalise both.
            $declarations[strtolower($match[1])] = trim($match[2]);
        }

        foreach (array('width', 'height') as $dimension)
        {
            // Only fill in a dimension the tag attributes did not provide.
            if ($result[$dimension] != -1 || !isset($declarations[$dimension]))
            {
                continue;
            }

            // check that the last two characters are px (pixels)
            $value = $declarations[$dimension];
            if (strtolower(substr($value, -2)) !== 'px')
            {
                continue;
            }

            // Make sure that it's an integer and not something stupid.
            // Compare against false explicitly: filter_var() returns int 0
            // for "0", which a plain truth test would wrongly reject.
            $pixels = trim(substr($value, 0, -2));
            if (filter_var($pixels, FILTER_VALIDATE_INT) !== false)
            {
                $result[$dimension] = $pixels;
            }
        }
    }

    // Future enhancement:
    // Look in the tag to see if there is a class or id specified that has a height or width attribute to it.

    // Far future enhancement
    // Look at all the parent tags of this image to see if they specify a class or id that has an img selector that specifies a height or width
    // Note that in this case, the class or id will have the img subselector for it to apply to the image.

    // ridiculously far future development
    // If the class or id is specified in a SEPARATE css file thats not on the page, go get it and do what we were just doing for the ones on the page.

    return $result;
}
956 |
// camel naming conventions: camelCase aliases that delegate to this node's own methods and properties

/** Returns this node's attr property. */
function getAllAttributes()
{
    return $this->attr;
}

/** Alias of __get($name). */
function getAttribute($name)
{
    return $this->__get($name);
}

/** Alias of __set($name, $value); returns nothing. */
function setAttribute($name, $value)
{
    $this->__set($name, $value);
}

/** Alias of __isset($name). */
function hasAttribute($name)
{
    return $this->__isset($name);
}

/** Calls __set($name, null); returns nothing. */
function removeAttribute($name)
{
    $this->__set($name, null);
}

/** Returns find("#$id", 0). */
function getElementById($id)
{
    return $this->find("#$id", 0);
}

/** Returns find("#$id", $idx). */
function getElementsById($id, $idx=null)
{
    return $this->find("#$id", $idx);
}

/** Returns find($name, 0). */
function getElementByTagName($name)
{
    return $this->find($name, 0);
}

/** Returns find($name, $idx). */
function getElementsByTagName($name, $idx=null)
{
    return $this->find($name, $idx);
}

/** Alias of parent() called without arguments. */
function parentNode()
{
    return $this->parent();
}

/** Alias of children($idx). */
function childNodes($idx=-1)
{
    return $this->children($idx);
}

/** Alias of first_child(). */
function firstChild()
{
    return $this->first_child();
}

/** Alias of last_child(). */
function lastChild()
{
    return $this->last_child();
}

/** Alias of next_sibling(). */
function nextSibling()
{
    return $this->next_sibling();
}

/** Alias of prev_sibling(). */
function previousSibling()
{
    return $this->prev_sibling();
}

/** Alias of has_child(). */
function hasChildNodes()
{
    return $this->has_child();
}

/** Returns this node's tag property. */
function nodeName()
{
    return $this->tag;
}

/** Calls $node->parent($this) and hands the same $node back to the caller. */
function appendChild($node)
{
    $node->parent($this);

    return $node;
}
976 |
977 | }
978 |
979 | /**
980 | * simple html dom parser
981 | * Paperg - in the find routine: allow us to specify that we want case insensitive testing of the value of the selector.
982 | * Paperg - change $size from protected to public so we can easily access it
983 | * Paperg - added ForceTagsClosed in the constructor which tells us whether we trust the html or not. Default is to NOT trust it.
984 | *
985 | * @package PlaceLocalInclude
986 | */
987 | class simple_html_dom
988 | {
989 | public $root = null;
990 | public $nodes = array();
991 | public $callback = null;
992 | public $lowercase = false;
993 | // Used to keep track of how large the text was when we started.
994 | public $original_size;
995 | public $size;
996 | protected $pos;
997 | protected $doc;
998 | protected $char;
999 | protected $cursor;
1000 | protected $parent;
1001 | protected $noise = array();
1002 | protected $token_blank = " \t\r\n";
1003 | protected $token_equal = ' =/>';
1004 | protected $token_slash = " />\r\n\t";
1005 | protected $token_attr = ' >';
1006 | // Note that this is referenced by a child node, and so it needs to be public for that node to see this information.
1007 | public $_charset = '';
1008 | public $_target_charset = '';
1009 | protected $default_br_text = "";
1010 | public $default_span_text = "";
1011 |
1012 | // use isset instead of in_array, performance boost about 30%...
1013 | protected $self_closing_tags = array('img'=>1, 'br'=>1, 'input'=>1, 'meta'=>1, 'link'=>1, 'hr'=>1, 'base'=>1, 'embed'=>1, 'spacer'=>1);
1014 | protected $block_tags = array('root'=>1, 'body'=>1, 'form'=>1, 'div'=>1, 'span'=>1, 'table'=>1);
1015 | // Known sourceforge issue #2977341
1016 | // B tags that are not closed cause us to return everything to the end of the document.
1017 | protected $optional_closing_tags = array(
1018 | 'tr'=>array('tr'=>1, 'td'=>1, 'th'=>1),
1019 | 'th'=>array('th'=>1),
1020 | 'td'=>array('td'=>1),
1021 | 'li'=>array('li'=>1),
1022 | 'dt'=>array('dt'=>1, 'dd'=>1),
1023 | 'dd'=>array('dd'=>1, 'dt'=>1),
1024 | 'dl'=>array('dd'=>1, 'dt'=>1),
1025 | 'p'=>array('p'=>1),
1026 | 'nobr'=>array('nobr'=>1),
1027 | 'b'=>array('b'=>1),
1028 | 'option'=>array('option'=>1),
1029 | );
1030 |
/**
 * Create the DOM container and, when $str is given, load a document into it.
 *
 * @param string|null $str             HTML markup, a local file path, or an http(s) URL. File paths and URLs go through load_file(), anything else through load().
 * @param bool        $lowercase       Passed through to load() for markup strings.
 * @param bool        $forceTagsClosed When false, the optional-closing-tag table is emptied (i.e. the html is trusted).
 * @param string      $target_charset  Stored in $_target_charset.
 * @param bool        $stripRN         Passed through to load() for markup strings.
 * @param string      $defaultBRText   Passed through to load() for markup strings.
 * @param string      $defaultSpanText Passed through to load() for markup strings.
 */
function __construct($str=null, $lowercase=true, $forceTagsClosed=true, $target_charset=DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    // Forcing tags to be closed implies that we don't trust the html, but it can lead to parsing errors if we SHOULD trust the html.
    // The declared property is $optional_closing_tags; writing to any other
    // name only creates an unused dynamic property. This must also happen
    // before the document is loaded, otherwise it cannot influence the parse
    // of $str. NOTE(review): presumably the parser reads this table - confirm.
    if (!$forceTagsClosed)
    {
        $this->optional_closing_tags = array();
    }

    if ($str)
    {
        // Accept https:// as well as http:// so secure URLs are fetched
        // instead of being parsed as literal markup.
        if (preg_match("/^https?:\/\//i", $str) || is_file($str))
        {
            $this->load_file($str);
        }
        else
        {
            $this->load($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText);
        }
    }

    $this->_target_charset = $target_charset;
}
1050 |
/**
 * Destructor: delegates to clear() when the object is destroyed.
 */
function __destruct() {
    $this->clear();
}
1055 |
1056 | // load html from string
1057 | function load($str, $lowercase=true, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
1058 | {
1059 | global $debug_object;
1060 |
1061 | // prepare
1062 | $this->prepare($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText);
1063 | // strip out cdata
1064 | $this->remove_noise("''is", true);
1065 | // strip out comments
1066 | $this->remove_noise("''is");
1067 | // Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037
1068 | // Script tags removal now preceeds style tag removal.
1069 | // strip out