├── LICENSE
├── README.md
├── api.php
└── lib
├── .htaccess
├── bmkg.php
└── simple_html_dom.php
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2016 Ican Bachors
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # BMKG-API-Web-Scrapping
2 | API untuk menampilkan prakiraan cuaca dan gempa di Indonesia. Sumber dari website BMKG.
3 |
4 | DEMO
5 |
--------------------------------------------------------------------------------
/api.php:
--------------------------------------------------------------------------------
1 | cuaca();
12 |
13 | /* menampilkan informasi gempa
14 | $gempa = $bmkg->gempa();
15 | */
16 |
17 | // output JSON
18 | header('Content-Type: application/json');
19 | header('Access-Control-Allow-Origin: *');
20 | echo json_encode($cuaca, JSON_PRETTY_PRINT);
21 |
--------------------------------------------------------------------------------
/lib/.htaccess:
--------------------------------------------------------------------------------
1 | Deny from all
--------------------------------------------------------------------------------
/lib/bmkg.php:
--------------------------------------------------------------------------------
1 | user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0';
11 | $this->url = 'http://www.bmkg.go.id/BMKG_Pusat/';
12 |
13 | // include simple html dom
14 | require('simple_html_dom.php');
15 | }
16 |
/**
 * Fetch a page from the BMKG site with cURL.
 *
 * @param string $get Path appended to $this->url.
 * @return string The response body, or the sentinel string 'offline' when
 *                the transfer fails or yields a falsy body.
 */
private function ayocurl($get)
{
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_USERAGENT, $this->user_agent);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_REFERER, $this->url);
    curl_setopt($ch, CURLOPT_URL, $this->url . $get);

    $html = curl_exec($ch);

    // Release the handle on every path; previously it was only closed on success.
    curl_close($ch);

    // Callers compare the result against 'offline', so any falsy body maps to it.
    return $html ? $html : 'offline';
}
31 |
/**
 * Look up a coordinate for one of the cities in the built-in table.
 *
 * The city name is normalised before the lookup: markup is stripped,
 * surrounding whitespace is trimmed and spaces become underscores, which
 * is how the table keys are written.
 *
 * @param string $kota City name as scraped from the page.
 * @param string $find Which coordinate to return: 'lat' or 'lng'.
 * @return float|null The coordinate, or null when the city (or the
 *                    requested key) is not in the table.
 */
private function latlng($kota, $find)
{
    $data = array(
        "Banda_Aceh"     => array("lat" => 5.5482904,   "lng" => 95.3237559),
        "Medan"          => array("lat" => 3.5951956,   "lng" => 98.6722227),
        "Pekanbaru"      => array("lat" => 0.5070677,   "lng" => 101.4477793),
        "Batam"          => array("lat" => 1.0456264,   "lng" => 104.0304535),
        "Padang"         => array("lat" => -0.9470832,  "lng" => 100.417181),
        "Jambi"          => array("lat" => -1.6101229,  "lng" => 103.6131203),
        "Palembang"      => array("lat" => -2.9760735,  "lng" => 104.7754307),
        "Pangkal_Pinang" => array("lat" => -2.1316266,  "lng" => 106.1169299),
        "Bengkulu"       => array("lat" => -3.7928451,  "lng" => 102.2607641),
        "Bandar_Lampung" => array("lat" => -5.3971396,  "lng" => 105.2667887),
        "Pontianak"      => array("lat" => -0.0263303,  "lng" => 109.3425039),
        "Samarinda"      => array("lat" => -0.4948232,  "lng" => 117.1436154),
        "Palangkaraya"   => array("lat" => -2.2161048,  "lng" => 113.913977),
        "Banjarmasin"    => array("lat" => -3.3186067,  "lng" => 114.5943784),
        "Manado"         => array("lat" => 1.4748305,   "lng" => 124.8420794),
        "Gorontalo"      => array("lat" => 0.5435442,   "lng" => 123.0567693),
        "Palu"           => array("lat" => -0.9002915,  "lng" => 119.8779987),
        "Kendari"        => array("lat" => -3.9984597,  "lng" => 122.5129742),
        "Makassar"       => array("lat" => -5.1476651,  "lng" => 119.4327314),
        "Majene"         => array("lat" => -3.0297251,  "lng" => 118.9062794),
        "Ternate"        => array("lat" => 0.7898868,   "lng" => 127.3753792),
        "Ambon"          => array("lat" => -3.6553932,  "lng" => 128.1907723),
        "Jayapura"       => array("lat" => -2.5916025,  "lng" => 140.6689995),
        "Sorong"         => array("lat" => -0.8819986,  "lng" => 131.2954834),
        "Biak"           => array("lat" => -1.0381022,  "lng" => 135.9800848),
        "Manokwari"      => array("lat" => -0.8614531,  "lng" => 134.0620421),
        "Merauke"        => array("lat" => -8.4991117,  "lng" => 140.4049814),
        "Kupang"         => array("lat" => -10.1771997, "lng" => 123.6070329),
        "Sumbawa_Besar"  => array("lat" => -8.504043,   "lng" => 117.428497),
        "Mataram"        => array("lat" => -8.5769951,  "lng" => 116.1004894),
        "Denpasar"       => array("lat" => -8.6704582,  "lng" => 115.2126293),
        "Jakarta"        => array("lat" => -6.2087634,  "lng" => 106.845599),
        "Serang"         => array("lat" => -6.1103661,  "lng" => 106.1639749),
        "Bandung"        => array("lat" => -6.9174639,  "lng" => 107.6191228),
        "Semarang"       => array("lat" => -7.0051453,  "lng" => 110.4381254),
        "Yogyakarta"     => array("lat" => -7.7955798,  "lng" => 110.3694896),
        "Surabaya"       => array("lat" => -7.2574719,  "lng" => 112.7520883),
    );

    // The caller hands over raw cell HTML, so drop tags and padding before matching.
    $kota2 = str_replace(" ", "_", trim(strip_tags($kota)));

    // Unknown cities used to hit an undefined index; report them as null instead.
    return isset($data[$kota2][$find]) ? $data[$kota2][$find] : null;
}
188 |
/**
 * Scrape the nationwide weather forecast table from the BMKG site.
 *
 * @return array On success: 'status' => 'success', 'view' => 'cuaca' and
 *               'data' => one entry per city with coordinates plus today's
 *               and tomorrow's forecast. On failure: 'status' => 'error'
 *               and a 'message' ('offline' when the fetch failed,
 *               'invalid response' when the page could not be parsed).
 */
function cuaca()
{
    $data = $this->ayocurl('Informasi_Cuaca/Prakiraan_Cuaca/Prakiraan_Cuaca_Indonesia.bmkg');

    $result = array();

    if ($data == "offline") {
        $result['status'] = "error";
        $result['message'] = "offline";
        return $result;
    }

    // str_get_html() returns false for empty or oversized input, and the
    // table lookup returns null when the page layout is not the expected one.
    $html = str_get_html($data);
    $table = $html ? $html->find('table[class=table-hover]', 0) : null;
    if (!$table) {
        $result['status'] = "error";
        $result['message'] = "invalid response";
        return $result;
    }

    $result['status'] = "success";
    $result['view'] = "cuaca";
    $result['data'] = array();

    // The second and third header cells carry the dates for today and tomorrow.
    $th_sekarang = $table->find('th', 1);
    $th_besok = $table->find('th', 2);
    $sekarang = explode("Ini", $th_sekarang ? $th_sekarang->innertext : '');
    $besok = explode("Hari", $th_besok ? $th_besok->innertext : '');
    $tgl_sekarang = isset($sekarang[1]) ? strip_tags($sekarang[1]) : '';
    $tgl_besok = isset($besok[1]) ? strip_tags($besok[1]) : '';

    foreach ($table->find('tr') as $i => $tr) {
        // Row 0 is the header row.
        if ($i == 0) {
            continue;
        }

        $td_kota = $tr->find('td', 0);
        $td_sekarang = $tr->find('td', 1);
        $td_besok = $tr->find('td', 2);
        // Skip rows that do not have the three expected data cells.
        if (!$td_kota || !$td_sekarang || !$td_besok) {
            continue;
        }

        $kota = strip_tags($td_kota->innertext);

        $result['data'][] = array(
            'kota' => $kota,
            'maps' => array(
                // Cast to string to keep the JSON value type; unknown cities yield ''.
                'latitude' => (string) $this->latlng(trim($kota), 'lat'),
                'longitude' => (string) $this->latlng(trim($kota), 'lng')
            ),
            'prakiraan' => array(
                'sekarang' => $this->parse_prakiraan($td_sekarang->innertext, $tgl_sekarang),
                'besok' => $this->parse_prakiraan($td_besok->innertext, $tgl_besok)
            )
        );
    }

    return $result;
}

/**
 * Split one forecast cell into weather, temperature range and humidity range.
 *
 * The cell text is cut at the "Suhu : " and "Kelembaban : " labels and each
 * range is cut at " - ". Missing pieces become '' (or '0' for the maximum
 * temperature, which is passed through intval()).
 *
 * @param string $cell Inner HTML of the forecast cell.
 * @param string $tgl  Date label to attach to the forecast.
 * @return array Forecast with the keys 'tgl', 'cuaca', 'suhu' and 'kelembaban'.
 */
private function parse_prakiraan($cell, $tgl)
{
    $cuaca = explode("Suhu : ", $cell);
    $suhu = explode("Kelembaban : ", isset($cuaca[1]) ? $cuaca[1] : '');
    $suhu_minmax = explode(" - ", $suhu[0]);
    $kelembaban_minmax = explode(" - ", isset($suhu[1]) ? $suhu[1] : '');

    return array(
        'tgl' => $tgl,
        'cuaca' => strip_tags($cuaca[0]),
        'suhu' => array(
            'min' => strip_tags($suhu_minmax[0]),
            // intval() drops whatever trails the number; the value stays a string.
            'max' => (string) intval(isset($suhu_minmax[1]) ? $suhu_minmax[1] : 0)
        ),
        'kelembaban' => array(
            'min' => strip_tags($kelembaban_minmax[0]),
            'max' => strip_tags(str_replace(" %", "", isset($kelembaban_minmax[1]) ? $kelembaban_minmax[1] : ''))
        )
    );
}
264 |
/**
 * Scrape the table of felt earthquakes from the BMKG site.
 *
 * @return array On success: 'status' => 'success', 'view' => 'gempa' and
 *               'data' => one entry per table row. On failure:
 *               'status' => 'error' and a 'message' ('offline' when the
 *               fetch failed, 'invalid response' when the page could not
 *               be parsed).
 */
function gempa()
{
    $data = $this->ayocurl('Gempabumi_-_Tsunami/Gempabumi/Gempabumi_Dirasakan.bmkg');

    $result = array();

    if ($data == "offline") {
        $result['status'] = "error";
        $result['message'] = "offline";
        return $result;
    }

    // str_get_html() returns false for empty or oversized input, and the
    // table lookup returns null when the page layout is not the expected one.
    $html = str_get_html($data);
    $table = $html ? $html->find('table[class=table-hover]', 0) : null;
    if (!$table) {
        $result['status'] = "error";
        $result['message'] = "invalid response";
        return $result;
    }

    $result['status'] = "success";
    $result['view'] = "gempa";
    $result['data'] = array();

    foreach ($table->find('tr') as $i => $tr) {
        // Row 0 is the header row.
        if ($i == 0) {
            continue;
        }

        // Cells 1..6 are read below; skip rows that are too short.
        $td = $tr->find('td');
        if (count($td) < 7) {
            continue;
        }

        $span = $tr->find('span[class=label-warning]', 0);
        $img = $tr->find('img', 0);

        // The location is the text before the first line break in the cell;
        // accept either an HTML <br> or a raw newline as the separator.
        $ex = preg_split('/<br\s*\/?>|\R/i', $td[6]->innertext);

        $result['data'][] = array(
            'tgl' => strip_tags($td[1]->innertext),
            'waktu' => strip_tags($td[2]->innertext),
            'lintang_bujur' => strip_tags($td[3]->innertext),
            'magnitudo' => strip_tags($td[4]->innertext),
            'kedalaman' => strip_tags($td[5]->innertext),
            'lokasi' => strip_tags($ex[0]),
            'dirasakan' => $span ? strip_tags($span->innertext) : '',
            'img' => $img ? strip_tags($img->src) : ''
        );
    }

    return $result;
}
312 |
313 | }
314 |
315 | ?>
316 |
--------------------------------------------------------------------------------
/lib/simple_html_dom.php:
--------------------------------------------------------------------------------
1 | size is the "real" number of bytes the dom was created from.
17 | * but for most purposes, it's a really good estimation.
18 | * Paperg - Added the forceTagsClosed to the dom constructor. Forcing tags closed is great for malformed html, but it CAN lead to parsing errors.
19 | * Allow the user to tell us how much they trust the html.
20 | * Paperg add the text and plaintext to the selectors for the find syntax. plaintext implies text in the innertext of a node. text implies that the tag is a text node.
21 | * This allows for us to find tags based on the text they contain.
22 | * Create find_ancestor_tag to see if a tag is - at any level - inside of another specific tag.
23 | * Paperg: added parse_charset so that we know about the character set of the source document.
24 | * NOTE: If the user's system has a routine called get_last_retrieve_url_contents_content_type available, we will assume it's returning the content-type header from the
25 | * last transfer or curl_exec, and we will parse that and use it in preference to any other method of charset detection.
26 | *
27 | * Found infinite loop in the case of broken html in restore_noise. Rewrote to protect from that.
28 | * PaperG (John Schlick) Added get_display_size for "IMG" tags.
29 | *
30 | * Licensed under The MIT License
31 | * Redistributions of files must retain the above copyright notice.
32 | *
33 | * @author S.C. Chen
34 | * @author John Schlick
35 | * @author Rus Carroll
36 | * @version 1.5 ($Rev: 196 $)
37 | * @package PlaceLocalInclude
38 | * @subpackage simple_html_dom
39 | */
40 |
41 | /**
42 | * All of the Defines for the classes below.
43 | * @author S.C. Chen
44 | */
// Node types.
define('HDOM_TYPE_ELEMENT', 1);
define('HDOM_TYPE_COMMENT', 2);
define('HDOM_TYPE_TEXT',    3);
define('HDOM_TYPE_ENDTAG',  4);
define('HDOM_TYPE_ROOT',    5);
define('HDOM_TYPE_UNKNOWN', 6);

// Attribute quoting styles.
define('HDOM_QUOTE_DOUBLE', 0);
define('HDOM_QUOTE_SINGLE', 1);
define('HDOM_QUOTE_NO',     3);

// Keys of a node's "info" array ($node->_).
define('HDOM_INFO_BEGIN',    0);
define('HDOM_INFO_END',      1);
define('HDOM_INFO_QUOTE',    2);
define('HDOM_INFO_SPACE',    3);
define('HDOM_INFO_TEXT',     4);
define('HDOM_INFO_INNER',    5);
define('HDOM_INFO_OUTER',    6);
define('HDOM_INFO_ENDSPACE', 7);

// Parser defaults and limits.
define('DEFAULT_TARGET_CHARSET', 'UTF-8');
define('DEFAULT_BR_TEXT', "\r\n");
define('DEFAULT_SPAN_TEXT', " ");
define('MAX_FILE_SIZE', 600000);
66 | // helper functions
67 | // -----------------------------------------------------------------------------
/**
 * Build a DOM from the contents of a file or URL.
 *
 * @param string   $url              File name or URL handed to file_get_contents().
 * @param bool     $use_include_path Passed through to file_get_contents().
 * @param resource $context          Stream context, or null.
 * @param int      $offset           Byte offset to start reading at; a negative
 *                                   value (the default) means "from the start".
 * @param int      $maxLen           Accepted for signature compatibility; not used.
 * @param bool     $lowercase        Passed to the DOM constructor and to load().
 * @param bool     $forceTagsClosed  Passed to the DOM constructor.
 * @param string   $target_charset   Passed to the DOM constructor.
 * @param bool     $stripRN          Passed to the DOM constructor and to load().
 * @param string   $defaultBRText    Passed to the DOM constructor.
 * @param string   $defaultSpanText  Passed to the DOM constructor.
 * @return simple_html_dom|false The loaded DOM, or false when nothing could be
 *                               read or the content exceeds MAX_FILE_SIZE.
 */
function file_get_html($url, $use_include_path = false, $context=null, $offset = -1, $maxLen=-1, $lowercase = true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    // We DO force the tags to be terminated.
    $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);

    // Since PHP 7.1 a negative offset seeks from the end of the stream (and
    // fails on non-seekable streams such as HTTP). The -1 default was meant as
    // "no offset", so clamp negatives to 0.
    $contents = file_get_contents($url, $use_include_path, $context, $offset < 0 ? 0 : $offset);

    if (empty($contents) || strlen($contents) > MAX_FILE_SIZE)
    {
        // Release the unused DOM, as str_get_html() does on its failure path.
        $dom->clear();
        return false;
    }

    // The second parameter can force the selectors to all be lowercase.
    $dom->load($contents, $lowercase, $stripRN);
    return $dom;
}
87 |
/**
 * Build a DOM from an HTML string.
 *
 * @return simple_html_dom|false The loaded DOM, or false when the string is
 *                               empty or longer than MAX_FILE_SIZE bytes.
 */
function str_get_html($str, $lowercase=true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);

    $usable = !empty($str) && strlen($str) <= MAX_FILE_SIZE;
    if (!$usable)
    {
        // Nothing to parse: release the freshly created DOM and report failure.
        $dom->clear();
        return false;
    }

    $dom->load($str, $lowercase, $stripRN);
    return $dom;
}
100 |
/**
 * Print a node's subtree by delegating to the node's dump() method.
 *
 * @param object $node      Node to dump; must provide dump($show_attr, $deep).
 * @param bool   $show_attr Whether attributes are printed.
 * @param int    $deep      Starting indentation depth.
 */
function dump_html_tree($node, $show_attr=true, $deep=0)
{
    // Forward the caller's options; previously the node itself was passed as
    // $show_attr and both options were ignored.
    $node->dump($show_attr, $deep);
}
106 |
107 |
108 | /**
109 | * simple html dom node
110 | * PaperG - added ability for "find" routine to lowercase the value of the selector.
111 | * PaperG - added $tag_start to track the start position of the tag in the total byte index
112 | *
113 | * @package PlaceLocalInclude
114 | */
115 | class simple_html_dom_node
116 | {
117 | public $nodetype = HDOM_TYPE_TEXT;
118 | public $tag = 'text';
119 | public $attr = array();
120 | public $children = array();
121 | public $nodes = array();
122 | public $parent = null;
123 | // The "info" array - see HDOM_INFO_... for what each element contains.
124 | public $_ = array();
125 | public $tag_start = 0;
126 | private $dom = null;
127 |
/**
 * Attach the node to its owning DOM and register it in the DOM's node list.
 *
 * @param object $dom Owning DOM; the node appends itself to $dom->nodes.
 */
function __construct($dom)
{
    $dom->nodes[] = $this;
    $this->dom = $dom;
}
133 |
/**
 * Drop the node's references when it is destroyed.
 */
function __destruct()
{
    $this->clear();
}
138 |
/**
 * String conversion yields the node's outer text.
 *
 * @return string
 */
function __toString()
{
    return $this->outertext();
}
143 |
/**
 * Null out every reference this node holds to other nodes and to the DOM.
 *
 * Works around the PHP 5 circular-reference memory leak.
 */
function clear()
{
    $this->children = null;
    $this->nodes = null;
    $this->parent = null;
    $this->dom = null;
}
152 |
/**
 * Echo this node's tag (optionally with attributes) and then, recursively,
 * every entry of $this->nodes one level deeper.
 *
 * @param bool $show_attr Whether to print the attribute list.
 * @param int  $deep      Indentation depth; one space per level.
 */
function dump($show_attr=true, $deep=0)
{
    $line = str_repeat(' ', $deep) . $this->tag;

    if ($show_attr && count($this->attr)>0)
    {
        $line .= '(';
        foreach ($this->attr as $k=>$v)
        {
            $line .= "[$k]=>\"" . $this->$k . '", ';
        }
        $line .= ')';
    }

    echo $line . "\n";

    if (!$this->nodes)
    {
        return;
    }

    foreach ($this->nodes as $child)
    {
        $child->dump($show_attr, $deep+1);
    }
}
176 |
177 |
/**
 * Debugging aid: describe this single node (tag, attributes, info array,
 * text, inner info, child/node counts and tag start) on one line.
 *
 * @param bool $echo When true the line is echoed and nothing is returned;
 *                   when false the line is returned instead.
 * @return string|null
 */
function dump_node($echo=true)
{
    $string = $this->tag;
    if (count($this->attr)>0)
    {
        $string .= '(';
        foreach ($this->attr as $k=>$v)
        {
            $string .= "[$k]=>\"".$this->$k.'", ';
        }
        $string .= ')';
    }
    if (count($this->_)>0)
    {
        $string .= ' $_ (';
        foreach ($this->_ as $k=>$v)
        {
            if (is_array($v))
            {
                $string .= "[$k]=>(";
                foreach ($v as $k2=>$v2)
                {
                    $string .= "[$k2]=>\"".$v2.'", ';
                }
                $string .= ")";
            } else {
                $string .= "[$k]=>\"".$v.'", ';
            }
        }
        $string .= ")";
    }

    if (isset($this->text))
    {
        $string .= " text: (" . $this->text . ")";
    }

    $string .= " HDOM_INNER_INFO: '";
    // Read this node's own info; the undefined $node used before meant the
    // inner info was never printed.
    if (isset($this->_[HDOM_INFO_INNER]))
    {
        $string .= $this->_[HDOM_INFO_INNER] . "'";
    }
    else
    {
        $string .= ' NULL ';
    }

    $string .= " children: " . count($this->children);
    $string .= " nodes: " . count($this->nodes);
    $string .= " tag_start: " . $this->tag_start;
    $string .= "\n";

    if ($echo)
    {
        echo $string;
        return;
    }
    else
    {
        return $string;
    }
}
242 |
/**
 * Get the parent node, or re-parent this node when one is passed in.
 *
 * Known limitation: when re-parenting, the node is appended to the new
 * parent's nodes and children lists but is not removed from the lists of
 * its previous parent.
 *
 * @param object|null $parent New parent, or null to only read the current one.
 * @return object|null The (possibly new) parent.
 */
function parent($parent=null)
{
    if ($parent === null)
    {
        return $this->parent;
    }

    $this->parent = $parent;
    $parent->nodes[] = $this;
    $parent->children[] = $this;

    return $parent;
}
258 |
/**
 * Tell whether the node has at least one child.
 *
 * @return bool
 */
function has_child()
{
    return (bool) $this->children;
}
264 |
/**
 * Return all children, or the child at a given index.
 *
 * @param int $idx Index of the wanted child; -1 (the default) returns the whole list.
 * @return mixed The children list, a single child, or null when the index does not exist.
 */
function children($idx=-1)
{
    if ($idx===-1)
    {
        return $this->children;
    }

    return isset($this->children[$idx]) ? $this->children[$idx] : null;
}
275 |
/**
 * Return the first child, or null when there are no children.
 */
function first_child()
{
    return (count($this->children)>0) ? $this->children[0] : null;
}
285 |
/**
 * Return the last child, or null when there are no children.
 */
function last_child()
{
    $total = count($this->children);

    return ($total>0) ? $this->children[$total-1] : null;
}
295 |
/**
 * Return the sibling that follows this node in its parent's children list.
 *
 * @return object|null Null when the node has no parent or is the last child.
 */
function next_sibling()
{
    if ($this->parent===null)
    {
        return null;
    }

    $siblings = $this->parent->children;
    $total = count($siblings);

    // Locate this node among its siblings.
    for ($pos = 0; $pos<$total && $this!==$siblings[$pos]; ++$pos);

    ++$pos;
    return ($pos>=$total) ? null : $siblings[$pos];
}
316 |
/**
 * Return the sibling that precedes this node in its parent's children list.
 *
 * @return object|null Null when the node has no parent or is the first child.
 */
function prev_sibling()
{
    if ($this->parent===null)
    {
        return null;
    }

    $siblings = $this->parent->children;
    $total = count($siblings);

    // Locate this node among its siblings.
    for ($pos = 0; $pos<$total && $this!==$siblings[$pos]; ++$pos);

    --$pos;
    return ($pos<0) ? null : $siblings[$pos];
}
328 |
/**
 * Walk from this node towards the root and return the first node whose tag
 * equals $tag. The node itself is included in the comparison.
 *
 * @param string $tag Tag name to look for.
 * @return object|null The matching node, or null when none is found.
 */
function find_ancestor_tag($tag)
{
    global $debugObject;
    if (is_object($debugObject)) { $debugObject->debugLogEntry(1); }

    for ($candidate = $this; !is_null($candidate); $candidate = $candidate->parent)
    {
        if (is_object($debugObject)) { $debugObject->debugLog(2, "Current tag is: " . $candidate->tag); }

        if ($candidate->tag == $tag)
        {
            return $candidate;
        }
    }

    return null;
}
350 |
/**
 * Return the node's inner HTML.
 *
 * An explicitly stored inner text wins, then the stored text of a text-like
 * node (with noise restored); otherwise the outer text of every entry in
 * $this->nodes is concatenated.
 *
 * @return string
 */
function innertext()
{
    if (isset($this->_[HDOM_INFO_INNER]))
    {
        return $this->_[HDOM_INFO_INNER];
    }
    if (isset($this->_[HDOM_INFO_TEXT]))
    {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }

    $html = '';
    foreach ($this->nodes as $child)
    {
        $html .= $child->outertext();
    }
    return $html;
}
362 |
/**
 * Return the node's outer HTML: begin tag, inner content and end tag.
 *
 * The root node returns its inner text. A DOM callback, when set, is invoked
 * with this node first. An explicitly stored outer text wins, then the
 * stored text of a text-like node (with noise restored).
 *
 * @return string
 */
function outertext()
{
    global $debugObject;
    if (is_object($debugObject))
    {
        $text = '';
        if ($this->tag == 'text')
        {
            if (!empty($this->text))
            {
                $text = " with text: " . $this->text;
            }
        }
        $debugObject->debugLog(1, 'Innertext of tag: ' . $this->tag . $text);
    }

    if ($this->tag==='root') return $this->innertext();

    // trigger callback
    if ($this->dom && $this->dom->callback!==null)
    {
        call_user_func_array($this->dom->callback, array($this));
    }

    if (isset($this->_[HDOM_INFO_OUTER])) return $this->_[HDOM_INFO_OUTER];
    if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);

    // render begin tag
    if ($this->dom && $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]])
    {
        $ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();
    } else {
        $ret = "";
    }

    // render inner text
    if (isset($this->_[HDOM_INFO_INNER]))
    {
        // If it's a br tag... don't return the HDOM_INNER_INFO that we may or may not have added.
        if ($this->tag != "br")
        {
            $ret .= $this->_[HDOM_INFO_INNER];
        }
    } else {
        if ($this->nodes)
        {
            foreach ($this->nodes as $n)
            {
                $ret .= $this->convert_text($n->outertext());
            }
        }
    }

    // render end tag: a closing tag needs the "</" opener, which was missing
    if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END]!=0)
        $ret .= '</'.$this->tag.'>';
    return $ret;
}
422 |
/**
 * Return the node's plain text.
 *
 * An explicitly stored inner text wins. Text nodes return their stored text
 * (with noise restored); comment and unknown nodes, and script/style
 * elements, return ''. Otherwise the text of every entry in $this->nodes is
 * concatenated, and a span gets the DOM's default span text appended so
 * adjacent spans do not run together.
 *
 * @return string
 */
function text()
{
    if (isset($this->_[HDOM_INFO_INNER]))
    {
        return $this->_[HDOM_INFO_INNER];
    }

    $type = $this->nodetype;
    if ($type == HDOM_TYPE_TEXT)
    {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }
    if ($type == HDOM_TYPE_COMMENT || $type == HDOM_TYPE_UNKNOWN)
    {
        return '';
    }

    if (strcasecmp($this->tag, 'script')===0 || strcasecmp($this->tag, 'style')===0)
    {
        return '';
    }

    // In rare cases $this->nodes is NULL for an element node (observed for
    // some span and p tags); treat that as "no text".
    if (is_null($this->nodes))
    {
        return '';
    }

    $plain = '';
    foreach ($this->nodes as $child)
    {
        $plain .= $this->convert_text($child->text());
    }

    if ($this->tag == "span")
    {
        $plain .= $this->dom->default_span_text;
    }

    return $plain;
}
457 |
/**
 * Return the node's inner text with any CDATA wrapper removed.
 *
 * @return string
 */
function xmltext()
{
    $ret = $this->innertext();
    // Strip the CDATA opener (case-insensitively) and closer; the search
    // strings had been lost, leaving a replace of '' with '' that did nothing.
    $ret = str_ireplace('<![CDATA[', '', $ret);
    $ret = str_replace(']]>', '', $ret);
    return $ret;
}
465 |
/**
 * Render the node's begin tag, including its attributes.
 *
 * Text-like nodes (text, comment, unknown) return their stored text with
 * noise restored. Attributes set to null or false are skipped; attributes
 * set to true are rendered as a bare name (nowrap, checked, selected...).
 * Recorded spacing and quoting are reproduced from the info array.
 *
 * @return string
 */
function makeup()
{
    // text, comment, unknown
    if (isset($this->_[HDOM_INFO_TEXT]))
    {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }

    $html = '<'.$this->tag;
    $next = 0;

    foreach ($this->attr as $name=>$value)
    {
        // Position of this attribute in the recorded space/quote lists.
        $slot = $next++;

        // skip removed attribute
        if ($value===null || $value===false)
        {
            continue;
        }

        $html .= $this->_[HDOM_INFO_SPACE][$slot][0];

        // no value attr
        if ($value===true)
        {
            $html .= $name;
            continue;
        }

        $mode = $this->_[HDOM_INFO_QUOTE][$slot];
        if ($mode == HDOM_QUOTE_DOUBLE)
        {
            $quote = '"';
        }
        elseif ($mode == HDOM_QUOTE_SINGLE)
        {
            $quote = '\'';
        }
        else
        {
            $quote = '';
        }

        $html .= $name.$this->_[HDOM_INFO_SPACE][$slot][1].'='.$this->_[HDOM_INFO_SPACE][$slot][2].$quote.$value.$quote;
    }

    $html = $this->dom->restore_noise($html);
    return $html . $this->_[HDOM_INFO_ENDSPACE] . '>';
}
500 |
/**
 * Find descendant nodes matching a CSS selector.
 *
 * @param string   $selector  Selector string; comma-separated groups are unioned.
 * @param int|null $idx       Null returns every match; otherwise the match at
 *                            that index (negative counts from the end).
 * @param bool     $lowercase Passed to seek() for case-insensitive value tests.
 * @return mixed An array of nodes, a single node, or null when $idx does not exist.
 */
function find($selector, $idx=null, $lowercase=false)
{
    $groups = $this->parse_selector($selector);
    if (count($groups)===0) return array();

    $matched = array();

    // Resolve each comma-separated selector group in turn.
    foreach ($groups as $chain)
    {
        if (count($chain)===0) return array();
        if (!isset($this->_[HDOM_INFO_BEGIN])) return array();

        $frontier = array($this->_[HDOM_INFO_BEGIN]=>1);

        // Descendant selectors are handled level by level, without recursion.
        foreach ($chain as $step)
        {
            $hits = array();
            foreach ($frontier as $k=>$v)
            {
                $start = ($k===-1) ? $this->dom->root : $this->dom->nodes[$k];
                $start->seek($step, $hits, $lowercase);
            }
            $frontier = $hits;
        }

        foreach ($frontier as $k=>$v)
        {
            $matched[$k] = 1;
        }
    }

    // Return the nodes in document order.
    ksort($matched);

    $found = array();
    foreach ($matched as $k=>$v)
    {
        $found[] = $this->dom->nodes[$k];
    }

    if (is_null($idx))
    {
        return $found;
    }
    if ($idx<0)
    {
        $idx = count($found) + $idx;
    }
    return isset($found[$idx]) ? $found[$idx] : null;
}
551 |
/**
 * Collect the nodes below this one that satisfy a single parsed selector.
 *
 * @param array $selector  One selector tuple: tag, key, value, operator, no_key flag.
 * @param array $ret       Output, by reference: matching node indexes are set to 1.
 * @param bool  $lowercase When true, selector values are compared case-insensitively.
 */
protected function seek($selector, &$ret, $lowercase=false)
{
    global $debugObject;
    if (is_object($debugObject)) { $debugObject->debugLogEntry(1); }

    list($tag, $key, $val, $exp, $no_key) = $selector;

    // xpath index
    if ($tag && $key && is_numeric($key))
    {
        $count = 0;
        foreach ($this->children as $c)
        {
            if ($tag==='*' || $tag===$c->tag) {
                if (++$count==$key) {
                    $ret[$c->_[HDOM_INFO_BEGIN]] = 1;
                    return;
                }
            }
        }
        return;
    }

    $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
    if ($end==0) {
        $parent = $this->parent;
        // Test for null first so a missing ancestor is never dereferenced.
        while ($parent!==null && !isset($parent->_[HDOM_INFO_END])) {
            $end -= 1;
            $parent = $parent->parent;
        }
        // With no ancestor carrying an end index, $end stays <= 0 and the
        // scan below simply does not run.
        if ($parent!==null) {
            $end += $parent->_[HDOM_INFO_END];
        }
    }

    for ($i=$this->_[HDOM_INFO_BEGIN]+1; $i<$end; ++$i) {
        $node = $this->dom->nodes[$i];

        $pass = true;

        if ($tag==='*' && !$key) {
            if (in_array($node, $this->children, true))
                $ret[$i] = 1;
            continue;
        }

        // compare tag
        if ($tag && $tag!=$node->tag && $tag!=='*') {$pass=false;}
        // compare key
        if ($pass && $key) {
            if ($no_key) {
                if (isset($node->attr[$key])) $pass=false;
            } else {
                if (($key != "plaintext") && !isset($node->attr[$key])) $pass=false;
            }
        }
        // compare value
        if ($pass && $key && $val && $val!=='*') {
            // If they have told us that this is a "plaintext" search then we want the plaintext of the node - right?
            if ($key == "plaintext") {
                // $node->plaintext actually returns $node->text();
                $nodeKeyValue = $node->text();
            } else {
                // this is a normal search, we want the value of that attribute of the tag.
                // The attribute can be absent here for negated-key selectors.
                $nodeKeyValue = isset($node->attr[$key]) ? $node->attr[$key] : null;
            }
            if (is_object($debugObject)) {$debugObject->debugLog(2, "testing node: " . $node->tag . " for attribute: " . $key . $exp . $val . " where nodes value is: " . $nodeKeyValue);}

            //PaperG - If lowercase is set, do a case insensitive test of the value of the selector.
            if ($lowercase) {
                $check = $this->match($exp, strtolower($val), strtolower((string) $nodeKeyValue));
            } else {
                $check = $this->match($exp, $val, $nodeKeyValue);
            }
            if (is_object($debugObject)) {$debugObject->debugLog(2, "after match: " . ($check ? "true" : "false"));}

            // handle multiple class
            if (!$check && strcasecmp($key, 'class')===0) {
                foreach (explode(' ', (string) $nodeKeyValue) as $k) {
                    // Without this, there were cases where leading, trailing, or double spaces lead to our comparing blanks - bad form.
                    if (!empty($k)) {
                        if ($lowercase) {
                            $check = $this->match($exp, strtolower($val), strtolower($k));
                        } else {
                            $check = $this->match($exp, $val, $k);
                        }
                        if ($check) break;
                    }
                }
            }
            if (!$check) $pass = false;
        }
        if ($pass) $ret[$i] = 1;
        unset($node);
    }
    // It's passed by reference so this is actually what this function returns.
    if (is_object($debugObject)) {$debugObject->debugLog(1, "EXIT - ret: ", $ret);}
}
650 |
/**
 * Test a value against a selector pattern using the given operator.
 *
 * '=' and '!=' compare strictly; '^=' and '$=' test for a literal prefix or
 * suffix; '*=' treats the pattern as a regular expression (used as-is when
 * it starts with '/', otherwise wrapped as a case-insensitive pattern).
 *
 * @param string $exp     Operator: '=', '!=', '^=', '$=' or '*='.
 * @param string $pattern Selector value.
 * @param mixed  $value   Node value to test.
 * @return mixed A boolean for '=' and '!=', the preg_match() result for the
 *               other operators, false for an unknown operator.
 */
protected function match($exp, $pattern, $value) {
    global $debugObject;
    if (is_object($debugObject)) {$debugObject->debugLogEntry(1);}

    if ($exp == '=') {
        return ($value===$pattern);
    }
    if ($exp == '!=') {
        return ($value!==$pattern);
    }
    if ($exp == '^=') {
        return preg_match("/^".preg_quote($pattern,'/')."/", $value);
    }
    if ($exp == '$=') {
        return preg_match("/".preg_quote($pattern,'/')."$/", $value);
    }
    if ($exp == '*=') {
        $regex = ($pattern[0]=='/') ? $pattern : "/".$pattern."/i";
        return preg_match($regex, $value);
    }

    return false;
}
672 |
/**
 * Parse a CSS-style selector string into selector groups.
 *
 * @param string $selector_string Selector as passed to find().
 * @return array One entry per comma-separated group; each group is a list of
 *               tuples (tag, key, value, operator, no_key) describing one
 *               step of a descendant chain.
 */
protected function parse_selector($selector_string) {
    global $debugObject;
    if (is_object($debugObject)) {$debugObject->debugLogEntry(1);}

    // Pattern of CSS selectors, modified from mootools. The colon is allowed in
    // tag and attribute names; an attribute may be written with a leading @,
    // which is not captured.
    // The hyphen sits at the end of each character class: "\w-:" is rejected
    // as an invalid range by PCRE2 (PHP 7.3+), while the set of accepted
    // characters is unchanged.
    $pattern = "/([\w:\*-]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w:-]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is";
    preg_match_all($pattern, trim($selector_string).' ', $matches, PREG_SET_ORDER);
    if (is_object($debugObject)) {$debugObject->debugLog(2, "Matches Array: ", $matches);}

    $selectors = array();
    $result = array();

    foreach ($matches as $m) {
        $m[0] = trim($m[0]);
        if ($m[0]==='' || $m[0]==='/' || $m[0]==='//') continue;
        // for browser generated xpath
        if ($m[1]==='tbody') continue;

        list($tag, $key, $val, $exp, $no_key) = array($m[1], null, null, '=', false);
        if (!empty($m[2])) {$key='id'; $val=$m[2];}
        if (!empty($m[3])) {$key='class'; $val=$m[3];}
        if (!empty($m[4])) {$key=$m[4];}
        if (!empty($m[5])) {$exp=$m[5];}
        if (!empty($m[6])) {$val=$m[6];}

        // convert to lowercase (a missing key stays null; strtolower(null) is deprecated)
        if ($this->dom->lowercase) {
            $tag = strtolower($tag);
            if ($key!==null) {$key = strtolower($key);}
        }
        //elements that do NOT have the specified attribute
        if (isset($key[0]) && $key[0]==='!') {$key=substr($key, 1); $no_key=true;}

        $result[] = array($tag, $key, $val, $exp, $no_key);
        if (trim($m[7])===',') {
            $selectors[] = $result;
            $result = array();
        }
    }
    if (count($result)>0)
        $selectors[] = $result;
    return $selectors;
}
720 |
/**
 * Magic property reader.
 *
 * A set (non-null) attribute is returned after charset conversion. Otherwise
 * the virtual properties outertext, innertext, plaintext and xmltext are
 * served by their respective methods; any other name yields a boolean telling
 * whether the attribute key exists at all.
 *
 * @param string $name Attribute or virtual property name.
 * @return mixed
 */
function __get($name) {
    // A present attribute takes precedence over the virtual properties.
    if (isset($this->attr[$name])) {
        return $this->convert_text($this->attr[$name]);
    }

    if ($name == 'outertext') return $this->outertext();
    if ($name == 'innertext') return $this->innertext();
    if ($name == 'plaintext') return $this->text();
    if ($name == 'xmltext') return $this->xmltext();

    return array_key_exists($name, $this->attr);
}
734 |
/**
 * Magic property writer.
 *
 * "outertext" and "innertext" overwrite the node's stored markup; every other
 * name is stored as an attribute. A new attribute also gets a spacing and a
 * quoting entry appended to the node's info arrays.
 *
 * @param string $name  Attribute or virtual property name.
 * @param mixed  $value Value to store.
 * @return mixed The assigned value for outertext/innertext, nothing otherwise.
 */
function __set($name, $value) {
    if ($name == 'outertext') {
        return $this->_[HDOM_INFO_OUTER] = $value;
    }

    if ($name == 'innertext') {
        // Write to the text slot when the node has one, else to the inner slot.
        $slot = isset($this->_[HDOM_INFO_TEXT]) ? HDOM_INFO_TEXT : HDOM_INFO_INNER;
        return $this->_[$slot] = $value;
    }

    $is_new_attribute = !isset($this->attr[$name]);
    if ($is_new_attribute) {
        $this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
        $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
    }
    $this->attr[$name] = $value;
}
748 |
/**
 * Magic isset() handler.
 *
 * The virtual properties outertext, innertext and plaintext always exist.
 * For anything else the attribute key itself is checked, so attributes
 * without a value (nowrap, checked, selected...) count as set.
 *
 * @param string $name Attribute or virtual property name.
 * @return bool
 */
function __isset($name) {
    $virtual_properties = array('outertext', 'innertext', 'plaintext');
    if (in_array($name, $virtual_properties)) {
        return true;
    }

    // Key existence already implies everything isset() could add here.
    return array_key_exists($name, $this->attr);
}
758 |
/**
 * Magic unset() handler: drops an attribute, but only when it currently
 * holds a non-null value.
 *
 * @param string $name Attribute name.
 */
function __unset($name) {
    if (!isset($this->attr[$name])) {
        return;
    }

    unset($this->attr[$name]);
}
763 |
/**
 * PaperG - Converts text from the document's source character set to the
 * requested target character set when the two differ.
 *
 * The charsets are read from the owning DOM object ($this->dom); without one
 * no conversion happens. When the target is UTF-8 and the text already is
 * valid UTF-8 the text is left alone, since the reported source encoding may
 * have been wrong. A leading or trailing UTF-8 byte order mark is stripped
 * from UTF-8 output.
 *
 * @param string $text The text to convert.
 * @return string The converted text, or the unconverted text when iconv()
 *                cannot perform the conversion.
 */
function convert_text($text)
{
    global $debugObject;
    if (is_object($debugObject)) {$debugObject->debugLogEntry(1);}

    $converted_text = $text;

    $sourceCharset = "";
    $targetCharset = "";

    if ($this->dom)
    {
        // Casts guard against unset (null) charsets being passed to strtoupper().
        $sourceCharset = strtoupper((string)$this->dom->_charset);
        $targetCharset = strtoupper((string)$this->dom->_target_charset);
    }
    if (is_object($debugObject)) {$debugObject->debugLog(3, "source charset: " . $sourceCharset . " target charaset: " . $targetCharset);}

    if (!empty($sourceCharset) && !empty($targetCharset) && (strcasecmp($sourceCharset, $targetCharset) != 0))
    {
        // Check if the reported encoding could have been incorrect and the text is actually already UTF-8
        $already_utf8 = (strcasecmp($targetCharset, 'UTF-8') == 0) && ($this->is_utf8($text));
        if (!$already_utf8)
        {
            // iconv() returns false on failure (unknown charset, illegal byte
            // sequence). Keep the original text in that case instead of
            // handing false back to the caller.
            $iconv_result = iconv($sourceCharset, $targetCharset, $text);
            if ($iconv_result !== false)
            {
                $converted_text = $iconv_result;
            }
        }
    }

    // Lets make sure that we don't have that silly BOM issue with any of the utf-8 text we output.
    if ($targetCharset == 'UTF-8')
    {
        if (substr($converted_text, 0, 3) == "\xef\xbb\xbf")
        {
            $converted_text = substr($converted_text, 3);
        }
        if (substr($converted_text, -3) == "\xef\xbb\xbf")
        {
            $converted_text = substr($converted_text, 0, -3);
        }
    }

    return $converted_text;
}
810 |
/**
 * Returns true if $str is valid UTF-8 and false otherwise.
 *
 * Validation is delegated to PCRE: with the "u" modifier preg_match() checks
 * the whole subject for UTF-8 validity before matching and returns false when
 * it is malformed. Unlike the former hand-written byte loop this also rejects
 * a lone 0x80 byte, overlong encodings and the obsolete 5 and 6 byte forms.
 *
 * @param mixed $str String to be tested
 * @return boolean
 */
static function is_utf8($str)
{
    // The empty pattern matches any valid subject, so the result is 1 exactly
    // when the subject passed PCRE's UTF-8 check.
    return preg_match('//u', (string)$str) === 1;
}
846 | /*
847 | function is_utf8($string)
848 | {
849 | //this is buggy
850 | return (utf8_encode(utf8_decode($string)) == $string);
851 | }
852 | */
853 |
/**
 * Function to try a few tricks to determine the displayed size of an img on the page.
 * NOTE: This will ONLY work on an IMG tag. Returns FALSE on all other tag types.
 *
 * The width/height attributes of the tag are used first; a dimension that is
 * still unknown is then looked up in the inline style attribute, where only
 * integer pixel values ("120px") are accepted.
 *
 * @author John Schlick
 * @version April 19 2012
 * @return array|false an array containing the 'height' and 'width' of the image on the page
 *                     (each -1 if we can't figure it out), or false when this node is not an img.
 */
function get_display_size()
{
    if ($this->tag !== 'img')
    {
        return false;
    }

    // Key order (height, width) is part of the returned array's shape.
    $size = array('height' => -1, 'width' => -1);
    $dimensions = array('width', 'height');

    // See if there is a height or width attribute in the tag itself.
    foreach ($dimensions as $dimension)
    {
        if (isset($this->attr[$dimension]))
        {
            $size[$dimension] = $this->attr[$dimension];
        }
    }

    // Now look for an inline style.
    if (isset($this->attr['style']))
    {
        // Thanks to user gnarf from stackoverflow for this regular expression.
        $declarations = array();
        preg_match_all('/([\w-]+)\s*:\s*([^;]+)\s*;?/', $this->attr['style'], $matches, PREG_SET_ORDER);
        foreach ($matches as $match) {
            // The value group is greedy and swallows whitespace in front of
            // the ";", which would defeat the "px" suffix check below.
            $declarations[$match[1]] = trim($match[2]);
        }

        foreach ($dimensions as $dimension)
        {
            // Only fill in dimensions the tag attributes did not provide.
            if ($size[$dimension] != -1 || !isset($declarations[$dimension]))
            {
                continue;
            }

            // check that the last two characters are px (pixels)
            $declared = $declarations[$dimension];
            if (strtolower(substr($declared, -2)) != 'px')
            {
                continue;
            }

            // Now make sure that it's an integer and not something stupid.
            // Compare against false explicitly: filter_var() returns int 0
            // for "0", which is a valid size.
            $proposed = substr($declared, 0, -2);
            if (filter_var($proposed, FILTER_VALIDATE_INT) !== false)
            {
                $size[$dimension] = $proposed;
            }
        }
    }

    // Future enhancement:
    // Look in the tag to see if there is a class or id specified that has a height or width attribute to it.

    // Far future enhancement
    // Look at all the parent tags of this image to see if they specify a class or id that has an img selector that specifies a height or width
    // Note that in this case, the class or id will have the img subselector for it to apply to the image.

    // ridiculously far future development
    // If the class or id is specified in a SEPARATE css file thats not on the page, go get it and do what we were just doing for the ones on the page.

    return $size;
}
941 |
// camel naming conventions: thin DOM-style aliases that delegate to the
// node's own methods.

/** Returns the node's attribute array as stored. */
function getAllAttributes()
{
    return $this->attr;
}

/** Reads an attribute (or virtual property) through __get(). */
function getAttribute($name)
{
    return $this->__get($name);
}

/** Writes an attribute (or virtual property) through __set(). */
function setAttribute($name, $value)
{
    $this->__set($name, $value);
}

/** Reports attribute presence through __isset(). */
function hasAttribute($name)
{
    return $this->__isset($name);
}

/** Marks an attribute as removed by setting it to null through __set(). */
function removeAttribute($name)
{
    $this->__set($name, null);
}

/** Finds the first element matching the selector "#$id". */
function getElementById($id)
{
    return $this->find("#$id", 0);
}

/** Finds elements matching the selector "#$id"; $idx is passed on to find(). */
function getElementsById($id, $idx=null)
{
    return $this->find("#$id", $idx);
}

/** Finds the first element matching the tag selector $name. */
function getElementByTagName($name)
{
    return $this->find($name, 0);
}

/** Finds elements matching the tag selector $name; $idx is passed on to find(). */
function getElementsByTagName($name, $idx=null)
{
    return $this->find($name, $idx);
}

/** Alias of parent(). */
function parentNode()
{
    return $this->parent();
}

/** Alias of children(); $idx is passed on unchanged. */
function childNodes($idx=-1)
{
    return $this->children($idx);
}

/** Alias of first_child(). */
function firstChild()
{
    return $this->first_child();
}

/** Alias of last_child(). */
function lastChild()
{
    return $this->last_child();
}

/** Alias of next_sibling(). */
function nextSibling()
{
    return $this->next_sibling();
}

/** Alias of prev_sibling(). */
function previousSibling()
{
    return $this->prev_sibling();
}

/** Alias of has_child(). */
function hasChildNodes()
{
    return $this->has_child();
}

/** Returns the node's tag name. */
function nodeName()
{
    return $this->tag;
}

/** Hands this node to $node->parent() and returns $node. */
function appendChild($node)
{
    $node->parent($this);
    return $node;
}
961 |
962 | }
963 |
964 | /**
965 | * simple html dom parser
966 | * Paperg - in the find routine: allow us to specify that we want case insensitive testing of the value of the selector.
967 | * Paperg - change $size from protected to public so we can easily access it
968 | * Paperg - added ForceTagsClosed in the constructor which tells us whether we trust the html or not. Default is to NOT trust it.
969 | *
970 | * @package PlaceLocalInclude
971 | */
972 | class simple_html_dom
973 | {
974 | public $root = null;
975 | public $nodes = array();
976 | public $callback = null;
977 | public $lowercase = false;
978 | // Used to keep track of how large the text was when we started.
979 | public $original_size;
980 | public $size;
981 | protected $pos;
982 | protected $doc;
983 | protected $char;
984 | protected $cursor;
985 | protected $parent;
986 | protected $noise = array();
987 | protected $token_blank = " \t\r\n";
988 | protected $token_equal = ' =/>';
989 | protected $token_slash = " />\r\n\t";
990 | protected $token_attr = ' >';
991 | // Note that this is referenced by a child node, and so it needs to be public for that node to see this information.
992 | public $_charset = '';
993 | public $_target_charset = '';
994 | protected $default_br_text = "";
995 | public $default_span_text = "";
996 |
997 | // use isset instead of in_array, performance boost about 30%...
998 | protected $self_closing_tags = array('img'=>1, 'br'=>1, 'input'=>1, 'meta'=>1, 'link'=>1, 'hr'=>1, 'base'=>1, 'embed'=>1, 'spacer'=>1);
999 | protected $block_tags = array('root'=>1, 'body'=>1, 'form'=>1, 'div'=>1, 'span'=>1, 'table'=>1);
1000 | // Known sourceforge issue #2977341
1001 | // B tags that are not closed cause us to return everything to the end of the document.
1002 | protected $optional_closing_tags = array(
1003 | 'tr'=>array('tr'=>1, 'td'=>1, 'th'=>1),
1004 | 'th'=>array('th'=>1),
1005 | 'td'=>array('td'=>1),
1006 | 'li'=>array('li'=>1),
1007 | 'dt'=>array('dt'=>1, 'dd'=>1),
1008 | 'dd'=>array('dd'=>1, 'dt'=>1),
1009 | 'dl'=>array('dd'=>1, 'dt'=>1),
1010 | 'p'=>array('p'=>1),
1011 | 'nobr'=>array('nobr'=>1),
1012 | 'b'=>array('b'=>1),
1013 | 'option'=>array('option'=>1),
1014 | );
1015 |
/**
 * Creates the DOM object and optionally loads a document right away.
 *
 * @param string|null $str             HTML markup, or an http:// URL / existing file path to load from.
 * @param bool        $lowercase       Passed to load() when $str is markup.
 * @param bool        $forceTagsClosed When false the optional-closing-tag table is emptied,
 *                                     i.e. the html is trusted to close its own tags.
 * @param string      $target_charset  Character set that text is converted to on output.
 * @param bool        $stripRN         Passed to load() when $str is markup.
 * @param string      $defaultBRText   Passed to load() when $str is markup.
 * @param string      $defaultSpanText Passed to load() when $str is markup.
 */
function __construct($str=null, $lowercase=true, $forceTagsClosed=true, $target_charset=DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    // Forcing tags to be closed implies that we don't trust the html, but it can lead to parsing errors if we SHOULD trust the html.
    // This used to assign to "optional_closing_array", a name that is not among
    // the declared properties, so the declared $optional_closing_tags table was
    // never cleared. It is also applied before loading now, so that a document
    // passed to the constructor is loaded with the setting already in place.
    if (!$forceTagsClosed) {
        $this->optional_closing_tags=array();
    }

    if ($str)
    {
        if (preg_match("/^http:\/\//i",$str) || is_file($str))
        {
            $this->load_file($str);
        }
        else
        {
            $this->load($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText);
        }
    }

    $this->_target_charset = $target_charset;
}
1035 |
/**
 * Destructor: hands cleanup over to clear() when the object is destroyed.
 */
function __destruct()
{
    $this->clear();
}
1040 |
1041 | // load html from string
1042 | function load($str, $lowercase=true, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
1043 | {
1044 | global $debugObject;
1045 |
1046 | // prepare
1047 | $this->prepare($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText);
1048 | // strip out comments
1049 | $this->remove_noise("''is");
1050 | // strip out cdata
1051 | $this->remove_noise("''is", true);
1052 | // Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037
1053 | // Script tags removal now preceeds style tag removal.
1054 | // strip out