├── .github
└── screenshots
│ ├── screencapture-localhost-image-extractor-1453627899413.png
│ └── screencapture-localhost-image-extractor-extract-php-1453627927292.png
├── README.md
├── api
└── image-parser.php
├── assets
├── css
│ └── style.css
├── img
│ ├── bg.png
│ ├── image-parser.png
│ └── right-dusk-blue@2x.png
└── js
│ └── script.js
└── index.html
/.github/screenshots/screencapture-localhost-image-extractor-1453627899413.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/prabhakar267/image-parser/bbfd92486d6314e3881215f3b9c0d5ec7597e629/.github/screenshots/screencapture-localhost-image-extractor-1453627899413.png
--------------------------------------------------------------------------------
/.github/screenshots/screencapture-localhost-image-extractor-extract-php-1453627927292.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/prabhakar267/image-parser/bbfd92486d6314e3881215f3b9c0d5ec7597e629/.github/screenshots/screencapture-localhost-image-extractor-extract-php-1453627927292.png
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Image Parser
2 | Image Parser is a web application that extracts all the images from any public URL.
3 | Enter the URL of a web page and you get all the images that appear in its HTML or are referenced in its CSS files.
4 |
5 | It works for almost all websites and extracts all the images.
6 |
7 | The same can be done through the API: pass the URL of the page you want to extract as the `url` GET parameter, and the JSON response lists the URLs of all the images in its `images` array.
8 |
9 | 
10 |
11 | 
12 |
--------------------------------------------------------------------------------
/api/image-parser.php:
--------------------------------------------------------------------------------
1 | $url,
45 | 'valid_url' => false,
46 | 'success' => false
47 | );
48 |
49 | } else {
50 |
51 | $final_response['valid_url'] = true;
52 |
53 | /**
54 | * check if there is a trailing slash (/) or not, if there is one, remove it
55 | */
56 | if (substr($url, strlen($url) - 1) == '/')
57 | $url = rtrim($url, "/");
58 |
59 | $parts = explode('/', $url);
60 |
61 | /**
62 | * parent domain name called, if there is a subdomain, it would also be included here
63 | * @var string
64 | */
65 | $Root = $parts[0] . '//' . $parts[2];
66 |
67 | $html = curl_URL_call($url);
68 | if (empty($html)) {
69 |
70 | return array(
71 | 'url_searched' => $url,
72 | 'valid_url' => false,
73 | 'success' => false,
74 | 'message' => 'We are unable to access the given URL: ' . $url
75 | );
76 | }
77 |
78 | $dom = new DOMDocument;
79 | $dom->loadHTML($html);
80 |
81 | $final_response['url_searched'] = $url;
82 | $final_response['parent_url'] = $Root;
83 |
84 | /**
85 | * check if there is any image in HTML source code or not
86 | */
87 | if (preg_match_all('/]+>/i', $html, $result)) {
88 | $final_response['success'] = true;
89 |
90 | foreach ($result[0] as $key) {
91 | preg_match('/src="([^"]+)/i', $key, $src_key);
92 |
93 | for ($i = 0; $i < count($src_key); $i += 2) {
94 | $src = $src_key[1];
95 |
96 | if (!preg_match("/http:/", $src) && !preg_match("/https:/", $src)) {
97 | /**
98 | * check whether the URL in the src is absolute or relative
99 | * if it is relative, make it absolute
100 | */
101 | if ($src[0] == '/' && $src[1] == '/') {
102 | $src = 'http:' . $src;
103 | } else if ($src[0] == '/') {
104 | $src = $Root . $src;
105 | } else {
106 | $src = $Root . '/' . $src;
107 | }
108 | }
109 | array_push($images, $src);
110 | }
111 | }
112 |
113 | } else {
114 | /**
115 | * No images were found in the HTML
116 | * source code, hence success if false
117 | */
118 | $final_response['success'] = false;
119 | }
120 |
121 | /**
122 | * Getting urls for stylesheets in the webpage
123 | */
124 | foreach ($dom->getElementsByTagName('link') as $node) {
125 | if ($node->getAttribute("rel") == "stylesheet") {
126 | $css_route = $node->getAttribute("href");
127 | /**
128 | * check whether the URL in the $css_route is absolute or relative
129 | * if it is relative, make it absolute
130 | */
131 | if ($css_route[0] == '/' && $css_route[1] == '/') {
132 | $css_route = 'http:' . $css_route;
133 | } else if ($css_route[0] == '/') {
134 | $css_route = $Root . $css_route;
135 | } else if ($css_route[0] != 'h') {
136 | $css_route = $Root . '/' . $css_route;
137 | }
138 | $parts = explode('/', $css_route);
139 | $parts_length = sizeof($parts);
140 | $css_root = $parts[0] . '//' . $parts[2];
141 | $css_active_dir = $css_root;
142 | $css_parent_dir = $css_root;
143 | for ($i = 3; $i < $parts_length - 1; ++$i) {
144 | if ($i < $parts_length - 2) {
145 | $css_active_dir = $css_active_dir . '/' . $parts[$i];
146 | $css_parent_dir = $css_parent_dir . '/' . $parts[$i];
147 | } else {
148 | $css_active_dir = $css_active_dir . '/' . $parts[$i];
149 | }
150 | }
151 | $css = curl_URL_call($css_route);
152 | $matches = array();
153 | /**
154 | * Getting image urls using image extension matches in stylesheet extracted
155 | */
156 | preg_match_all('/url\(\s*[\'"]?(\S*\.(?:jpe?g|gif|png))[\'"]?\s*\)[^;}]*?/i', $css, $matches);
157 |
158 | foreach ($matches[1] as $image_link) {
159 | /**
160 | * check whether the URL in the $image_link is absolute or relative
161 | * if it is relative, make it absolute
162 | */
163 | if ($image_link[0] == '.' && $image_link[1] == '.') {
164 | $image_link = $css_parent_dir . substr($image_link, 2);
165 | } else if ($image_link[0] == '.') {
166 | $image_link = $css_active_dir . substr($image_link, 1);
167 | } else if ($image_link[0] == '/') {
168 | $image_link = $css_active_dir . $image_link;
169 | } else {
170 | $image_link = $css_active_dir . '/' . $image_link;
171 | }
172 | array_push($images, $image_link);
173 | }
174 | }
175 | }
176 | }
177 |
178 | /**
179 | * All the images are added to the images array in
180 | * final response
181 | */
182 | $final_response['images'] = $images;
183 | return $final_response;
184 |
185 | } else {
186 | $message = "Please enter a URL to extract information as a 'url' parameter in GET request";
187 | return array(
188 | 'url_searched' => null,
189 | 'valid_url' => false,
190 | 'success' => false,
191 | 'message' => $message,
192 | );
193 | }
194 | }
195 |
196 |
197 | /**
198 | * Checks that $url is an http(s) URL: host labels (letters, digits, hyphens) joined by literal dots, then an optional port and an optional path, query or fragment.
199 | * @param string $url URL to be passed which is to be checked
200 | * @return boolean true if the URL has the expected shape, false otherwise (including non-string input)
201 | */
202 | function isValidURL($url){
203 | return is_string($url) && preg_match('|^https?://[a-z0-9-]+(\.[a-z0-9-]+)*(:[0-9]+)?([/?#].*)?$|iD', $url) === 1;
204 | }
205 |
206 |
207 | /**
208 | * function to make a CURL call in order to fetch the body of the URL entered, bounded by connect and total timeouts
209 | * @param string $url URL of the page
210 | * @return string response body of the URL entered, or an empty string if the handle cannot be created or the transfer fails
211 | */
212 | function curl_URL_call($url){
213 | $ch = curl_init();
214 | if ($ch === false) return '';
215 | // curl_exec() returns false on failure and CURLOPT_TIMEOUT defaults to "no limit", so both cases are handled explicitly.
216 | curl_setopt_array($ch, array(CURLOPT_URL => $url, CURLOPT_HEADER => 0, CURLOPT_RETURNTRANSFER => 1, CURLOPT_CONNECTTIMEOUT => 10, CURLOPT_TIMEOUT => 30));
217 | $output = curl_exec($ch);
218 | curl_close($ch);
219 | return is_string($output) ? $output : '';
220 | }
221 |
--------------------------------------------------------------------------------
/assets/css/style.css:
--------------------------------------------------------------------------------
1 | @import url(https://fonts.googleapis.com/css?family=Roboto:400,900);
2 |
3 | body{
4 | font-family: 'Roboto', sans-serif;
5 | background-color:rgb(249, 249, 249);
6 | }
7 | .github-fork{
8 | position: absolute;
9 | z-index: 1000;
10 | right: 0;
11 | top: 0;
12 | width: 200px;
13 | }
14 | .action-buttons{
15 | background-color: #f2f2f2;
16 | color: #383838;
17 | font-size: 14px;
18 | padding: 7px 18px;
19 | cursor: pointer;
20 | box-shadow: 0 1px 2px -1px rgba(0,0,0,.5);
21 | -webkit-box-shadow: 0 1px 2px 0 rgba(0,0,0,.2);
22 | width: auto;
23 | border: none;
24 | font-weight: 700;
25 | }
26 | .form-main{
27 | text-align: center;
28 | }
29 | .form-main .form-control{
30 | border-radius: 0px;
31 | border: 1px solid black;
32 | }
33 | .form-main .btn{
34 | border-radius: 0px;
35 | }
36 | .header-image{
37 | background-color: rgb(239, 250, 141);
38 | background-image: url('../img/bg.png');
39 | padding: 20px;
40 | text-align: center;
41 | }
42 | .other-text{
43 | font-size : 16px;
44 | margin: 10px;
45 | text-align: center;
46 | }
47 |
48 | /* Spinner section
49 | see: http://tobiasahlin.com/spinkit/
50 | */
51 |
52 | .spinner {
53 | margin: 25px auto;
54 | width: 50px;
55 | height: 40px;
56 | text-align: center;
57 | font-size: 10px;
58 | }
59 |
60 | .spinner > div {
61 | background-color: #79ACC3;
62 | height: 100%;
63 | width: 6px;
64 | display: inline-block;
65 |
66 | -webkit-animation: sk-stretchdelay 1.2s infinite ease-in-out;
67 | animation: sk-stretchdelay 1.2s infinite ease-in-out;
68 | }
69 |
70 | .spinner .rect2 {
71 | -webkit-animation-delay: -1.1s;
72 | animation-delay: -1.1s;
73 | }
74 |
75 | .spinner .rect3 {
76 | -webkit-animation-delay: -1.0s;
77 | animation-delay: -1.0s;
78 | }
79 |
80 | .spinner .rect4 {
81 | -webkit-animation-delay: -0.9s;
82 | animation-delay: -0.9s;
83 | }
84 |
85 | .spinner .rect5 {
86 | -webkit-animation-delay: -0.8s;
87 | animation-delay: -0.8s;
88 | }
89 |
90 | @-webkit-keyframes sk-stretchdelay {
91 | 0%, 40%, 100% { -webkit-transform: scaleY(0.4) }
92 | 20% { -webkit-transform: scaleY(1.0) }
93 | }
94 |
95 | @keyframes sk-stretchdelay {
96 | 0%, 40%, 100% {
97 | transform: scaleY(0.4);
98 | -webkit-transform: scaleY(0.4);
99 | } 20% {
100 | transform: scaleY(1.0);
101 | -webkit-transform: scaleY(1.0);
102 | }
103 | }
104 |
--------------------------------------------------------------------------------
/assets/img/bg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/prabhakar267/image-parser/bbfd92486d6314e3881215f3b9c0d5ec7597e629/assets/img/bg.png
--------------------------------------------------------------------------------
/assets/img/image-parser.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/prabhakar267/image-parser/bbfd92486d6314e3881215f3b9c0d5ec7597e629/assets/img/image-parser.png
--------------------------------------------------------------------------------
/assets/img/right-dusk-blue@2x.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/prabhakar267/image-parser/bbfd92486d6314e3881215f3b9c0d5ec7597e629/assets/img/right-dusk-blue@2x.png
--------------------------------------------------------------------------------
/assets/js/script.js:
--------------------------------------------------------------------------------
1 | $(document).ready(function() {
2 | var spinner = toggleSpinner(),
3 | submit_button = $("#submit");
4 | submit_button.click(function(e) {
5 | e.preventDefault();
6 | spinner();
7 |
8 | $.ajax({
9 | url: 'api/image-parser.php?url=' + $('input[name="url"]').val(),
10 | dataType: 'json',
11 | beforeSend: function() {
12 | submit_button.text("Extracting...");
13 | submit_button.prop('disabled', true);
14 | },
15 | success: function(result) {
16 | spinner();
17 | if (result.success) {
18 | renderStats(result);
19 | renderImages(result);
20 | } else {
21 | $('#result').html('Invalid URL!');
22 | }
23 | },
24 | error: function(xhr, resp, text) {
25 | spinner();
26 | $('#result').html('Could not connect to server, please try again later');
27 | },
28 | complete: function() {
29 | submit_button.text("Extract");
30 | submit_button.prop('disabled', false);
31 | }
32 | })
33 | });
34 | });
35 |
36 |
37 | function renderStats(result) {
38 | var stats =
39 | 'URL Searched : ' + $('input[name="url"]').val() + '
' +
40 | 'Parent Domain : ' + result.parent_url + '
';
41 | $('#stats .other-text').empty().append(stats);
42 | }
43 |
44 |
45 | function renderImages(result) {
46 | var images;
47 | if (0 == result.images.length) {
48 | images = 'No Image Found at your Given Location';
49 | } else {
50 | images = result.images.map(function(image) {
51 | return '';
52 | });
53 | }
54 | $('#result').empty().append(images);
55 | }
56 |
57 |
58 | function toggleSpinner() {
59 | var isHidden = true;
60 | return function() {
61 | if (isHidden) {
62 | $('#stats').hide();
63 | $('#result').empty();
64 | $('.spinner').show();
65 | } else {
66 | $('#stats').show();
67 | $('.spinner').hide();
68 | }
69 | isHidden = !isHidden;
70 | }
71 | }
--------------------------------------------------------------------------------
/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |