├── .github └── screenshots │ ├── screencapture-localhost-image-extractor-1453627899413.png │ └── screencapture-localhost-image-extractor-extract-php-1453627927292.png ├── README.md ├── api └── image-parser.php ├── assets ├── css │ └── style.css ├── img │ ├── bg.png │ ├── image-parser.png │ └── right-dusk-blue@2x.png └── js │ └── script.js └── index.html /.github/screenshots/screencapture-localhost-image-extractor-1453627899413.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/prabhakar267/image-parser/bbfd92486d6314e3881215f3b9c0d5ec7597e629/.github/screenshots/screencapture-localhost-image-extractor-1453627899413.png -------------------------------------------------------------------------------- /.github/screenshots/screencapture-localhost-image-extractor-extract-php-1453627927292.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/prabhakar267/image-parser/bbfd92486d6314e3881215f3b9c0d5ec7597e629/.github/screenshots/screencapture-localhost-image-extractor-extract-php-1453627927292.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Image Parser 2 | Image Parser is a web application that extracts all the images from any public URL. 3 | You just need to enter the address (URL) of the page and you get all the images that appear in its HTML or are referenced in its CSS files. 4 | 5 | It works for almost all websites and extracts all their images. 6 | 7 | The same can be done through the API: provide the URL of the page you want to extract as the `url` GET parameter and you will get the URLs of all its images as a JSON array in the response.
8 | 9 | ![App Screenshot](/.github/screenshots/screencapture-localhost-image-extractor-1453627899413.png?raw=true) 10 | 11 | ![API Response](/.github/screenshots/screencapture-localhost-image-extractor-extract-php-1453627927292.png?raw=true) 12 | -------------------------------------------------------------------------------- /api/image-parser.php: -------------------------------------------------------------------------------- 1 | $url, 45 | 'valid_url' => false, 46 | 'success' => false 47 | ); 48 | 49 | } else { 50 | 51 | $final_response['valid_url'] = true; 52 | 53 | /** 54 | * check if there is a trailing slash (/) or not, if there is one, remove it 55 | */ 56 | if (substr($url, strlen($url) - 1) == '/') 57 | $url = rtrim($url, "/"); 58 | 59 | $parts = explode('/', $url); 60 | 61 | /** 62 | * parent domain name called, if there is a subdomain, it would also be included here 63 | * @var string 64 | */ 65 | $Root = $parts[0] . '//' . $parts[2]; 66 | 67 | $html = curl_URL_call($url); 68 | if (empty($html)) { 69 | 70 | return array( 71 | 'url_searched' => $url, 72 | 'valid_url' => false, 73 | 'success' => false, 74 | 'message' => 'We are unable to access the given URL: ' . $url 75 | ); 76 | } 77 | 78 | $dom = new DOMDocument; 79 | $dom->loadHTML($html); 80 | 81 | $final_response['url_searched'] = $url; 82 | $final_response['parent_url'] = $Root; 83 | 84 | /** 85 | * check if there is any image in HTML source code or not 86 | */ 87 | if (preg_match_all('/]+>/i', $html, $result)) { 88 | $final_response['success'] = true; 89 | 90 | foreach ($result[0] as $key) { 91 | preg_match('/src="([^"]+)/i', $key, $src_key); 92 | 93 | for ($i = 0; $i < count($src_key); $i += 2) { 94 | $src = $src_key[1]; 95 | 96 | if (!preg_match("/http:/", $src) && !preg_match("/https:/", $src)) { 97 | /** 98 | * check whether the URL in the src is absolute or relative 99 | * if it is relative, make it absolute 100 | */ 101 | if ($src[0] == '/' && $src[1] == '/') { 102 | $src = 'http:' . 
$src; 103 | } else if ($src[0] == '/') { 104 | $src = $Root . $src; 105 | } else { 106 | $src = $Root . '/' . $src; 107 | } 108 | } 109 | array_push($images, $src); 110 | } 111 | } 112 | 113 | } else { 114 | /** 115 | * No images were found in the HTML 116 | * source code, hence success if false 117 | */ 118 | $final_response['success'] = false; 119 | } 120 | 121 | /** 122 | * Getting urls for stylesheets in the webpage 123 | */ 124 | foreach ($dom->getElementsByTagName('link') as $node) { 125 | if ($node->getAttribute("rel") == "stylesheet") { 126 | $css_route = $node->getAttribute("href"); 127 | /** 128 | * check whether the URL in the $css_route is absolute or relative 129 | * if it is relative, make it absolute 130 | */ 131 | if ($css_route[0] == '/' && $css_route[1] == '/') { 132 | $css_route = 'http:' . $css_route; 133 | } else if ($css_route[0] == '/') { 134 | $css_route = $Root . $css_route; 135 | } else if ($css_route[0] != 'h') { 136 | $css_route = $Root . '/' . $css_route; 137 | } 138 | $parts = explode('/', $css_route); 139 | $parts_length = sizeof($parts); 140 | $css_root = $parts[0] . '//' . $parts[2]; 141 | $css_active_dir = $css_root; 142 | $css_parent_dir = $css_root; 143 | for ($i = 3; $i < $parts_length - 1; ++$i) { 144 | if ($i < $parts_length - 2) { 145 | $css_active_dir = $css_active_dir . '/' . $parts[$i]; 146 | $css_parent_dir = $css_parent_dir . '/' . $parts[$i]; 147 | } else { 148 | $css_active_dir = $css_active_dir . '/' . $parts[$i]; 149 | } 150 | } 151 | $css = curl_URL_call($css_route); 152 | $matches = array(); 153 | /** 154 | * Getting image urls using image extension matches in stylesheet extracted 155 | */ 156 | preg_match_all('/url\(\s*[\'"]?(\S*\.(?:jpe?g|gif|png))[\'"]?\s*\)[^;}]*?/i', $css, $matches); 157 | 158 | foreach ($matches[1] as $image_link) { 159 | /** 160 | * check whether the URL in the $image_link is absolute or relative 161 | * if it is relative, make it absolute 162 | */ 163 | if ($image_link[0] == '.' 
&& $image_link[1] == '.') { 164 | $image_link = $css_parent_dir . substr($image_link, 2); 165 | } else if ($image_link[0] == '.') { 166 | $image_link = $css_active_dir . substr($image_link, 1); 167 | } else if ($image_link[0] == '/') { 168 | $image_link = $css_active_dir . $image_link; 169 | } else { 170 | $image_link = $css_active_dir . '/' . $image_link; 171 | } 172 | array_push($images, $image_link); 173 | } 174 | } 175 | } 176 | } 177 | 178 | /** 179 | * All the images are added to the images array in 180 | * final response 181 | */ 182 | $final_response['images'] = $images; 183 | return $final_response; 184 | 185 | } else { 186 | $message = "Please enter a URL to extract information as a 'url' parameter in GET request"; 187 | return array( 188 | 'url_searched' => null, 189 | 'valid_url' => false, 190 | 'success' => false, 191 | 'message' => $message, 192 | ); 193 | } 194 | } 195 | 196 | 197 | /** 198 | * Checks whether the given string looks like an http or https URL: scheme, host, optional port and optional path. NOTE(review): the dot between host labels in the pattern is unescaped, so it matches any character rather than only a literal dot. 199 | * @param string $url URL to be checked 200 | * @return int|false result of preg_match(): 1 when the URL matches, 0 when it does not, false on a regex error 201 | */ 202 | function isValidURL($url){ 203 | return preg_match('|^http(s)?://[a-z0-9-]+(.[a-z0-9-]+)*(:[0-9]+)?(/.*)?$|i', $url); 204 | } 205 | 206 | 207 | /** 208 | * Fetches the given URL with cURL and returns the response body; response headers are not included in the output. No timeout, redirect or error handling options are set in this function. 209 | * @param string $url URL to fetch 210 | * @return string|false response body as returned by curl_exec(), which is false when the transfer fails 211 | */ 212 | function curl_URL_call($url){ 213 | $ch = curl_init(); 214 | curl_setopt($ch, CURLOPT_URL, $url); 215 | curl_setopt($ch, CURLOPT_HEADER, 0); 216 | curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); 217 | $output = curl_exec($ch); 218 | curl_close($ch); 219 | return $output; 220 | } 221 | -------------------------------------------------------------------------------- /assets/css/style.css: -------------------------------------------------------------------------------- 1 | @import 
url(https://fonts.googleapis.com/css?family=Roboto:400,900); 2 | 3 | body{ 4 | font-family: 'Roboto', sans-serif; 5 | background-color:rgb(249, 249, 249); 6 | } 7 | .github-fork{ 8 | position: absolute; 9 | z-index: 1000; 10 | right: 0; 11 | top: 0; 12 | width: 200px; 13 | } 14 | .action-buttons{ 15 | background-color: #f2f2f2; 16 | color: #383838; 17 | font-size: 14px; 18 | padding: 7px 18px; 19 | cursor: pointer; 20 | box-shadow: 0 1px 2px -1px rgba(0,0,0,.5); 21 | -webkit-box-shadow: 0 1px 2px 0 rgba(0,0,0,.2); 22 | width: auto; 23 | border: none; 24 | font-weight: 700; 25 | } 26 | .form-main{ 27 | text-align: center; 28 | } 29 | .form-main .form-control{ 30 | border-radius: 0px; 31 | border: 1px solid black; 32 | } 33 | .form-main .btn{ 34 | border-radius: 0px; 35 | } 36 | .header-image{ 37 | background-color: rgb(239, 250, 141); 38 | background-image: url('../img/bg.png'); 39 | padding: 20px; 40 | text-align: center; 41 | } 42 | .other-text{ 43 | font-size : 16px; 44 | margin: 10px; 45 | text-align: center; 46 | } 47 | 48 | /* Spinner section 49 | see: http://tobiasahlin.com/spinkit/ 50 | */ 51 | 52 | .spinner { 53 | margin: 25px auto; 54 | width: 50px; 55 | height: 40px; 56 | text-align: center; 57 | font-size: 10px; 58 | } 59 | 60 | .spinner > div { 61 | background-color: #79ACC3; 62 | height: 100%; 63 | width: 6px; 64 | display: inline-block; 65 | 66 | -webkit-animation: sk-stretchdelay 1.2s infinite ease-in-out; 67 | animation: sk-stretchdelay 1.2s infinite ease-in-out; 68 | } 69 | 70 | .spinner .rect2 { 71 | -webkit-animation-delay: -1.1s; 72 | animation-delay: -1.1s; 73 | } 74 | 75 | .spinner .rect3 { 76 | -webkit-animation-delay: -1.0s; 77 | animation-delay: -1.0s; 78 | } 79 | 80 | .spinner .rect4 { 81 | -webkit-animation-delay: -0.9s; 82 | animation-delay: -0.9s; 83 | } 84 | 85 | .spinner .rect5 { 86 | -webkit-animation-delay: -0.8s; 87 | animation-delay: -0.8s; 88 | } 89 | 90 | @-webkit-keyframes sk-stretchdelay { 91 | 0%, 40%, 100% { 
-webkit-transform: scaleY(0.4) } 92 | 20% { -webkit-transform: scaleY(1.0) } 93 | } 94 | 95 | @keyframes sk-stretchdelay { 96 | 0%, 40%, 100% { 97 | transform: scaleY(0.4); 98 | -webkit-transform: scaleY(0.4); 99 | } 20% { 100 | transform: scaleY(1.0); 101 | -webkit-transform: scaleY(1.0); 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /assets/img/bg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/prabhakar267/image-parser/bbfd92486d6314e3881215f3b9c0d5ec7597e629/assets/img/bg.png -------------------------------------------------------------------------------- /assets/img/image-parser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/prabhakar267/image-parser/bbfd92486d6314e3881215f3b9c0d5ec7597e629/assets/img/image-parser.png -------------------------------------------------------------------------------- /assets/img/right-dusk-blue@2x.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/prabhakar267/image-parser/bbfd92486d6314e3881215f3b9c0d5ec7597e629/assets/img/right-dusk-blue@2x.png -------------------------------------------------------------------------------- /assets/js/script.js: -------------------------------------------------------------------------------- 1 | $(document).ready(function() { 2 | var spinner = toggleSpinner(), 3 | submit_button = $("#submit"); 4 | submit_button.click(function(e) { 5 | e.preventDefault(); 6 | spinner(); 7 | 8 | $.ajax({ 9 | url: 'api/image-parser.php?url=' + $('input[name="url"]').val(), 10 | dataType: 'json', 11 | beforeSend: function() { 12 | submit_button.text("Extracting..."); 13 | submit_button.prop('disabled', true); 14 | }, 15 | success: function(result) { 16 | spinner(); 17 | if (result.success) { 18 | renderStats(result); 19 | renderImages(result); 20 | 
} else { 21 | $('#result').html('Invalid URL!'); 22 | } 23 | }, 24 | error: function(xhr, resp, text) { 25 | spinner(); 26 | $('#result').html('Could not connect to server, please try again later'); 27 | }, 28 | complete: function() { 29 | submit_button.text("Extract"); 30 | submit_button.prop('disabled', false); 31 | } 32 | }) 33 | }); 34 | }); 35 | 36 | 37 | function renderStats(result) { 38 | var stats = 39 | 'URL Searched : ' + $('input[name="url"]').val() + '
' + 40 | 'Parent Domain : ' + result.parent_url + '
'; 41 | $('#stats .other-text').empty().append(stats); 42 | } 43 | 44 | 45 | function renderImages(result) { 46 | var images; 47 | if (0 == result.images.length) { 48 | images = 'No Image Found at your Given Location'; 49 | } else { 50 | images = result.images.map(function(image) { 51 | return ''; 52 | }); 53 | } 54 | $('#result').empty().append(images); 55 | } 56 | 57 | 58 | function toggleSpinner() { 59 | var isHidden = true; 60 | return function() { 61 | if (isHidden) { 62 | $('#stats').hide(); 63 | $('#result').empty(); 64 | $('.spinner').show(); 65 | } else { 66 | $('#stats').show(); 67 | $('.spinner').hide(); 68 | } 69 | isHidden = !isHidden; 70 | } 71 | } -------------------------------------------------------------------------------- /index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | Image Parser 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 |
26 |
27 |

Image Parser

28 |

Enter any URL and get all the images on the page

29 | 30 | 31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 | 39 |
40 | 41 |
42 |
43 |
44 |
45 | 48 | 55 |
Welcome :)
56 | 57 | 58 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | --------------------------------------------------------------------------------