├── phpstan.neon ├── .gitignore ├── .github ├── dependabot.yml └── workflows │ └── php.yml ├── src ├── Contracts │ ├── ParserInterface.php │ └── ExtractorInterface.php ├── Exceptions │ └── InvalidURLException.php ├── Model │ └── Link.php ├── Client.php ├── Extractors │ ├── LocaleExtractor.php │ ├── ImageExtractor.php │ ├── TitleExtractor.php │ ├── FaviconExtractor.php │ └── DescriptionExtractor.php ├── Traits │ └── CanExtractBySelector.php └── Parsers │ └── SiteParser.php ├── .php-cs-fixer.php ├── phpunit.xml ├── tests ├── Unit │ ├── Extractors │ │ ├── LocaleExtractorTest.php │ │ └── FaviconExtractor.php │ ├── SiteParserTest.php │ └── ClientTest.php ├── TestCase.php ├── Stubs │ └── response.html └── Pest.php ├── LICENSE ├── composer.json └── README.md /phpstan.neon: -------------------------------------------------------------------------------- 1 | parameters: 2 | level: 6 3 | paths: 4 | - src -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | vendor/ 3 | .php-cs-fixer.cache 4 | composer.lock 5 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "composer" 4 | directory: "/" 5 | schedule: 6 | interval: "weekly" 7 | -------------------------------------------------------------------------------- /src/Contracts/ParserInterface.php: -------------------------------------------------------------------------------- 1 | in(["src", "tests"]) 4 | ; 5 | 6 | $config = new PhpCsFixer\Config(); 7 | return $config->setRules([ 8 | '@PSR12' => true, 9 | 'array_syntax' => ['syntax' => 'short'], 10 | ]) 11 | ->setFinder($finder) 12 | ; -------------------------------------------------------------------------------- /src/Exceptions/InvalidURLException.php: 
-------------------------------------------------------------------------------- 1 | 2 | 7 | 8 | 9 | ./tests 10 | 11 | 12 | 13 | 14 | ./app 15 | ./src 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /src/Client.php: -------------------------------------------------------------------------------- 1 | parser = $parser ?? new SiteParser(); 18 | } 19 | 20 | /** 21 | * @throws InvalidURLException 22 | */ 23 | public function parse(string $url): Link 24 | { 25 | return $this->parser->parse($url); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/Extractors/LocaleExtractor.php: -------------------------------------------------------------------------------- 1 | > $selectors 15 | */ 16 | private static array $selectors = [ 17 | ['selector' => 'html', 'attribute' => 'lang'], 18 | ]; 19 | 20 | public static function extract(Crawler $crawler): string 21 | { 22 | return self::extractSelectors($crawler); 23 | } 24 | 25 | public static function name(): string 26 | { 27 | return 'locale'; 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /tests/Unit/Extractors/LocaleExtractorTest.php: -------------------------------------------------------------------------------- 1 | createSiteParserWithMockFakeHttpClient([ 9 | new Response(body: '') 10 | ]); 11 | 12 | $link = $parser->parse('https://hazaveh.net'); 13 | 14 | assertEquals("en", $link->locale); 15 | }); 16 | 17 | test('it handle page without locale', function () { 18 | $parser = $this->createSiteParserWithMockFakeHttpClient([ 19 | new Response(body: '') 20 | ]); 21 | 22 | $link = $parser->parse('https://hazaveh.net'); 23 | 24 | assertEquals("", $link->locale); 25 | }); 26 | -------------------------------------------------------------------------------- /src/Traits/CanExtractBySelector.php: -------------------------------------------------------------------------------- 1 | 
filter($selector['selector'])->count() > 0) { 20 | $data[] = isset($selector['attribute']) 21 | ? $crawler->filter($selector['selector'])->first()->attr($selector['attribute']) 22 | : $crawler->filter($selector['selector'])->first()->text(); 23 | } 24 | } 25 | 26 | return $data[0] ?? ""; 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /src/Extractors/ImageExtractor.php: -------------------------------------------------------------------------------- 1 | > $selectors 15 | */ 16 | private static array $selectors = [ 17 | ['selector' => 'meta[property="twitter:image"]', 'attribute' => 'content'], 18 | ['selector' => 'meta[property="og:image"]', 'attribute' => 'content'], 19 | ['selector' => 'meta[itemprop="image"]', 'attribute' => 'content'], 20 | ]; 21 | 22 | public static function extract(Crawler $crawler): string 23 | { 24 | return self::extractSelectors($crawler); 25 | } 26 | 27 | public static function name(): string 28 | { 29 | return 'image'; 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /.github/workflows/php.yml: -------------------------------------------------------------------------------- 1 | name: PHP Composer 2 | 3 | on: 4 | push: 5 | pull_request: 6 | branches: [ "master" ] 7 | 8 | permissions: 9 | contents: read 10 | 11 | jobs: 12 | build: 13 | 14 | runs-on: ubuntu-latest 15 | 16 | steps: 17 | - uses: actions/checkout@v3 18 | 19 | - name: Validate composer.json and composer.lock 20 | run: composer validate --strict 21 | 22 | - name: Cache Composer packages 23 | id: composer-cache 24 | uses: actions/cache@v3 25 | with: 26 | path: vendor 27 | key: ${{ runner.os }}-php-${{ hashFiles('**/composer.lock') }} 28 | restore-keys: | 29 | ${{ runner.os }}-php- 30 | 31 | - name: Install dependencies 32 | run: composer install --prefer-dist --no-progress 33 | 34 | - name: Run static analyzer 35 | run: composer analyze 36 | 37 | - name: Run test suite 38 | run: 
composer test 39 | 40 | - name: Run Codestyle Checks 41 | run: composer codestyle 42 | -------------------------------------------------------------------------------- /src/Extractors/TitleExtractor.php: -------------------------------------------------------------------------------- 1 | > $selectors 15 | */ 16 | private static array $selectors = [ 17 | ['selector' => 'meta[property="twitter:title"]', 'attribute' => 'content'], 18 | ['selector' => 'meta[property="og:title"]', 'attribute' => 'content'], 19 | ['selector' => 'meta[itemprop="name"]', 'attribute' => 'content'], 20 | ['selector' => 'title'] 21 | ]; 22 | public static function extract(Crawler $crawler): string 23 | { 24 | return self::extractSelectors($crawler); 25 | } 26 | 27 | public static function name(): string 28 | { 29 | return 'title'; 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /tests/TestCase.php: -------------------------------------------------------------------------------- 1 | $handlerStack, 'http_errors' => false]); 24 | 25 | } 26 | 27 | /** 28 | * @param Response[] $responses 29 | * @return SiteParser 30 | */ 31 | public function createSiteParserWithMockFakeHttpClient(array $responses): SiteParser 32 | { 33 | $parser = new SiteParser(); 34 | $parser->setClient($this->createMockHttpClient($responses)); 35 | return $parser; 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/Extractors/FaviconExtractor.php: -------------------------------------------------------------------------------- 1 | > $selectors 15 | */ 16 | private static array $selectors = [ 17 | ['selector' => 'link[rel="icon"]', 'attribute' => 'href'] 18 | ]; 19 | 20 | public static function extract(Crawler $crawler): string 21 | { 22 | $icon = self::extractSelectors($crawler); 23 | // An empty result means no icon tag matched: keep it empty instead of returning the bare page URI. Any other value that does not start with "http" is treated as relative and prefixed with the page URI. 24 | if ($icon !== '' && !str_starts_with($icon, 'http')) { 25 | $icon = 
$crawler->getUri() . $icon; 26 | } 27 | return preg_replace('/([^:])(\\/{2,})/', '$1/', $icon) ?? $icon; // collapse repeated slashes except the "//" after the scheme; fall back to the unmodified value if PCRE fails 28 | } 29 | 30 | public static function name(): string 31 | { 32 | return 'icon'; 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/Extractors/DescriptionExtractor.php: -------------------------------------------------------------------------------- 1 | > $selectors 15 | */ 16 | private static array $selectors = [ 17 | ['selector' => 'meta[property="twitter:description"]', 'attribute' => 'content'], 18 | ['selector' => 'meta[property="og:description"]', 'attribute' => 'content'], 19 | ['selector' => 'meta[itemprop="description"]', 'attribute' => 'content'], 20 | ['selector' => 'meta[name="description"]', 'attribute' => 'content'], 21 | ]; 22 | public static function extract(Crawler $crawler): string 23 | { 24 | return self::extractSelectors($crawler); 25 | } 26 | 27 | public static function name(): string 28 | { 29 | return "description"; 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Mahdi Hazaveh 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "hazaveh/php-link-preview", 3 | "description": "PHP Link Preview library", 4 | "type": "library", 5 | "require": { 6 | "php": ">=8.1", 7 | "symfony/dom-crawler": "^3.0|^4.0|^5.0|^6.0|^7.0", 8 | "guzzlehttp/guzzle": "^6.1|^7.1", 9 | "symfony/css-selector": "^3.0|^4.0|^5.0|^6.0|^7.0" 10 | }, 11 | "require-dev": { 12 | "pestphp/pest": "^2.19", 13 | "friendsofphp/php-cs-fixer": "^3.28", 14 | "phpstan/phpstan": "^1.10" 15 | }, 16 | "license": "MIT", 17 | "autoload": { 18 | "psr-4": { 19 | "Hazaveh\\LinkPreview\\": "src/" 20 | } 21 | }, 22 | "autoload-dev": { 23 | "psr-4": { 24 | "Tests\\": "tests/" 25 | } 26 | }, 27 | "authors": [ 28 | { 29 | "name": "Mahdi", 30 | "email": "me@hazaveh.net" 31 | } 32 | ], 33 | "config": { 34 | "allow-plugins": { 35 | "pestphp/pest-plugin": true 36 | } 37 | }, 38 | "scripts": { 39 | "test": "@php vendor/bin/pest", 40 | "analyze": "@php vendor/bin/phpstan analyze", 41 | "codestyle": "@php vendor/bin/php-cs-fixer fix --config=.php-cs-fixer.php --verbose --diff --dry-run" 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /tests/Unit/Extractors/FaviconExtractor.php: -------------------------------------------------------------------------------- 1 | createSiteParserWithMockFakeHttpClient([ 7 | new 
Response(body: '') 8 | ]); 9 | 10 | $link = $parser->parse('https://hazaveh.net'); 11 | 12 | expect($link->icon)->toBe('https://hazaveh.net/favicon.ico'); 13 | }); 14 | 15 | test('it handles absolute path', function () { 16 | $parser = $this->createSiteParserWithMockFakeHttpClient([ 17 | new Response(body: '') 18 | ]); 19 | 20 | $link = $parser->parse('https://hazaveh.net'); 21 | 22 | expect($link->icon)->toBe('https://hazaveh.net/favicon.ico'); 23 | }); 24 | 25 | test('it handles trailing slashes', function () { 26 | $parser = $this->createSiteParserWithMockFakeHttpClient([ 27 | new Response(body: '') 28 | ]); 29 | 30 | $link = $parser->parse('https://hazaveh.net/'); 31 | 32 | expect($link->icon)->toBe('https://hazaveh.net/favicon.ico'); 33 | }); 34 | -------------------------------------------------------------------------------- /tests/Unit/SiteParserTest.php: -------------------------------------------------------------------------------- 1 | parse('http:/google.com'); 11 | })->expectException(InvalidURLException::class); 12 | 13 | test('it can use a custom http client', function () { 14 | $httpClient = new \GuzzleHttp\Client(); 15 | $parser = new \Hazaveh\LinkPreview\Parsers\SiteParser(); 16 | $parser->setClient($httpClient); 17 | assertEquals($parser->client(), $httpClient); 18 | }); 19 | 20 | test('it can parse correctly', function () { 21 | /** @var \Hazaveh\LinkPreview\Parsers\SiteParser $parser */ 22 | $parser = $this->createSiteParserWithMockFakeHttpClient([ 23 | new Response(body: file_get_contents(\Pest\testDirectory('Stubs/response.html'))) 24 | ]); 25 | 26 | $link = $parser->parse('https://hazaveh.net'); 27 | 28 | assertEquals("Your Page Title", $link->title); 29 | assertEquals("Your page description goes here.", $link->description); 30 | assertEquals("https://example.com/your-image.jpg", $link->image); 31 | assertEquals("https://example.com/favicon.ico", $link->icon); 32 | }); 33 | 
-------------------------------------------------------------------------------- /tests/Stubs/response.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | Your Page Title 24 | 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /tests/Pest.php: -------------------------------------------------------------------------------- 1 | in('Feature', 'Unit'); 16 | 17 | /* 18 | |-------------------------------------------------------------------------- 19 | | Expectations 20 | |-------------------------------------------------------------------------- 21 | | 22 | | When you're writing tests, you often need to check that values meet certain conditions. The 23 | | "expect()" function gives you access to a set of "expectations" methods that you can use 24 | | to assert different things. Of course, you may extend the Expectation API at any time. 25 | | 26 | */ 27 | 28 | expect()->extend('toBeOne', function () { 29 | return $this->toBe(1); 30 | }); 31 | 32 | /* 33 | |-------------------------------------------------------------------------- 34 | | Functions 35 | |-------------------------------------------------------------------------- 36 | | 37 | | While Pest is very powerful out-of-the-box, you may have some testing code specific to your 38 | | project that you don't want to repeat in every file. Here you can also expose helpers as 39 | | global functions to help you to reduce the number of lines of code in your test files. 40 | | 41 | */ 42 | 43 | function something() 44 | { 45 | // .. 
46 | } 47 | -------------------------------------------------------------------------------- /tests/Unit/ClientTest.php: -------------------------------------------------------------------------------- 1 | parser); 17 | }); 18 | 19 | test('it accepts a custom parser', function () { 20 | class CustomParser implements ParserInterface 21 | { 22 | public function parse(string $url): Link 23 | { 24 | return new Link(url: $url); 25 | } 26 | } 27 | 28 | $client = new Client(new CustomParser()); 29 | 30 | assertInstanceOf(CustomParser::class, $client->parser); 31 | }); 32 | 33 | test('it can visit and parse a page', function () { 34 | $parser = $this->createSiteParserWithMockFakeHttpClient([ 35 | new Response(body: file_get_contents(\Pest\testDirectory('Stubs/response.html'))) 36 | ]); 37 | 38 | $client = new Client($parser); 39 | 40 | $url = "https://hazaveh.net"; 41 | $link = $client->parse($url); 42 | 43 | assertInstanceOf(Link::class, $link); 44 | 45 | assertEquals($url, $link->url); 46 | }); 47 | 48 | test('it handles http errors', function () { 49 | $url = "https://hazaveh.net"; 50 | 51 | $parser = $this->createSiteParserWithMockFakeHttpClient([ 52 | new Response(404, ['Content-Length' => 0]), 53 | new RequestException('Error Communicating with Server', new Request('GET', $url)) 54 | ]); 55 | 56 | $client = new Client($parser); 57 | 58 | $link = $client->parse($url); 59 | 60 | assertInstanceOf(Link::class, $link); 61 | 62 | assertEquals(404, $link->error); 63 | 64 | $link = $client->parse($url); 65 | 66 | assertEquals(0, $link->error); 67 | }); 68 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 |

4 | 5 | # PHP Link Preview 6 | PHP Link Preview is a small library that can crawl and return the OG & Meta tags of a URL. This can be used in your applications to display a preview of a URL, similar to what happens when you paste a link on social media sites or WhatsApp. 7 | 8 | ### Current Information 9 | * `title`: Twitter card or Open Graph title (then `itemprop="name"`); if none is found, the page title is returned 10 | * `description`: Twitter card or Open Graph description (then `itemprop="description"`); if none is found, the page description from the meta tag is returned 11 | * `image`: Twitter card or Open Graph image (then `itemprop="image"`) 12 | * `icon`: favicon (if icon is explicitly specified in the HTML source) 13 | 14 | ## Dependencies 15 | * PHP >= 8.1 16 | * Guzzle >= 6 17 | * Symfony DomCrawler >= 3.0 18 | * Symfony CssSelector >= 3.0 19 | 20 | ## Installation 21 | Simply run via composer: 22 | 23 | composer require hazaveh/php-link-preview 24 | 25 | ## Usage 26 | Create an instance of `Client` and use the `parse` method to crawl a URL. 27 | ```php 28 | use Hazaveh\LinkPreview\Client; 29 | 30 | require_once 'vendor/autoload.php'; 31 | 32 | $client = new Client(); 33 | 34 | /** 35 | * Returns an instance of Hazaveh\LinkPreview\Model\Link 36 | * {title, description, image, icon, locale} 37 | */ 38 | 39 | $preview = $client->parse("https://hazaveh.net/2023/07/re-inventing-bookmarks-for-teams/"); 40 | ``` 41 | 42 | ## Using Custom Parser 43 | Out of the box, this library comes with a parser that uses the included extractor classes to extract different pieces of information from the page. You can always use a custom parser that implements `ParserInterface` and has your own logic to extract information from the page. 44 | 45 | You are also free to use a custom Link class, which can then include any additional information you want to extract from the website during the parsing process. 
46 | 47 | ```php 48 | class CustomParser implements ParserInterface 49 | { 50 | public function parse(string $url): Link 51 | { 52 | return new Link(url: $url); 53 | } 54 | } 55 | 56 | $client = new Client(new CustomParser()); 57 | ``` 58 | 59 | ## Contribution 60 | Do something cool and add a PR. -------------------------------------------------------------------------------- /src/Parsers/SiteParser.php: -------------------------------------------------------------------------------- 1 | validate($url); 36 | $html = $this->visit($url); 37 | 38 | if (!$html) { 39 | return new Link(url: $url, description: "Invalid response code {$this->errorCode}", error: $this->errorCode); 40 | } 41 | 42 | $data = $this->extractTags($url, $html); 43 | 44 | return new Link($url, $data['title'], $data['description'], $data['image'], $data['icon'], locale: $data['locale']); 45 | 46 | } 47 | 48 | private function visit(string $url): string | bool 49 | { 50 | 51 | try { 52 | /** @phpstan-ignore-next-line */ 53 | $response = $this->client()->get($url); 54 | 55 | if ($response->getStatusCode() >= 200 && $response->getStatusCode() < 300) { 56 | return $response->getBody()->getContents(); 57 | } 58 | $this->errorCode = $response->getStatusCode(); 59 | } catch (GuzzleException $exception) { 60 | $this->errorCode = $exception->getCode(); 61 | } 62 | 63 | return false; 64 | } 65 | 66 | /** 67 | * @param string $html 68 | * @return array{ 69 | * title: string, 70 | * description: string, 71 | * image: string, 72 | * icon: string, 73 | * locale: string 74 | * } 75 | */ 76 | private function extractTags(string $url, string $html): array 77 | { 78 | $urlParts = parse_url($url); 79 | $crawler = new Crawler(uri: $url, baseHref: $urlParts['scheme'] . '://' . 
$urlParts['host']); 80 | $crawler->addHtmlContent($html); 81 | 82 | $extracted = []; 83 | 84 | foreach ($this->getExtractors() as $extractor) { 85 | /** @var class-string<ExtractorInterface> $extractor Extractor class name; name() and extract() are called statically on it. */ 86 | $extracted[$extractor::name()] = $extractor::extract($crawler); 87 | } 88 | 89 | return $extracted; 90 | 91 | } 92 | 93 | public function client(): ClientInterface 94 | { 95 | if (!$this->httpClient) { 96 | $this->httpClient = new Client([ 97 | 'http_errors' => false, 98 | 'headers' => [ 99 | 'User-Agent' => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36' 100 | ] 101 | ]); 102 | } 103 | 104 | return $this->httpClient; 105 | } 106 | 107 | /** 108 | * Replaces the HTTP client returned by client(); when none has been set, client() lazily builds a default one (http_errors disabled, browser User-Agent). NOTE(review): this text previously said "PSR-7", but PSR-7 describes HTTP messages, not clients - presumably a Guzzle client is expected; confirm. 109 | * @param ClientInterface $client 110 | * @return void 111 | */ 112 | public function setClient(ClientInterface $client): void 113 | { 114 | $this->httpClient = $client; 115 | } 116 | 117 | /** 118 | * @throws InvalidURLException When $url does not pass filter_var() with FILTER_VALIDATE_URL. 119 | */ 120 | public function validate(string $url): void 121 | { 122 | if (!filter_var($url, FILTER_VALIDATE_URL)) { 123 | throw new InvalidURLException($url); 124 | } 125 | } 126 | 127 | /** 128 | * @return array<string, class-string> Extractor class names keyed by each extractor's name(). 129 | */ 130 | public function getExtractors(): array 131 | { 132 | return [ 133 | TitleExtractor::name() => TitleExtractor::class, 134 | DescriptionExtractor::name() => DescriptionExtractor::class, 135 | ImageExtractor::name() => ImageExtractor::class, 136 | FaviconExtractor::name() => FaviconExtractor::class, 137 | LocaleExtractor::name() => LocaleExtractor::class 138 | ]; 139 | } 140 | } 141 | --------------------------------------------------------------------------------