├── phpstan.neon
├── .gitignore
├── .github
├── dependabot.yml
└── workflows
│ └── php.yml
├── src
├── Contracts
│ ├── ParserInterface.php
│ └── ExtractorInterface.php
├── Exceptions
│ └── InvalidURLException.php
├── Model
│ └── Link.php
├── Client.php
├── Extractors
│ ├── LocaleExtractor.php
│ ├── ImageExtractor.php
│ ├── TitleExtractor.php
│ ├── FaviconExtractor.php
│ └── DescriptionExtractor.php
├── Traits
│ └── CanExtractBySelector.php
└── Parsers
│ └── SiteParser.php
├── .php-cs-fixer.php
├── phpunit.xml
├── tests
├── Unit
│ ├── Extractors
│ │ ├── LocaleExtractorTest.php
│ │ └── FaviconExtractor.php
│ ├── SiteParserTest.php
│ └── ClientTest.php
├── TestCase.php
├── Stubs
│ └── response.html
└── Pest.php
├── LICENSE
├── composer.json
└── README.md
/phpstan.neon:
--------------------------------------------------------------------------------
1 | parameters:
2 | level: 6
3 | paths:
4 | - src
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | vendor/
3 | .php-cs-fixer.cache
4 | composer.lock
5 |
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | updates:
3 | - package-ecosystem: "composer"
4 | directory: "/"
5 | schedule:
6 | interval: "weekly"
7 |
--------------------------------------------------------------------------------
/src/Contracts/ParserInterface.php:
--------------------------------------------------------------------------------
1 | in(["src", "tests"])
4 | ;
5 |
6 | $config = new PhpCsFixer\Config();
7 | return $config->setRules([
8 | '@PSR12' => true,
9 | 'array_syntax' => ['syntax' => 'short'],
10 | ])
11 | ->setFinder($finder)
12 | ;
--------------------------------------------------------------------------------
/src/Exceptions/InvalidURLException.php:
--------------------------------------------------------------------------------
1 |
2 |
7 |
8 |
9 | ./tests
10 |
11 |
12 |
13 |
14 | ./app
15 | ./src
16 |
17 |
18 |
19 |
--------------------------------------------------------------------------------
/src/Client.php:
--------------------------------------------------------------------------------
1 | parser = $parser ?? new SiteParser();
18 | }
19 |
20 | /**
21 | * @throws InvalidURLException when the parser rejects $url (SiteParser::validate() throws it for invalid URLs)
22 | */
23 | public function parse(string $url): Link
24 | {
25 | return $this->parser->parse($url); // delegates to the injected parser; SiteParser is the default
26 | }
27 | }
28 |
--------------------------------------------------------------------------------
/src/Extractors/LocaleExtractor.php:
--------------------------------------------------------------------------------
1 | > $selectors
15 | */
16 | private static array $selectors = [
17 | ['selector' => 'html', 'attribute' => 'lang'],
18 | ];
19 |
20 | public static function extract(Crawler $crawler): string
21 | {
22 | return self::extractSelectors($crawler); // presumably the CanExtractBySelector helper: first match, or "" when none
23 | }
24 |
25 | public static function name(): string
26 | {
27 | return 'locale'; // key under which SiteParser::extractTags() stores this extractor's result
28 | }
29 | }
30 |
--------------------------------------------------------------------------------
/tests/Unit/Extractors/LocaleExtractorTest.php:
--------------------------------------------------------------------------------
1 | createSiteParserWithMockFakeHttpClient([
9 | new Response(body: '
')
10 | ]);
11 |
12 | $link = $parser->parse('https://hazaveh.net');
13 |
14 | assertEquals("en", $link->locale);
15 | });
16 |
17 | test('it handle page without locale', function () {
18 | $parser = $this->createSiteParserWithMockFakeHttpClient([
19 | new Response(body: '')
20 | ]);
21 |
22 | $link = $parser->parse('https://hazaveh.net');
23 |
24 | assertEquals("", $link->locale);
25 | });
26 |
--------------------------------------------------------------------------------
/src/Traits/CanExtractBySelector.php:
--------------------------------------------------------------------------------
1 | filter($selector['selector'])->count() > 0) {
20 | $data[] = isset($selector['attribute'])
21 | ? $crawler->filter($selector['selector'])->first()->attr($selector['attribute'])
22 | : $crawler->filter($selector['selector'])->first()->text();
23 | }
24 | }
25 |
26 | return $data[0] ?? "";
27 | }
28 | }
29 |
--------------------------------------------------------------------------------
/src/Extractors/ImageExtractor.php:
--------------------------------------------------------------------------------
1 | > $selectors
15 | */
16 | private static array $selectors = [
17 | ['selector' => 'meta[property="twitter:image"]', 'attribute' => 'content'],
18 | ['selector' => 'meta[property="og:image"]', 'attribute' => 'content'],
19 | ['selector' => 'meta[itemprop="image"]', 'attribute' => 'content'],
20 | ];
21 |
22 | public static function extract(Crawler $crawler): string
23 | {
24 | return self::extractSelectors($crawler); // presumably the CanExtractBySelector helper: first match, or "" when none
25 | }
26 |
27 | public static function name(): string
28 | {
29 | return 'image'; // key under which SiteParser::extractTags() stores this extractor's result
30 | }
31 | }
32 |
--------------------------------------------------------------------------------
/.github/workflows/php.yml:
--------------------------------------------------------------------------------
1 | name: PHP Composer
2 |
3 | on:
4 | push:
5 | pull_request:
6 | branches: [ "master" ]
7 |
8 | permissions:
9 | contents: read
10 |
11 | jobs:
12 | build:
13 |
14 | runs-on: ubuntu-latest
15 |
16 | steps:
17 | - uses: actions/checkout@v3
18 |
19 | - name: Validate composer.json and composer.lock
20 | run: composer validate --strict
21 |
22 | - name: Cache Composer packages
23 | id: composer-cache
24 | uses: actions/cache@v3
25 | with:
26 | path: vendor
27 | key: ${{ runner.os }}-php-${{ hashFiles('**/composer.lock') }}
28 | restore-keys: |
29 | ${{ runner.os }}-php-
30 |
31 | - name: Install dependencies
32 | run: composer install --prefer-dist --no-progress
33 |
34 | - name: Run static analyzer
35 | run: composer analyze
36 |
37 | - name: Run test suite
38 | run: composer test
39 |
40 | - name: Run Codestyle Checks
41 | run: composer codestyle
42 |
--------------------------------------------------------------------------------
/src/Extractors/TitleExtractor.php:
--------------------------------------------------------------------------------
1 | > $selectors
15 | */
16 | private static array $selectors = [
17 | ['selector' => 'meta[property="twitter:title"]', 'attribute' => 'content'],
18 | ['selector' => 'meta[property="og:title"]', 'attribute' => 'content'],
19 | ['selector' => 'meta[itemprop="name"]', 'attribute' => 'content'],
20 | ['selector' => 'title']
21 | ];
22 | public static function extract(Crawler $crawler): string
23 | {
24 | return self::extractSelectors($crawler); // presumably the CanExtractBySelector helper: first match, or "" when none
25 | }
26 |
27 | public static function name(): string
28 | {
29 | return 'title'; // key under which SiteParser::extractTags() stores this extractor's result
30 | }
31 | }
32 |
--------------------------------------------------------------------------------
/tests/TestCase.php:
--------------------------------------------------------------------------------
1 | $handlerStack, 'http_errors' => false]);
24 |
25 | }
26 |
27 | /**
28 | * Builds a SiteParser wired to the client returned by createMockHttpClient($responses).
29 | * @param array<int, Response|\Throwable> $responses ClientTest also queues a RequestException, not only responses
30 | */
31 | public function createSiteParserWithMockFakeHttpClient(array $responses): SiteParser
32 | {
33 | $parser = new SiteParser();
34 | $parser->setClient($this->createMockHttpClient($responses));
35 | return $parser;
36 | }
37 | }
38 |
--------------------------------------------------------------------------------
/src/Extractors/FaviconExtractor.php:
--------------------------------------------------------------------------------
1 | > $selectors
15 | */
16 | private static array $selectors = [
17 | ['selector' => 'link[rel="icon"]', 'attribute' => 'href']
18 | ];
19 |
20 | /** Favicon URL made absolute against the page URI, or "" when the page declares no icon. */
21 | public static function extract(Crawler $crawler): string
22 | {
23 | $icon = self::extractSelectors($crawler);
24 | if ($icon !== '' && preg_match('#^(https?:)?//#i', $icon) !== 1) { // http(s) and protocol-relative URLs stay as-is
25 | $icon = $crawler->getUri() . $icon; // relative path: prefix the page URI
26 | }
27 | return preg_replace('/([^:])(\\/{2,})/', '$1/', $icon) ?? $icon; // collapse "//" runs but keep "://"
28 | }
29 |
30 | public static function name(): string
31 | {
32 | return 'icon'; // key under which SiteParser::extractTags() stores this extractor's result
33 | }
34 | }
35 |
--------------------------------------------------------------------------------
/src/Extractors/DescriptionExtractor.php:
--------------------------------------------------------------------------------
1 | > $selectors
15 | */
16 | private static array $selectors = [
17 | ['selector' => 'meta[property="twitter:description"]', 'attribute' => 'content'],
18 | ['selector' => 'meta[property="og:description"]', 'attribute' => 'content'],
19 | ['selector' => 'meta[itemprop="description"]', 'attribute' => 'content'],
20 | ['selector' => 'meta[name="description"]', 'attribute' => 'content'],
21 | ];
22 | public static function extract(Crawler $crawler): string
23 | {
24 | return self::extractSelectors($crawler); // presumably the CanExtractBySelector helper: first match, or "" when none
25 | }
26 |
27 | public static function name(): string
28 | {
29 | return "description"; // key under which SiteParser::extractTags() stores this extractor's result
30 | }
31 | }
32 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 Mahdi Hazaveh
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/composer.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "hazaveh/php-link-preview",
3 | "description": "PHP Link Preview library",
4 | "type": "library",
5 | "require": {
6 | "php": ">=8.1",
7 | "symfony/dom-crawler": "^3.0|^4.0|^5.0|^6.0|^7.0",
8 | "guzzlehttp/guzzle": "^6.1|^7.1",
9 | "symfony/css-selector": "^3.0|^4.0|^5.0|^6.0|^7.0"
10 | },
11 | "require-dev": {
12 | "pestphp/pest": "^2.19",
13 | "friendsofphp/php-cs-fixer": "^3.28",
14 | "phpstan/phpstan": "^1.10"
15 | },
16 | "license": "MIT",
17 | "autoload": {
18 | "psr-4": {
19 | "Hazaveh\\LinkPreview\\": "src/"
20 | }
21 | },
22 | "autoload-dev": {
23 | "psr-4": {
24 | "Tests\\": "tests/"
25 | }
26 | },
27 | "authors": [
28 | {
29 | "name": "Mahdi",
30 | "email": "me@hazaveh.net"
31 | }
32 | ],
33 | "config": {
34 | "allow-plugins": {
35 | "pestphp/pest-plugin": true
36 | }
37 | },
38 | "scripts": {
39 | "test": "@php vendor/bin/pest",
40 | "analyze": "@php vendor/bin/phpstan analyze",
41 | "codestyle": "@php vendor/bin/php-cs-fixer fix --config=.php-cs-fixer.php --verbose --diff --dry-run"
42 | }
43 | }
44 |
--------------------------------------------------------------------------------
/tests/Unit/Extractors/FaviconExtractor.php:
--------------------------------------------------------------------------------
1 | createSiteParserWithMockFakeHttpClient([
7 | new Response(body: '')
8 | ]);
9 |
10 | $link = $parser->parse('https://hazaveh.net');
11 |
12 | expect($link->icon)->toBe('https://hazaveh.net/favicon.ico');
13 | });
14 |
15 | test('it handles absolute path', function () {
16 | $parser = $this->createSiteParserWithMockFakeHttpClient([
17 | new Response(body: '')
18 | ]);
19 |
20 | $link = $parser->parse('https://hazaveh.net');
21 |
22 | expect($link->icon)->toBe('https://hazaveh.net/favicon.ico');
23 | });
24 |
25 | test('it handles trailing slashes', function () {
26 | $parser = $this->createSiteParserWithMockFakeHttpClient([
27 | new Response(body: '')
28 | ]);
29 |
30 | $link = $parser->parse('https://hazaveh.net/');
31 |
32 | expect($link->icon)->toBe('https://hazaveh.net/favicon.ico');
33 | });
34 |
--------------------------------------------------------------------------------
/tests/Unit/SiteParserTest.php:
--------------------------------------------------------------------------------
1 | parse('http:/google.com');
11 | })->expectException(InvalidURLException::class);
12 |
13 | test('it can use a custom http client', function () {
14 | $httpClient = new \GuzzleHttp\Client();
15 | $parser = new \Hazaveh\LinkPreview\Parsers\SiteParser();
16 | $parser->setClient($httpClient);
17 | expect($parser->client())->toBe($httpClient); // identity check: the very same instance must come back
18 | });
19 |
20 | test('it can parse correctly', function () {
21 | /** @var \Hazaveh\LinkPreview\Parsers\SiteParser $parser */
22 | $parser = $this->createSiteParserWithMockFakeHttpClient([
23 | new Response(body: file_get_contents(\Pest\testDirectory('Stubs/response.html')))
24 | ]);
25 |
26 | $link = $parser->parse('https://hazaveh.net');
27 |
28 | assertEquals("Your Page Title", $link->title);
29 | assertEquals("Your page description goes here.", $link->description);
30 | assertEquals("https://example.com/your-image.jpg", $link->image);
31 | assertEquals("https://example.com/favicon.ico", $link->icon);
32 | });
33 |
--------------------------------------------------------------------------------
/tests/Stubs/response.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 | Your Page Title
24 |
25 |
26 |
27 |
28 |
29 |
--------------------------------------------------------------------------------
/tests/Pest.php:
--------------------------------------------------------------------------------
1 | in('Feature', 'Unit');
16 |
17 | /*
18 | |--------------------------------------------------------------------------
19 | | Expectations
20 | |--------------------------------------------------------------------------
21 | |
22 | | When you're writing tests, you often need to check that values meet certain conditions. The
23 | | "expect()" function gives you access to a set of "expectations" methods that you can use
24 | | to assert different things. Of course, you may extend the Expectation API at any time.
25 | |
26 | */
27 |
28 | expect()->extend('toBeOne', function () {
29 | return $this->toBe(1); // custom expectation: delegates to toBe(1)
30 | });
31 |
32 | /*
33 | |--------------------------------------------------------------------------
34 | | Functions
35 | |--------------------------------------------------------------------------
36 | |
37 | | While Pest is very powerful out-of-the-box, you may have some testing code specific to your
38 | | project that you don't want to repeat in every file. Here you can also expose helpers as
39 | | global functions to help you to reduce the number of lines of code in your test files.
40 | |
41 | */
42 |
43 | function something()
44 | {
45 | // .. empty placeholder helper; nothing visible in the test files calls it
46 | }
47 |
--------------------------------------------------------------------------------
/tests/Unit/ClientTest.php:
--------------------------------------------------------------------------------
1 | parser);
17 | });
18 |
19 | test('it accepts a custom parser', function () {
20 | class CustomParser implements ParserInterface
21 | {
22 | public function parse(string $url): Link
23 | {
24 | return new Link(url: $url);
25 | }
26 | }
27 |
28 | $client = new Client(new CustomParser());
29 |
30 | assertInstanceOf(CustomParser::class, $client->parser);
31 | });
32 |
33 | test('it can visit and parse a page', function () {
34 | $parser = $this->createSiteParserWithMockFakeHttpClient([
35 | new Response(body: file_get_contents(\Pest\testDirectory('Stubs/response.html')))
36 | ]);
37 |
38 | $client = new Client($parser);
39 |
40 | $url = "https://hazaveh.net";
41 | $link = $client->parse($url);
42 |
43 | assertInstanceOf(Link::class, $link);
44 |
45 | assertEquals($url, $link->url);
46 | });
47 |
48 | test('it handles http errors', function () {
49 | $url = "https://hazaveh.net";
50 | // Two results are queued: a 404 response first, then a RequestException.
51 | $parser = $this->createSiteParserWithMockFakeHttpClient([
52 | new Response(404, ['Content-Length' => 0]),
53 | new RequestException('Error Communicating with Server', new Request('GET', $url))
54 | ]);
55 |
56 | $client = new Client($parser);
57 |
58 | $link = $client->parse($url);
59 |
60 | assertInstanceOf(Link::class, $link);
61 | // A non-2xx status is surfaced through the Link's error field rather than thrown.
62 | assertEquals(404, $link->error);
63 | // The second call hits the RequestException; built without a response, its getCode() is presumably 0.
64 | $link = $client->parse($url);
65 |
66 | assertEquals(0, $link->error);
67 | });
68 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | # PHP Link Preview
6 | PHP Link Preview is a small library that crawls a URL and returns its Open Graph and meta tags. You can use it in your applications to display a link preview, similar to what social media sites or WhatsApp show when you paste a link.
7 |
8 | ### Current Information
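9 | * `title`: Twitter or Open Graph title; falls back to the `itemprop="name"` meta tag, then to the page `<title>`
10 | * `description`: Twitter or Open Graph description; falls back to the `itemprop="description"` and `name="description"` meta tags
11 | * `image`: Twitter or Open Graph image; falls back to the `itemprop="image"` meta tag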
12 | * `icon`: favicon (if icon is explicitly specified in the HTML source)
13 |
14 | ## Dependencies
15 | * PHP >= 8.1
16 | * Guzzle >= 6
17 | * Symfony DomCrawler >= 3.0
18 | * Symfony CssSelector >= 3.0
19 |
20 | ## Installation
21 | Simply run via composer:
22 |
23 | composer require hazaveh/php-link-preview
24 |
25 | ## Usage
26 | Create an instance of `Client` and use its `parse` method to crawl a URL.
27 | ```php
28 | use Hazaveh\LinkPreview\Client;
29 |
30 | require_once 'vendor/autoload.php';
31 |
32 | $client = new Client();
33 |
34 | /**
35 | * Returns an instance of Hazaveh\LinkPreview\Model\Link
36 | * {title, description, image, icon, locale}
37 | */
38 |
39 | $preview = $client->parse("https://hazaveh.net/2023/07/re-inventing-bookmarks-for-teams/");
40 | ```
41 |
42 | ## Using Custom Parser
43 | Out of the box, this library ships with a parser that uses the included extractor classes to pull different pieces of information from the page. You can instead supply a custom parser that implements `ParserInterface` and contains your own extraction logic.
44 |
45 | You are also free to use a custom Link class that carries any additional information you want to collect from the website while parsing it.
46 |
47 | ```php
48 | class CustomParser implements ParserInterface
49 | {
50 | public function parse(string $url): Link
51 | {
52 | return new Link(url: $url);
53 | }
54 | }
55 |
56 | $client = new Client(new CustomParser());
57 | ```
58 |
59 | ## Contribution
60 | Do something cool and add a PR.
--------------------------------------------------------------------------------
/src/Parsers/SiteParser.php:
--------------------------------------------------------------------------------
1 | validate($url);
36 | $html = $this->visit($url);
37 |
38 | if (!$html) {
39 | return new Link(url: $url, description: "Invalid response code {$this->errorCode}", error: $this->errorCode);
40 | }
41 |
42 | $data = $this->extractTags($url, $html);
43 |
44 | return new Link($url, $data['title'], $data['description'], $data['image'], $data['icon'], locale: $data['locale']);
45 |
46 | }
47 |
48 | /** Returns the body of a 2xx response; otherwise stores the status or exception code in errorCode and returns false. */
49 | private function visit(string $url): string | bool
50 | {
51 | try {
52 | /** @phpstan-ignore-next-line */
53 | $response = $this->client()->get($url);
54 | $status = $response->getStatusCode();
55 | $succeeded = $status >= 200 && $status < 300;
56 | if ($succeeded) {
57 | return $response->getBody()->getContents();
58 | }
59 | $this->errorCode = $status;
60 | } catch (GuzzleException $failure) {
61 | $this->errorCode = $failure->getCode();
62 | }
63 | return false;
64 | }
65 |
66 | /**
67 | * Runs every registered extractor over the page and keys each result by the extractor's name().
68 | *
69 | * @param string $url page URL; used as the crawler URI, and its scheme, host and port form the base href
70 | * @param string $html markup of the fetched page
71 | * @return array{title: string, description: string, image: string, icon: string, locale: string}
72 | */
73 | private function extractTags(string $url, string $html): array
74 | {
75 | $urlParts = parse_url($url);
76 | $origin = $urlParts['scheme'] . '://' . $urlParts['host'];
77 | if (isset($urlParts['port'])) {
78 | // Keep an explicit port so the base href points at the same origin as $url.
79 | $origin .= ':' . $urlParts['port'];
80 | }
81 | $crawler = new Crawler(uri: $url, baseHref: $origin);
82 | $crawler->addHtmlContent($html);
83 |
84 | $extracted = [];
85 | foreach ($this->getExtractors() as $extractor) {
86 | /** @var class-string<ExtractorInterface> $extractor */
87 | $extracted[$extractor::name()] = $extractor::extract($crawler);
88 | }
89 |
90 | return $extracted;
91 | }
92 |
93 | /**
94 | * Returns the HTTP client in use, creating a default Client on first call when none was set.
95 | * The default sends a desktop-browser User-Agent and has http_errors switched off.
96 | */
97 | public function client(): ClientInterface
98 | {
99 | return $this->httpClient ??= new Client([
100 | 'http_errors' => false,
101 | 'headers' => [
102 | 'User-Agent' => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
103 | ]
104 | ]);
105 | }
106 |
107 | /**
108 | * Replaces the HTTP client returned by client() and used by visit(), e.g. one built with your own options.
109 | * NOTE(review): visit() calls get() on this client, so it must provide that method; PSR-7 only defines
110 | * HTTP messages, so confirm which ClientInterface (Guzzle's or PSR-18's) this file actually imports.
111 | */
112 | public function setClient(ClientInterface $client): void
113 | {
114 | $this->httpClient = $client;
115 | }
116 |
117 | /**
118 | * @throws InvalidURLException when $url does not pass PHP's FILTER_VALIDATE_URL filter
119 | */
120 | public function validate(string $url): void
121 | {
122 | if (!filter_var($url, FILTER_VALIDATE_URL)) {
123 | throw new InvalidURLException($url);
124 | }
125 | }
126 |
127 | /**
128 | * @return array<string, class-string> extractor class names, each keyed by that extractor's name()
129 | */
130 | public function getExtractors(): array
131 | {
132 | return [
133 | TitleExtractor::name() => TitleExtractor::class,
134 | DescriptionExtractor::name() => DescriptionExtractor::class,
135 | ImageExtractor::name() => ImageExtractor::class,
136 | FaviconExtractor::name() => FaviconExtractor::class,
137 | LocaleExtractor::name() => LocaleExtractor::class
138 | ];
139 | }
140 | }
141 |
--------------------------------------------------------------------------------