├── composer.json ├── LICENSE ├── README.md └── src └── WarcReader.php /composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "mixnode/mixnode-warcreader-php", 3 | "description": "Read Web ARChive (WARC) files in PHP.", 4 | "type": "library", 5 | "license": "Apache-2.0", 6 | "authors": [ 7 | { 8 | "name": "Nariman Jelveh", 9 | "email": "nj@mixnode.com" 10 | } 11 | ], 12 | "autoload": { 13 | "psr-4": { "Mixnode\\" : "src/" } 14 | }, 15 | "minimum-stability": "dev", 16 | "require": {} 17 | } 18 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017 Mixnode Technologies Inc. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Mixnode WARC Reader for PHP 2 | This library allows developers to read Web ARChive (WARC) files in PHP. 
3 | 4 | ## Installation Guide 5 | 6 | We recommend [Composer](http://getcomposer.org) for installing this package: 7 | 8 | ```bash 9 | curl -sS https://getcomposer.org/installer | php 10 | ``` 11 | 12 | Once done, run the Composer command to install Mixnode WARC Reader for PHP: 13 | 14 | ```bash 15 | php composer.phar require mixnode/mixnode-warcreader-php 16 | ``` 17 | 18 | After installing, you need to require Composer's autoloader in your code: 19 | 20 | ```php 21 | require 'vendor/autoload.php'; 22 | ``` 23 | 24 | You can then later update Mixnode WARC Reader using composer: 25 | 26 | ```bash 27 | composer.phar update 28 | ``` 29 | 30 | ## A Simple Example 31 | 32 | ```php 33 | nextRecord()) != FALSE){ 42 | // A WARC record is broken into two parts: header and content. 43 | // header contains metadata about content, while content is the actual resource captured. 44 | print_r($record['header']); 45 | print_r($record['content']); 46 | echo "------------------------------------\n"; 47 | } 48 | ``` 49 | -------------------------------------------------------------------------------- /src/WarcReader.php: -------------------------------------------------------------------------------- 1 | fp = $fp; 36 | $this->warc_path = $warc_file_path; 37 | } 38 | } 39 |

    /**
     * Reads the next WARC record from the open archive handle.
     *
     * A record is read as a run of "Name: value" header lines, a blank line,
     * and then a content block whose size in bytes is taken from the
     * Content-Length header.
     *
     * @return array|bool Array with the keys 'header' (associative array of
     *                    header fields; a "WARC/1.0" or "WARC/1.1" line is
     *                    stored under 'version') and 'content' (the raw
     *                    content block), or FALSE when no record is left.
     */
    public function nextRecord()
    {
        if (!is_resource($this->fp)) {
            return FALSE;
        }

        // Skip the blank separator lines left over from the previous record.
        // Detecting the end of the archive here (fgets() returning FALSE) is
        // more reliable than feof(), which can still report FALSE after the
        // last record has been consumed.
        do {
            $line = fgets($this->fp);
            if ($line === FALSE) {
                return FALSE;
            }
        } while (trim($line) === '');

        // Header block: every line up to the first empty line.
        $warc_header = array();
        while ($line !== FALSE && rtrim($line, "\r\n") !== '') {
            $split_parts = explode(': ', $line, 2);
            $field = trim($split_parts[0]);
            if ($field === 'WARC/1.0' || $field === 'WARC/1.1') {
                $warc_header['version'] = $field;
            } else {
                // A line without ": " has no value part; store an empty
                // string, as before, but without relying on "@" suppression.
                $warc_header[$field] = isset($split_parts[1]) ? trim($split_parts[1]) : '';
            }
            $line = fgets($this->fp);
        }

        // Content block. A missing or zero Content-Length yields an empty
        // block instead of passing an invalid length to fread().
        $remaining = isset($warc_header['Content-Length'])
            ? (int) $warc_header['Content-Length']
            : 0;
        $warc_content_block = '';
        // fread() may hand back fewer bytes than requested, so keep reading
        // until the declared length is reached or the stream runs dry.
        while ($remaining > 0) {
            $chunk = fread($this->fp, $remaining);
            if ($chunk === FALSE || $chunk === '') {
                break;
            }
            $warc_content_block .= $chunk;
            $remaining -= strlen($chunk);
        }

        // The blank lines that terminate the record are skipped at the start
        // of the next call.
        return array(
            'header'  => $warc_header,
            'content' => $warc_content_block,
        );
    }

    /**
     * Closes the archive handle when the reader is destroyed.
     */
    function __destruct() {
        // Only close a handle that is actually open.
        if (is_resource($this->fp)) {
            fclose($this->fp);
        }
    }
}
--------------------------------------------------------------------------------