├── ffi
    ├── .gitignore
    ├── Cargo.toml
    ├── src
    │   ├── types.rs
    │   ├── metadata.rs
    │   ├── stream.rs
    │   ├── errors.rs
    │   ├── lib.rs
    │   ├── config.rs
    │   └── extractor.rs
    ├── examples
    │   ├── basic_extraction.c
    │   ├── pdf_with_ocr.c
    │   ├── streaming_extraction.c
    │   └── README.md
    ├── cbindgen.toml
    └── build.rs
├── .prettierrc
├── tests
    ├── ffi
    │   ├── test_ffi_interface
    │   ├── Makefile
    │   └── test_ffi_interface.c
    ├── README.md
    └── go
    │   └── integration_test.go
├── copy.sh
├── go.mod
├── .gitignore
├── examples
    ├── basic
    │   └── main.go
    ├── streaming
    │   └── main.go
    └── config
    │   └── main.go
├── go.sum
├── cgo.go
├── .github
    └── workflows
    │   ├── scripts
    │       ├── build-local.sh
    │       └── collect-libs.sh
    │   ├── release.yml
    │   └── build.yml
├── check_native_libs.go
├── stream.go
├── metadata.go
├── README.md
├── errors.go
├── LICENSE
├── types.go
├── extractous.h
└── cmd
    └── install
        └── main.go


/ffi/.gitignore:
--------------------------------------------------------------------------------
1 | /target
2 | 


--------------------------------------------------------------------------------
/.prettierrc:
--------------------------------------------------------------------------------
1 | {
2 |     "tabWidth": 2
3 | }
4 | 


--------------------------------------------------------------------------------
/tests/ffi/test_ffi_interface:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rahulpoonia29/extractous-go/HEAD/tests/ffi/test_ffi_interface


--------------------------------------------------------------------------------
/copy.sh:
--------------------------------------------------------------------------------
1 | cp /mnt/c/Users/Rahul\ Poonia/Downloads/extractous-ffi-linux_amd64.zip ~/dev/extractous-go/
2 | 
3 | unzip extractous-ffi-linux_amd64.zip -d extractous-ffi-linux_amd64
4 | 
5 | cd extractous-ffi-linux_amd64
6 | 
7 | cp ./lib ../../benchmark/native/linux_amd64/lib -r
8 | 


--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
 1 | module github.com/rahulpoonia29/extractous-go
 2 | 
 3 | go 1.25.1
 4 | 
 5 | require (
 6 | 	github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db // indirect
 7 | 	github.com/rivo/uniseg v0.4.7 // indirect
 8 | 	github.com/schollz/progressbar/v3 v3.18.0 // indirect
 9 | 	golang.org/x/sys v0.37.0 // indirect
10 | 	golang.org/x/term v0.36.0 // indirect
11 | )
12 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Binaries for programs and headers
 2 | *.exe
 3 | *.exe~
 4 | *.dll
 5 | *.so
 6 | *.dylib
 7 | ffi/*.h
 8 | native/*
 9 | dist/
10 | 
11 | # Test binary, built with `go test -c`
12 | *.test
13 | 
14 | # Dependency directories
15 | native/
16 | 
17 | # Go workspace file
18 | go.work
19 | 
20 | # FFI build artifacts
21 | target/
22 | *.a
23 | *.o
24 | 
25 | # OS-specific files
26 | .DS_Store
27 | Thumbs.db
28 | 
29 | # IDE files
30 | .vscode/
31 | .idea/
32 | 
33 | # Logs
34 | *.log
35 | 
36 | # Temporary files
37 | *.tmp
38 | *.swp
39 | 


--------------------------------------------------------------------------------
/ffi/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "extractous-ffi"
 3 | version = "0.1.2"
 4 | edition = "2024"
 5 | rust-version = "1.90"
 6 | authors = ["Rahul"]
 7 | description = "C FFI bindings for extractous document extraction library"
 8 | license = "Apache-2.0"
 9 | repository = "https://github.com/rahulpoonia29/extractous-go"
10 | publish = false
11 | 
12 | [lib]
13 | name = "extractous_ffi"
14 | crate-type = ["cdylib"]
15 | 
16 | [dependencies]
17 | extractous = "0.3.0"
18 | libc = "0.2"
19 | 
20 | [build-dependencies]
21 | cbindgen = "0.29"
22 | 
23 | [profile.release]
24 | opt-level = 3
25 | lto = true
26 | codegen-units = 1
27 | strip = false
28 | panic = "abort"
29 | 
30 | [profile.dev]
31 | opt-level = 0
32 | 


--------------------------------------------------------------------------------
/examples/basic/main.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"log"
 6 | 
 7 | 	"github.com/rahulpoonia29/extractous-go"
 8 | )
 9 | 
10 | // main runs the example and exits with status 1 if any step fails.
11 | func main() {
12 | 	if err := run(); err != nil {
13 | 		log.Fatal(err)
14 | 	}
15 | }
16 | 
17 | // run extracts the text and metadata of sample.pdf and prints both.
18 | //
19 | // The work lives here rather than in main so that failures are returned
20 | // instead of being passed straight to log.Fatal: log.Fatal calls os.Exit,
21 | // which would skip the deferred extractor.Close().
22 | func run() error {
23 | 	// Create a new extractor
24 | 	extractor := extractous.New()
25 | 	if extractor == nil {
26 | 		return fmt.Errorf("Failed to create extractor")
27 | 	}
28 | 	defer extractor.Close()
29 | 
30 | 	// Extract text from a PDF file
31 | 	content, metadata, err := extractor.ExtractFileToString("sample.pdf")
32 | 	if err != nil {
33 | 		return fmt.Errorf("Extraction failed: %v", err)
34 | 	}
35 | 
36 | 	// Print the extracted content
37 | 	fmt.Println("Extracted Content:")
38 | 	fmt.Println(content)
39 | 
40 | 	// Print metadata
41 | 	fmt.Println("\nMetadata:")
42 | 	for key, values := range metadata {
43 | 		fmt.Printf("%s: %v\n", key, values)
44 | 	}
45 | 	return nil
46 | }
47 | 


--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
 1 | github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db h1:62I3jR2EmQ4l5rM/4FEfDWcRD+abF5XlKShorW5LRoQ=
 2 | github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db/go.mod h1:l0dey0ia/Uv7NcFFVbCLtqEBQbrT4OCwCSKTEv6enCw=
 3 | github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ=
 4 | github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
 5 | github.com/schollz/progressbar/v3 v3.18.0 h1:uXdoHABRFmNIjUfte/Ex7WtuyVslrw2wVPQmCN62HpA=
 6 | github.com/schollz/progressbar/v3 v3.18.0/go.mod h1:IsO3lpbaGuzh8zIMzgY3+J8l4C8GjO0Y9S69eFvNsec=
 7 | golang.org/x/sys v0.37.0 h1:fdNQudmxPjkdUTPnLn5mdQv7Zwvbvpaxqs831goi9kQ=
 8 | golang.org/x/sys v0.37.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
 9 | golang.org/x/term v0.36.0 h1:zMPR+aF8gfksFprF/Nc/rd1wRS1EI6nDBGyWAvDzx2Q=
10 | golang.org/x/term v0.36.0/go.mod h1:Qu394IJq6V6dCBRgwqshf3mPF85AqzYEzofzRdZkWss=
11 | 


--------------------------------------------------------------------------------
/examples/streaming/main.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"io"
 6 | 	"log"
 7 | 
 8 | 	"github.com/rahulpoonia29/extractous-go"
 9 | )
10 | 
11 | // main runs the example and exits with status 1 if any step fails.
12 | func main() {
13 | 	if err := run(); err != nil {
14 | 		log.Fatal(err)
15 | 	}
16 | }
17 | 
18 | // run streams the extracted text of large_document.pdf to stdout in 4 KiB
19 | // chunks and then prints the document metadata.
20 | //
21 | // Failures are returned rather than passed straight to log.Fatal, because
22 | // log.Fatal calls os.Exit and would skip the deferred Close calls.
23 | func run() error {
24 | 	// Create a new extractor
25 | 	extractor := extractous.New()
26 | 	if extractor == nil {
27 | 		return fmt.Errorf("Failed to create extractor")
28 | 	}
29 | 	defer extractor.Close()
30 | 
31 | 	// Extract to a stream for large files
32 | 	reader, metadata, err := extractor.ExtractFile("large_document.pdf")
33 | 	if err != nil {
34 | 		return fmt.Errorf("Extraction failed: %v", err)
35 | 	}
36 | 	defer reader.Close()
37 | 
38 | 	// Read and print in chunks
39 | 	buffer := make([]byte, 4096)
40 | 	for {
41 | 		n, err := reader.Read(buffer)
42 | 		// Consume the bytes first: a Read may return data together with an error.
43 | 		if n > 0 {
44 | 			fmt.Print(string(buffer[:n]))
45 | 		}
46 | 		// Only io.EOF marks the end of the stream; a (0, nil) result just
47 | 		// means nothing was read this time and must not end the loop.
48 | 		if err == io.EOF {
49 | 			break
50 | 		}
51 | 		if err != nil {
52 | 			return fmt.Errorf("Read failed: %v", err)
53 | 		}
54 | 	}
55 | 
56 | 	// Print metadata
57 | 	fmt.Println("\nMetadata:")
58 | 	for key, values := range metadata {
59 | 		fmt.Printf("%s: %v\n", key, values)
60 | 	}
61 | 	return nil
62 | }
63 | 


--------------------------------------------------------------------------------
/tests/ffi/Makefile:
--------------------------------------------------------------------------------
 1 | # Makefile for FFI Tests (header: repo root; library: ffi/target/release or native/<os>_<arch>)
 2 | 
 3 | CC = gcc
 4 | CFLAGS = -Wall -Wextra -I../.. -I../../include
 5 | LDFLAGS = \
 6 |     -L../../ffi/target/release \
 7 |     -L../../native/$(shell uname -s | tr '[:upper:]' '[:lower:]')_$(shell uname -m | sed 's/x86_64/amd64/;s/aarch64/arm64/') \
 8 |     -Wl,-rpath,../../ffi/target/release \
 9 |     -Wl,-rpath,../../native/$(shell uname -s | tr '[:upper:]' '[:lower:]')_$(shell uname -m | sed 's/x86_64/amd64/;s/aarch64/arm64/')
10 | LIBS = -lextractous_ffi -ldl -lm -lpthread
11 | 
12 | TEST_BINS = test_ffi_interface
13 | 
14 | .PHONY: all clean run help
15 | 
16 | all: $(TEST_BINS)
17 | 
18 | test_ffi_interface: test_ffi_interface.c
19 | 	$(CC) $(CFLAGS) -o $@ $< $(LDFLAGS) $(LIBS)
20 | 
21 | run: all
22 | 	@echo "Running FFI interface tests..."
23 | 	@./test_ffi_interface
24 | 
25 | clean:
26 | 	rm -f $(TEST_BINS)
27 | 	rm -f *.o
28 | 
29 | help:
30 | 	@echo "FFI Test Suite"
31 | 	@echo ""
32 | 	@echo "Targets:"
33 | 	@echo "  all     - Build all test binaries"
34 | 	@echo "  run     - Build and run all tests"
35 | 	@echo "  clean   - Remove test binaries"
36 | 	@echo "  help    - Show this help message"
37 | 


--------------------------------------------------------------------------------
/ffi/src/types.rs:
--------------------------------------------------------------------------------
 1 | use std::os::raw::{c_char, c_int};
 2 | 
 3 | /// Opaque extractor handle; the zero-sized private field keeps C code from
 4 | /// constructing or inspecting it, so it is only ever used behind a pointer.
 5 | #[repr(C)]
 6 | pub struct CExtractor {
 7 |     _private: [u8; 0],
 8 | }
 9 | /// Opaque stream reader handle (pointer-only, see `CExtractor`).
10 | #[repr(C)]
11 | pub struct CStreamReader {
12 |     _private: [u8; 0],
13 | }
14 | /// Opaque PDF parser configuration handle (pointer-only, see `CExtractor`).
15 | #[repr(C)]
16 | pub struct CPdfParserConfig {
17 |     _private: [u8; 0],
18 | }
19 | /// Opaque Office parser configuration handle (pointer-only, see `CExtractor`).
20 | #[repr(C)]
21 | pub struct COfficeParserConfig {
22 |     _private: [u8; 0],
23 | }
24 | /// Opaque Tesseract OCR configuration handle (pointer-only, see `CExtractor`).
25 | #[repr(C)]
26 | pub struct CTesseractOcrConfig {
27 |     _private: [u8; 0],
28 | }
29 | 
30 | /// Metadata as two parallel arrays: `keys[i]` pairs with `values[i]` for
31 | /// every `i < len`.
32 | #[repr(C)]
33 | pub struct CMetadata {
34 |     /// Array of pointers to null-terminated key strings
35 |     pub keys: *mut *mut c_char,
36 |     /// Array of pointers to null-terminated value strings
37 |     pub values: *mut *mut c_char,
38 |     /// The number of key-value pairs in the arrays
39 |     pub len: libc::size_t,
40 | }
41 | 
42 | // Character set selectors passed across the FFI boundary as plain ints.
43 | // NOTE(review): the values jump from 1 to 3 — confirm that skipping 2 is
44 | // intentional and matches the constants used on the Go side.
45 | pub const CHARSET_UTF_8: c_int = 0;
46 | pub const CHARSET_US_ASCII: c_int = 1;
47 | pub const CHARSET_UTF_16BE: c_int = 3;
48 | 
49 | // PDF OCR strategy selectors passed across the FFI boundary as plain ints.
50 | pub const PDF_OCR_STRATEGY_NO_OCR: c_int = 0;
51 | pub const PDF_OCR_STRATEGY_OCR_ONLY: c_int = 1;
52 | pub const PDF_OCR_STRATEGY_OCR_AND_TEXT_EXTRACTION: c_int = 2;
53 | pub const PDF_OCR_STRATEGY_AUTO: c_int = 3;
54 | 


--------------------------------------------------------------------------------
/examples/config/main.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"log"
 6 | 
 7 | 	"github.com/rahulpoonia29/extractous-go"
 8 | )
 9 | 
10 | // main runs the example and exits with status 1 if any step fails.
11 | func main() {
12 | 	if err := run(); err != nil {
13 | 		log.Fatal(err)
14 | 	}
15 | }
16 | 
17 | // run builds an extractor with PDF and OCR settings, extracts a document
18 | // from a URL, and prints its text and metadata.
19 | //
20 | // Failures are returned rather than passed straight to log.Fatal, because
21 | // log.Fatal calls os.Exit and would skip the deferred extractor.Close().
22 | func run() error {
23 | 	// Create PDF config with OCR
24 | 	pdfConfig := extractous.NewPdfConfig().
25 | 		SetOcrStrategy(extractous.PdfOcrAuto).
26 | 		SetExtractAnnotationText(true)
27 | 
28 | 	// Create OCR config
29 | 	ocrConfig := extractous.NewOcrConfig().
30 | 		SetLanguage("eng").
31 | 		SetTimeoutSeconds(60)
32 | 
33 | 	// Create extractor with configurations
34 | 	extractor := extractous.New().
35 | 		SetExtractStringMaxLength(50000).
36 | 		SetEncoding(extractous.CharSetUTF8).
37 | 		SetPdfConfig(pdfConfig).
38 | 		SetOcrConfig(ocrConfig)
39 | 	if extractor == nil {
40 | 		return fmt.Errorf("Failed to create configured extractor")
41 | 	}
42 | 	defer extractor.Close()
43 | 
44 | 	// Extract from a URL
45 | 	content, metadata, err := extractor.ExtractURLToString("https://example.com/sample.pdf")
46 | 	if err != nil {
47 | 		return fmt.Errorf("Extraction failed: %v", err)
48 | 	}
49 | 
50 | 	fmt.Println("Extracted Content:")
51 | 	fmt.Println(content)
52 | 
53 | 	fmt.Println("\nMetadata:")
54 | 	for key, values := range metadata {
55 | 		fmt.Printf("%s: %v\n", key, values)
56 | 	}
57 | 	return nil
58 | }
59 | 


--------------------------------------------------------------------------------
/ffi/examples/basic_extraction.c:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Basic extraction example
 3 |  * 
 4 |  * Demonstrates how to:
 5 |  * - Create an extractor
 6 |  * - Extract text from a file
 7 |  * - Access metadata
 8 |  * - Proper memory cleanup
 9 |  */
10 | 
11 | #include <extractous.h> // Replace with the correct path to extractous.h
12 | #include <stdio.h>
13 | #include <stdlib.h>
14 | 
15 | // Extracts document.pdf, prints its content and metadata, and returns 0 on
16 | // success or 1 if the extractor cannot be created or the extraction fails.
17 | int main(void) {
18 |     CExtractor* ext = extractous_extractor_new();
19 |     if (!ext) {
20 |         fprintf(stderr, "Failed to create extractor\n");
21 |         return 1;
22 |     }
23 | 
24 |     // Start the out-parameters at NULL so they are never read uninitialised.
25 |     char* content = NULL;
26 |     CMetadata* metadata = NULL;
27 |     int exit_code = 0;
28 | 
29 |     int result = extractous_extractor_extract_file_to_string(
30 |         ext, "document.pdf", &content, &metadata
31 |     );
32 | 
33 |     if (result == ERR_OK) {
34 |         printf("Content: %s\n", content ? content : "");
35 | 
36 |         if (metadata) {
37 |             for (size_t i = 0; i < metadata->len; i++) {
38 |                 printf("%s: %s\n", metadata->keys[i], metadata->values[i]);
39 |             }
40 |         }
41 | 
42 |         if (content) {
43 |             extractous_string_free(content);
44 |         }
45 |         extractous_metadata_free(metadata);
46 |     } else {
47 |         char* msg = extractous_error_message(result);
48 |         fprintf(stderr, "Error: %s\n", msg ? msg : "unknown error");
49 |         if (msg) {
50 |             extractous_string_free(msg);
51 |         }
52 |         // Report the failure to the caller instead of exiting with 0.
53 |         exit_code = 1;
54 |     }
55 | 
56 |     extractous_extractor_free(ext);
57 |     return exit_code;
58 | }
59 | 


--------------------------------------------------------------------------------
/cgo.go:
--------------------------------------------------------------------------------
 1 | //go:build windows || darwin || linux
 2 | // +build windows darwin linux
 3 | 
 4 | //go:generate go run check_native_libs.go
 5 | 
 6 | package extractous
 7 | 
 8 | /*
 9 | // Linux
10 | #cgo linux,amd64 CFLAGS: -I${SRCDIR}
11 | #cgo linux,amd64 LDFLAGS: -lextractous_ffi -ldl -lm -lpthread
12 | #cgo linux,arm64 CFLAGS: -I${SRCDIR}
13 | #cgo linux,arm64 LDFLAGS: -lextractous_ffi -ldl -lm -lpthread
14 | 
15 | // macOS
16 | #cgo darwin,amd64 CFLAGS: -I${SRCDIR}
17 | #cgo darwin,amd64 LDFLAGS: -lextractous_ffi -ldl -lm -lpthread
18 | #cgo darwin,arm64 CFLAGS: -I${SRCDIR}
19 | #cgo darwin,arm64 LDFLAGS: -lextractous_ffi -ldl -lm -lpthread
20 | 
21 | // Windows
22 | #cgo windows,amd64 CFLAGS: -I${SRCDIR}
23 | #cgo windows,amd64 LDFLAGS: -lextractous_ffi
24 | 
25 | 
26 | // Include the generated header
27 | #include "extractous.h"
28 | */
29 | import "C"
30 | import (
31 | 	"runtime"
32 | 	"unsafe"
33 | )
34 | 
35 | // init locks the main goroutine to the startup OS thread for JNI compatibility.
36 | // Because init functions run on the startup thread, main is then invoked on that thread too.
37 | func init() {
38 | 	runtime.LockOSThread()
39 | }
40 | 
41 | // Helper functions for C interop. A *C.char returned by cString is C-allocated and must be released with freeString.
42 | func cString(s string) *C.char {
43 | 	return C.CString(s)
44 | }
45 | 
46 | func goString(cs *C.char) string {
47 | 	return C.GoString(cs)
48 | }
49 | 
50 | func freeString(cs *C.char) {
51 | 	C.free(unsafe.Pointer(cs))
52 | }
53 | 


--------------------------------------------------------------------------------
/.github/workflows/scripts/build-local.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | set -e
 3 | 
 4 | # Local build script for extractous-ffi
 5 | # Usage: ./scripts/build-local.sh [target]
 6 | 
 7 | # Detect platform
 8 | if [ -z "$1" ]; then
 9 |     case "$(uname -s)" in
10 |         Linux*)     TARGET="x86_64-unknown-linux-gnu"; LIB_EXT="so"; PLATFORM="linux_amd64";;
11 |         Darwin*)    
12 |             if [ "$(uname -m)" = "arm64" ]; then
13 |                 TARGET="aarch64-apple-darwin"; PLATFORM="darwin_arm64"
14 |             else
15 |                 TARGET="x86_64-apple-darwin"; PLATFORM="darwin_amd64"
16 |             fi
17 |             LIB_EXT="dylib"
18 |             ;;
19 |         MINGW*|MSYS*|CYGWIN*) TARGET="x86_64-pc-windows-msvc"; LIB_EXT="dll"; PLATFORM="windows_amd64";;
20 |         *)          echo "Unknown platform"; exit 1;;
21 |     esac
22 | else
23 |     TARGET=$1
24 |     # Detect lib_ext and platform from target
25 | fi
26 | 
27 | echo "Building for: $TARGET"
28 | echo "Platform: $PLATFORM"
29 | 
30 | # Check for GraalVM
31 | if [ -z "$GRAALVM_HOME" ] && [ -z "$JAVA_HOME" ]; then
32 |     echo "Error: GRAALVM_HOME or JAVA_HOME must be set"
33 |     echo "Install GraalVM 23+ with native-image"
34 |     exit 1
35 | fi
36 | 
37 | # Build
38 | cd ffi
39 | cargo build --release --target "$TARGET"
40 | cd ..
41 | 
42 | # Collect libraries
43 | ./scripts/collect-libs.sh "$PLATFORM" "$TARGET" "$LIB_EXT"
44 | 
45 | echo ""
46 | echo "✓ Build complete!"
47 | echo "Distribution: dist/$PLATFORM/"
48 | echo ""
49 | echo "To use in Go:"
50 | echo "  export CGO_CFLAGS=\"-I$(pwd)/dist/$PLATFORM/include\""
51 | echo "  export CGO_LDFLAGS=\"-L$(pwd)/dist/$PLATFORM/lib -lextractous_ffi\""
52 | echo "  export LD_LIBRARY_PATH=\"$(pwd)/dist/$PLATFORM/lib\" # Linux"
53 | echo "  export DYLD_LIBRARY_PATH=\"$(pwd)/dist/$PLATFORM/lib\" # macOS"
54 | 


--------------------------------------------------------------------------------
/ffi/examples/pdf_with_ocr.c:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * PDF with OCR example
 3 |  * 
 4 |  * Demonstrates how to:
 5 |  * - Configure PDF parser for OCR
 6 |  * - Set OCR language and parameters
 7 |  * - Extract from scanned PDFs
 8 |  */
 9 | 
10 | #include <extractous.h>
11 | #include <stdio.h>
12 | #include <stdlib.h>
13 | 
14 | // Configures PDF parsing with OCR, extracts document.pdf, and prints its
15 | // content. Returns 0 on success and 1 if any setup step or the extraction fails.
16 | int main(void) {
17 |     // PDF config
18 |     CPdfParserConfig* pdf = extractous_pdf_config_new();
19 |     if (!pdf) {
20 |         fprintf(stderr, "Failed to create PDF config\n");
21 |         return 1;
22 |     }
23 |     pdf = extractous_pdf_config_set_ocr_strategy(pdf, PDF_OCR_STRATEGY_AUTO);
24 |     pdf = extractous_pdf_config_set_extract_annotation_text(pdf, true);
25 | 
26 |     // OCR config
27 |     CTesseractOcrConfig* ocr = extractous_ocr_config_new();
28 |     if (!ocr) {
29 |         fprintf(stderr, "Failed to create OCR config\n");
30 |         extractous_pdf_config_free(pdf);
31 |         return 1;
32 |     }
33 |     ocr = extractous_ocr_config_set_language(ocr, "eng");
34 |     ocr = extractous_ocr_config_set_density(ocr, 300);
35 | 
36 |     // Extractor
37 |     CExtractor* ext = extractous_extractor_new();
38 |     if (!ext) {
39 |         fprintf(stderr, "Failed to create extractor\n");
40 |         extractous_ocr_config_free(ocr);
41 |         extractous_pdf_config_free(pdf);
42 |         return 1;
43 |     }
44 |     ext = extractous_extractor_set_pdf_config(ext, pdf);  // Consumes pdf
45 |     ext = extractous_extractor_set_ocr_config(ext, ocr);  // Consumes ocr
46 | 
47 |     // Extract; start the out-parameters at NULL so they are never read uninitialised.
48 |     char* content = NULL;
49 |     CMetadata* metadata = NULL;
50 |     int exit_code = 0;
51 |     int result = extractous_extractor_extract_file_to_string(
52 |         ext, "document.pdf", &content, &metadata
53 |     );
54 | 
55 |     if (result == ERR_OK) {
56 |         printf("Content: %s\n", content ? content : "");
57 |         if (content) {
58 |             extractous_string_free(content);
59 |         }
60 |         extractous_metadata_free(metadata);
61 |     } else {
62 |         char* msg = extractous_error_message(result);
63 |         fprintf(stderr, "Error: %s\n", msg ? msg : "unknown error");
64 |         if (msg) {
65 |             extractous_string_free(msg);
66 |         }
67 |         // Report the failure to the caller instead of exiting with 0.
68 |         exit_code = 1;
69 |     }
70 | 
71 |     extractous_extractor_free(ext);
72 |     return exit_code;
73 | }
74 | 


--------------------------------------------------------------------------------
/ffi/examples/streaming_extraction.c:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Streaming extraction example
 3 |  * 
 4 |  * Demonstrates how to:
 5 |  * - Extract large files using streaming
 6 |  * - Process content in chunks
 7 |  * - Avoid loading entire file into memory
 8 |  */
 9 | 
10 | #include <extractous.h>
11 | #include <stdio.h>
12 | #include <stdlib.h>
13 | 
14 | #define BUFFER_SIZE 4096
15 | 
16 | // Streams the extracted text of the file named on the command line to stdout
17 | // in BUFFER_SIZE chunks. Returns 0 on success and 1 on a usage error, a
18 | // failure to start the extraction, or a read error part-way through.
19 | int main(int argc, char** argv) {
20 |     if (argc != 2) {
21 |         fprintf(stderr, "Usage: %s <file_path>\n", argv[0]);
22 |         return 1;
23 |     }
24 | 
25 |     const char* file_path = argv[1];
26 | 
27 |     // Create extractor
28 |     CExtractor* extractor = extractous_extractor_new();
29 |     if (!extractor) {
30 |         fprintf(stderr, "Failed to create extractor\n");
31 |         return 1;
32 |     }
33 | 
34 |     // Extract to stream
35 |     CStreamReader* reader = NULL;
36 |     CMetadata* metadata = NULL;
37 | 
38 |     printf("Streaming extraction from: %s\n", file_path);
39 | 
40 |     int err = extractous_extractor_extract_file(
41 |         extractor,
42 |         file_path,
43 |         &reader,
44 |         &metadata
45 |     );
46 | 
47 |     if (err != ERR_OK) {
48 |         char* error_msg = extractous_error_message(err);
49 |         fprintf(stderr, "Failed to start extraction (code %d): %s\n",
50 |                 err, error_msg ? error_msg : "unknown error");
51 |         if (error_msg) {
52 |             extractous_string_free(error_msg);
53 |         }
54 |         extractous_extractor_free(extractor);
55 |         return 1;
56 |     }
57 | 
58 |     // Print metadata first
59 |     if (metadata) {
60 |         printf("\n=== Metadata (%zu entries) ===\n", metadata->len);
61 |         for (size_t i = 0; i < metadata->len; i++) {
62 |             printf("%s: %s\n", metadata->keys[i], metadata->values[i]);
63 |         }
64 |     }
65 | 
66 |     // Stream content in chunks
67 |     printf("\n=== Content ===\n");
68 | 
69 |     char buffer[BUFFER_SIZE];
70 |     size_t bytes_read = 0;
71 |     size_t total_bytes = 0;
72 |     int read_err = ERR_OK;
73 | 
74 |     for (;;) {
75 |         bytes_read = 0;
76 |         read_err = extractous_stream_read(reader, (uint8_t*)buffer, BUFFER_SIZE, &bytes_read);
77 |         // Stop on error or on end of stream (a successful read of zero bytes).
78 |         if (read_err != ERR_OK || bytes_read == 0) {
79 |             break;
80 |         }
81 |         // Process chunk (here we just print it)
82 |         fwrite(buffer, 1, bytes_read, stdout);
83 |         total_bytes += bytes_read;
84 |     }
85 | 
86 |     printf("\n\n=== Summary ===\n");
87 |     printf("Total bytes read: %zu\n", total_bytes);
88 | 
89 |     // Cleanup
90 |     extractous_stream_free(reader);
91 |     extractous_metadata_free(metadata);
92 |     extractous_extractor_free(extractor);
93 | 
94 |     // Keep the loop's exit reason so a read failure is not reported as success.
95 |     if (read_err != ERR_OK) {
96 |         char* error_msg = extractous_error_message(read_err);
97 |         fprintf(stderr, "Read failed (code %d): %s\n",
98 |                 read_err, error_msg ? error_msg : "unknown error");
99 |         if (error_msg) {
100 |             extractous_string_free(error_msg);
101 |         }
102 |         return 1;
103 |     }
104 | 
105 |     printf("Streaming extraction successful!\n");
106 |     return 0;
107 | }
108 | 


--------------------------------------------------------------------------------
/check_native_libs.go:
--------------------------------------------------------------------------------
 1 | //go:build ignore
 2 | 
 3 | package main
 4 | 
 5 | import (
 6 | 	"fmt"
 7 | 	"os"
 8 | 	"path/filepath"
 9 | 	"runtime"
10 | )
11 | 
12 | func main() {
13 | 	// This script is run by `go generate` to check that the native libraries
14 | 	// required for CGO exist and can be accessed. If they cannot, it prints a
15 | 	// helpful error message to guide the user and exits with status 1.
16 | 
17 | 	_, currentFile, _, ok := runtime.Caller(0)
18 | 	if !ok {
19 | 		fmt.Fprintln(os.Stderr, "Error: Cannot determine file path. Please run the installer.")
20 | 		os.Exit(1)
21 | 	}
22 | 
23 | 	projectRoot := filepath.Dir(currentFile)
24 | 	nativeDir := filepath.Join(projectRoot, "native")
25 | 
26 | 	// Any Stat failure (missing, permission denied, ...) means the directory is unusable
27 | 	if _, err := os.Stat(nativeDir); err != nil {
28 | 		printError()
29 | 		os.Exit(1)
30 | 	}
31 | 
32 | 	// Check for the platform-specific library directory
33 | 	platform := fmt.Sprintf("%s_%s", runtime.GOOS, runtime.GOARCH)
34 | 	libDir := filepath.Join(nativeDir, platform)
35 | 
36 | 	if _, err := os.Stat(libDir); err != nil {
37 | 		fmt.Fprintf(os.Stderr, "\n")
38 | 		fmt.Fprintf(os.Stderr, "Error: Native libraries not found for %s!\n", platform)
39 | 		fmt.Fprintf(os.Stderr, "Expected library directory: %s\n", libDir)
40 | 		fmt.Fprintf(os.Stderr, "\n")
41 | 		printError()
42 | 		os.Exit(1)
43 | 	}
44 | 
45 | 	// Verify the actual library file exists and is accessible
46 | 	var libFile string
47 | 	switch runtime.GOOS {
48 | 	case "windows":
49 | 		libFile = filepath.Join(libDir, "extractous_ffi.dll")
50 | 	case "darwin":
51 | 		libFile = filepath.Join(libDir, "libextractous_ffi.dylib")
52 | 	default: // linux
53 | 		libFile = filepath.Join(libDir, "libextractous_ffi.so")
54 | 	}
55 | 
56 | 	if _, err := os.Stat(libFile); err != nil {
57 | 		fmt.Fprintf(os.Stderr, "Error: Library file not found: %s\n", libFile)
58 | 		printError()
59 | 		os.Exit(1)
60 | 	}
61 | 
62 | 	// Success - libraries are present
63 | 	fmt.Printf("✓ Native libraries verified for %s\n", platform)
64 | 	fmt.Printf("  Library: %s\n", libFile)
65 | }
66 | 
67 | // printError writes the installer instructions to stderr, one line at a time.
68 | func printError() {
69 | 	lines := []string{
70 | 		"Error: Native FFI libraries not found!",
71 | 		"This project uses CGO and requires pre-compiled native libraries",
72 | 		"that were not found in the 'native/' directory.",
73 | 		"",
74 | 		"To fix this, please run the installer command from your project root:",
75 | 		"",
76 | 		"  go run github.com/rahulpoonia29/extractous-go/cmd/install@latest",
77 | 		"",
78 | 		"This will download the correct libraries for your platform.",
79 | 		"After running the installer, try your build again.",
80 | 	}
81 | 	for _, line := range lines {
82 | 		fmt.Fprintln(os.Stderr, line)
83 | 	}
84 | }
85 | 


--------------------------------------------------------------------------------
/ffi/src/metadata.rs:
--------------------------------------------------------------------------------
 1 | use crate::types::CMetadata;
 2 | use std::collections::HashMap;
 3 | use std::ffi::CString;
 4 | use std::os::raw::c_char;
 5 | use std::ptr;
 6 | 
 7 | /// Convert a Rust HashMap to a C-compatible metadata structure.
 8 | ///
 9 | /// Each key becomes one NUL-terminated C string, and its values are joined
10 | /// with `", "` into a single NUL-terminated C string. Entries whose key or
11 | /// joined value contains an interior NUL byte cannot be represented as a C
12 | /// string and are skipped.
13 | ///
14 | /// The returned pointer is never null. When no entry survives, `keys` and
15 | /// `values` are null and `len` is 0. Ownership passes to the caller, who
16 | /// must release the structure with `extractous_metadata_free`.
17 | pub(crate) unsafe fn metadata_to_c(metadata: HashMap<String, Vec<String>>) -> *mut CMetadata {
18 |     let capacity = metadata.len();
19 |     let mut keys: Vec<*mut c_char> = Vec::with_capacity(capacity);
20 |     let mut values: Vec<*mut c_char> = Vec::with_capacity(capacity);
21 | 
22 |     for (key, value_vec) in metadata {
23 |         // Convert both strings before releasing either to a raw pointer, so a
24 |         // rejected entry is dropped normally and needs no manual cleanup.
25 |         let Ok(c_key) = CString::new(key) else {
26 |             continue;
27 |         };
28 |         let Ok(c_value) = CString::new(value_vec.join(", ")) else {
29 |             continue;
30 |         };
31 | 
32 |         keys.push(c_key.into_raw());
33 |         values.push(c_value.into_raw());
34 |     }
35 | 
36 |     // The length comes from the populated vectors, so it always matches the
37 |     // number of allocated pointers even when entries were skipped.
38 |     let final_len = keys.len();
39 |     assert_eq!(final_len, values.len());
40 | 
41 |     if final_len == 0 {
42 |         return Box::into_raw(Box::new(CMetadata {
43 |             keys: ptr::null_mut(),
44 |             values: ptr::null_mut(),
45 |             len: 0,
46 |         }));
47 |     }
48 | 
49 |     // A boxed slice is allocated for exactly `final_len` elements, so the
50 |     // free function can rebuild it from the pointer and length alone.
51 |     // `shrink_to_fit` gives no such guarantee about the remaining capacity.
52 |     let keys_ptr = Box::into_raw(keys.into_boxed_slice()) as *mut *mut c_char;
53 |     let values_ptr = Box::into_raw(values.into_boxed_slice()) as *mut *mut c_char;
54 | 
55 |     Box::into_raw(Box::new(CMetadata {
56 |         keys: keys_ptr,
57 |         values: values_ptr,
58 |         len: final_len,
59 |     }))
60 | }
61 | 
62 | /// Frees a metadata structure and all associated memory.
63 | ///
64 | /// Passing null is a no-op. The pointer must have been produced by
65 | /// `metadata_to_c` and must not be used again after this call.
66 | #[unsafe(no_mangle)]
67 | pub unsafe extern "C" fn extractous_metadata_free(metadata: *mut CMetadata) {
68 |     if metadata.is_null() {
69 |         return;
70 |     }
71 | 
72 |     // Take ownership of CMetadata struct; it is freed when `m` goes out of scope.
73 |     let m = unsafe { Box::from_raw(metadata) };
74 | 
75 |     unsafe {
76 |         free_string_array(m.keys, m.len);
77 |         free_string_array(m.values, m.len);
78 |     }
79 | }
80 | 
81 | /// Frees an array of `len` C strings allocated by `metadata_to_c`, together
82 | /// with the array itself.
83 | ///
84 | /// Empty metadata stores a null array pointer; rebuilding an owning
85 | /// container from a null pointer is undefined behaviour, so that case
86 | /// returns early.
87 | unsafe fn free_string_array(array: *mut *mut c_char, len: usize) {
88 |     if array.is_null() || len == 0 {
89 |         return;
90 |     }
91 | 
92 |     let entries = unsafe { Box::from_raw(ptr::slice_from_raw_parts_mut(array, len)) };
93 |     for entry in entries.into_vec() {
94 |         if !entry.is_null() {
95 |             // Reclaim and drop each CString to free its buffer.
96 |             drop(unsafe { CString::from_raw(entry) });
97 |         }
98 |     }
99 | }
100 | 


--------------------------------------------------------------------------------
/ffi/cbindgen.toml:
--------------------------------------------------------------------------------
  1 | header = """
  2 | /* 
  3 |  * Extractous FFI - C Interface
  4 |  * 
  5 |  * This header file provides a C-compatible interface to the Extractous
  6 |  * document extraction library. It is safe for use with Go via cgo or any
  7 |  * C-compatible FFI system.
  8 |  *
  9 |  * License: Apache-2.0
 10 |  * Repository: https://github.com/rahulpoonia29/extractous-go
 11 |  *
 12 |  * MEMORY MANAGEMENT:
 13 |  * Every pointer returned by an Extractous function must be released with its matching free function
 14 |  * (e.g. extractous_string_free for strings, extractous_metadata_free for metadata); otherwise memory leaks.
 15 |  *
 16 |  *
 17 |  * CGO USAGE:
 18 |  *   // #cgo CFLAGS: -I${SRCDIR}/include
 19 |  *   // #cgo LDFLAGS: -L${SRCDIR}/lib -lextractous_ffi
 20 |  *   // #cgo linux LDFLAGS: -Wl,-rpath,$ORIGIN
 21 |  *   // #cgo darwin LDFLAGS: -Wl,-rpath,@loader_path
 22 |  *   // #include "extractous.h"
 23 |  *   import "C"
 24 |  */
 25 | """
 26 | 
 27 | # Generate pure C header (no C++)
 28 | language = "C"
 29 | style = "both"          # Both function declarations and typedef definitions
 30 | cpp_compat = false      # No C++ compatibility
 31 | 
 32 | # Includes
 33 | include_guard = "EXTRACTOUS_H"  # Add header guard
 34 | sys_includes = []               # No system includes needed
 35 | includes = []                   # No additional includes
 36 | 
 37 | # Warnings
 38 | documentation = true            # Include Rust doc comments
 39 | documentation_style = "c"       # Use C-style /** */ comments
 40 | 
 41 | [export]
 42 | include = []                    # Include all public items
 43 | exclude = []                    # No exclusions
 44 | prefix = ""                     # No prefix for function names
 45 | item_types = [
 46 |   "globals",
 47 |   "enums",
 48 |   "structs",
 49 |   "unions",
 50 |   "typedefs",
 51 |   "opaque",
 52 |   "functions",
 53 |   "constants",
 54 | ]
 55 | 
 56 | [layout]
 57 | packed = "false"                # No packed structs (better portability)
 58 | aligned_n = "0"                 # Natural alignment
 59 | 
 60 | [fn]
 61 | rename_args = "None"            # Keep original arg names
 62 | args = "auto"                   # Auto-detect by-value vs by-reference
 63 | must_use = "auto"               # Add [[nodiscard]] for important returns
 64 | no_return = "noreturn"          # Use noreturn attribute where applicable
 65 | 
 66 | [struct]
 67 | rename_fields = "None"          # Keep original field names
 68 | derive_constructor = false      # No C++ constructors
 69 | derive_eq = false               # No operator overloads
 70 | derive_neq = false
 71 | derive_lt = false
 72 | derive_lte = false
 73 | derive_gt = false
 74 | derive_gte = false
 75 | 
 76 | [enum]
 77 | rename_variants = "None"        # Keep original variant names
 78 | enum_class = false              # C-style enums (not enum class)
 79 | prefix_with_name = false        # No enum name prefix on variants
 80 | 
 81 | [const]
 82 | allow_static_const = true       # Allow static const declarations
 83 | allow_constexpr = false         # No constexpr (C++ feature)
 84 | 
 85 | [macro_expansion]
 86 | bitflags = false                # Don't expand bitflags macros
 87 | 
 88 | [parse]
 89 | parse_deps = false              # Don't parse dependencies
 90 | include = []
 91 | exclude = []
 92 | clean = false                   # Don't remove items
 93 | extra_bindings = []
 94 | 
 95 | [parse.expand]
 96 | crates = []
 97 | all_features = false
 98 | default_features = true
 99 | features = []
100 | 


--------------------------------------------------------------------------------
/ffi/build.rs:
--------------------------------------------------------------------------------
  1 | use std::env;
  2 | use std::fs;
  3 | use std::path::PathBuf;
  4 | 
  5 | /// Build-script entry point.
  6 | ///
  7 | /// Generates the C header, emits the RPATH linker arguments, adds the
  8 | /// extractous library directories to the link search path and registers the
  9 | /// rerun triggers. Does nothing when the `DOCS_RS` variable is set.
 10 | fn main() {
 11 |     let building_docs = env::var("DOCS_RS").is_ok();
 12 |     if building_docs {
 13 |         return;
 14 |     }
 15 | 
 16 |     // A missing variable aborts the build script with a panic.
 17 |     let crate_dir = env::var("CARGO_MANIFEST_DIR").unwrap();
 18 |     let target_triple = env::var("TARGET").unwrap();
 19 |     let build_profile = env::var("PROFILE").unwrap();
 20 | 
 21 |     println!("cargo:warning=Building extractous-ffi for target: {}", target_triple);
 22 |     println!("cargo:warning=Profile: {}", build_profile);
 23 | 
 24 |     // Order: header, RPATH, dependency libraries, rerun triggers.
 25 |     generate_header(&crate_dir);
 26 |     configure_rpath(&target_triple);
 27 |     setup_extractous_libs(&target_triple, &build_profile);
 28 |     configure_rerun_triggers();
 29 | }
 30 | 
 31 | /// Generates the C header for this crate with cbindgen.
 32 | ///
 33 | /// The header is written as `extractous.h` into the parent directory of
 34 | /// `crate_dir`. Configuration is read from `cbindgen.toml` inside `crate_dir`;
 35 | /// if it cannot be loaded, a cargo warning is emitted and cbindgen's default
 36 | /// configuration is used. A generation failure is reported as a cargo warning
 37 | /// and does not abort the build.
 38 | ///
 39 | /// Panics if `crate_dir` has no parent directory.
 40 | fn generate_header(crate_dir: &str) {
 41 |     let crate_path = PathBuf::from(crate_dir);
 42 |     let header_path = crate_path.parent().unwrap().join("extractous.h");
 43 | 
 44 |     // Anchor the config path to the crate directory instead of relying on the
 45 |     // process working directory.
 46 |     let config_path = crate_path.join("cbindgen.toml");
 47 |     let config = cbindgen::Config::from_file(&config_path).unwrap_or_else(|reason| {
 48 |         // A cargo directive must fit on a single line, so flatten the message.
 49 |         println!(
 50 |             "cargo:warning=Could not load {} ({}); using default cbindgen config",
 51 |             config_path.display(),
 52 |             reason.replace('\n', " ")
 53 |         );
 54 |         cbindgen::Config::default()
 55 |     });
 56 | 
 57 |     let generated = cbindgen::Builder::new()
 58 |         .with_crate(crate_dir)
 59 |         .with_config(config)
 60 |         .generate();
 61 | 
 62 |     match generated {
 63 |         Ok(bindings) => {
 64 |             bindings.write_to_file(&header_path);
 65 |             println!("cargo:warning=Generated C header: {}", header_path.display());
 66 |         }
 67 |         Err(e) => {
 68 |             println!("cargo:warning=Failed to generate header: {:?}", e);
 69 |         }
 70 |     }
 71 | }
 52 | 
 53 | /// Emits the platform-specific linker arguments that control where the
 54 | /// resulting library looks for its dependencies at run time.
 55 | ///
 56 | /// `target` is matched by substring: "linux" first, then "darwin"/"macos",
 57 | /// then "windows". Any other target produces no output.
 58 | fn configure_rpath(target: &str) {
 59 |     if target.contains("linux") {
 60 |         // $ORIGIN makes the search path relative to the library itself.
 61 |         let linux_args = [
 62 |             "-Wl,-rpath,$ORIGIN",
 63 |             "-Wl,-z,origin",
 64 |             "-Wl,--disable-new-dtags",
 65 |         ];
 66 |         for arg in linux_args {
 67 |             println!("cargo:rustc-link-arg={}", arg);
 68 |         }
 69 |         println!("cargo:warning=Configured Linux RPATH with $ORIGIN");
 70 |         return;
 71 |     }
 72 | 
 73 |     if target.contains("darwin") || target.contains("macos") {
 74 |         // @loader_path is the macOS counterpart of $ORIGIN.
 75 |         let macos_args = [
 76 |             "-Wl,-rpath,@loader_path",
 77 |             "-Wl,-install_name,@rpath/libextractous_ffi.dylib",
 78 |         ];
 79 |         for arg in macos_args {
 80 |             println!("cargo:rustc-link-arg={}", arg);
 81 |         }
 82 |         println!("cargo:warning=Configured macOS RPATH with @loader_path");
 83 |         return;
 84 |     }
 85 | 
 86 |     if target.contains("windows") {
 87 |         println!("cargo:warning=Windows: Using default DLL search path");
 88 |     }
 89 | }
 69 | 
 70 | /// Adds the `out/libs` directory of every `extractous-*` build output to the
 71 | /// linker search path.
 72 | ///
 73 | /// The directory three levels above `OUT_DIR` is scanned for a `build`
 74 | /// sub-directory; each entry whose name starts with `extractous-` and that
 75 | /// contains `out/libs` is reported via `cargo:rustc-link-search`. A missing or
 76 | /// unreadable `build` directory is silently ignored. Both parameters are
 77 | /// currently unused.
 78 | ///
 79 | /// Panics if `OUT_DIR` is not set.
 80 | fn setup_extractous_libs(_target: &str, _profile: &str) {
 81 |     let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap());
 82 | 
 83 |     // ancestors(): 0 = OUT_DIR itself, 3 = its great-grandparent.
 84 |     let Some(profile_dir) = out_dir.ancestors().nth(3) else {
 85 |         println!(
 86 |             "cargo:warning=Unexpected OUT_DIR layout, skipping extractous libs: {}",
 87 |             out_dir.display()
 88 |         );
 89 |         return;
 90 |     };
 91 |     let build_dir = profile_dir.join("build");
 92 | 
 93 |     let Ok(entries) = fs::read_dir(&build_dir) else {
 94 |         return;
 95 |     };
 96 | 
 97 |     for entry in entries.flatten() {
 98 |         let path = entry.path();
 99 | 
100 |         // Lossy conversion: a non-UTF-8 directory name must not panic the build.
101 |         let is_extractous_output = path
102 |             .file_name()
103 |             .is_some_and(|name| name.to_string_lossy().starts_with("extractous-"));
104 |         if !is_extractous_output {
105 |             continue;
106 |         }
107 | 
108 |         let libs_dir = path.join("out").join("libs");
109 |         if libs_dir.exists() {
110 |             println!("cargo:rustc-link-search={}", libs_dir.display());
111 |             println!("cargo:warning=Found extractous libs: {}", libs_dir.display());
112 |         }
113 |     }
114 | }
 99 | 
100 | /// Tells cargo which paths should cause this build script to run again.
101 | fn configure_rerun_triggers() {
102 |     let watched = ["src", "build.rs", "cbindgen.toml", "Cargo.toml"];
103 |     for path in watched {
104 |         println!("cargo:rerun-if-changed={}", path);
105 |     }
106 | }
106 | 


--------------------------------------------------------------------------------
/ffi/examples/README.md:
--------------------------------------------------------------------------------
  1 | # 📦 Extractous FFI Examples
  2 | 
  3 | This directory contains **C examples** demonstrating how to use the **Extractous FFI** library for text and metadata extraction from various document formats.
  4 | 
  5 | ---
  6 | 
  7 | ## Examples
  8 | 
  9 | | Example | Description |
 10 | |----------|-------------|
 11 | | **`basic_extraction.c`** | Simple file extraction with metadata |
 12 | | **`streaming_extraction.c`** | Stream large files without loading into memory |
 13 | | **`pdf_with_ocr.c`** | Extract scanned PDFs using OCR |
 14 | 
 15 | ---
 16 | 
 17 | ## Running Examples
 18 | 
 19 | ```bash
 20 | # Basic extraction — extracts text and metadata from any supported document format
 21 | ./basic_extraction document.pdf
 22 | 
 23 | # Streaming extraction — streams content from large files (>50MB)
 24 | ./streaming_extraction large_document.pdf > output.txt
 25 | 
 26 | # OCR extraction — extracts text from scanned PDFs using Tesseract OCR
 27 | ./pdf_with_ocr scanned_document.pdf
 28 | ```
 29 | 
 30 | ---
 31 | 
 32 | ## Requirements
 33 | 
 34 | **Tesseract OCR** must be installed for OCR examples to work.
 35 | 
 36 | ### Ubuntu / Debian
 37 | 
 38 | ```bash
 39 | sudo apt install tesseract-ocr tesseract-ocr-eng
 40 | ```
 41 | 
 42 | ### macOS
 43 | 
 44 | ```bash
 45 | brew install tesseract
 46 | ```
 47 | 
 48 | ### Windows
 49 | 
 50 | Download from the official repository: [https://github.com/UB-Mannheim/tesseract/wiki](https://github.com/UB-Mannheim/tesseract/wiki)
 51 | 
 52 | ---
 53 | 
 54 | ## Error Handling
 55 | 
 56 | All examples demonstrate robust error handling:
 57 | 
 58 | ```c
 59 | int err = extractous_extractor_extract_file_to_string(...);
 60 | if (err != ERR_OK) {
 61 |     char* msg = extractous_error_message(err);
 62 |     fprintf(stderr, "Error: %s\n", msg);
 63 |     extractous_string_free(msg);
 64 | }
 65 | ```
 66 | 
 67 | ---
 68 | 
 69 | ## Memory Management
 70 | 
 71 | Each example ensures proper cleanup of allocated resources:
 72 | 
 73 | ```c
 74 | // Extract
 75 | extractous_extractor_extract_file_to_string(ex, path, &content, &meta);
 76 | 
 77 | // Use content and metadata
 78 | printf("%s\n", content);
 79 | 
 80 | // Cleanup
 81 | extractous_string_free(content);
 82 | extractous_metadata_free(meta);
 83 | extractous_extractor_free(ex);
 84 | ```
 85 | 
 86 | ---
 87 | 
 88 | ## Common Issues
 89 | 
 90 | ### Library Not Found
 91 | 
 92 | If you see:
 93 | 
 94 | ```
 95 | error while loading shared libraries
 96 | ```
 97 | 
 98 | Set the library path manually:
 99 | 
100 | **Linux**
101 | 
102 | ```bash
103 | export LD_LIBRARY_PATH=../target/release:$LD_LIBRARY_PATH
104 | ./basic_extraction document.pdf
105 | ```
106 | 
107 | **macOS**
108 | 
109 | ```bash
110 | export DYLD_LIBRARY_PATH=../target/release:$DYLD_LIBRARY_PATH
111 | ./basic_extraction document.pdf
112 | ```
113 | 
114 | ---
115 | 
116 | ### OCR Not Available
117 | 
118 | If OCR examples fail with `ERR_OCR_FAILED`:
119 | 
120 | 1. **Install Tesseract:**
121 | 
122 |    ```bash
123 |    # Ubuntu/Debian
124 |    sudo apt install tesseract-ocr tesseract-ocr-eng
125 | 
126 |    # macOS
127 |    brew install tesseract
128 |    ```
129 | 
130 | 2. **Verify installation:**
131 | 
132 |    ```bash
133 |    tesseract --version
134 |    ```
135 | 
136 | ---
137 | 
138 | ## Rough skeleton
139 | 
140 | ```c
141 | #include "../../extractous.h"
142 | #include <stdio.h>
143 | 
144 | int main(int argc, char** argv) {
145 |     // 1. Create extractor
146 |     CExtractor* ex = extractous_extractor_new();
147 |     
148 |     // 2. Configure (optional)
149 |     ex = extractous_extractor_set_xml_output(ex, false);
150 |     
151 |     // 3. Extract
152 |     char* content = NULL;
153 |     CMetadata* meta = NULL;
154 |     int err = extractous_extractor_extract_file_to_string(
155 |         ex, "file.pdf", &content, &meta
156 |     );
157 |     
158 |     // 4. Check error
159 |     if (err != ERR_OK) {
160 |         char* msg = extractous_error_message(err);
161 |         fprintf(stderr, "Error: %s\n", msg);
162 |         extractous_string_free(msg);
163 |         extractous_extractor_free(ex);
164 |         return 1;
165 |     }
166 |     
167 |     // 5. Use results
168 |     printf("%s\n", content);
169 |     
170 |     // 6. Cleanup
171 |     extractous_string_free(content);
172 |     extractous_metadata_free(meta);
173 |     extractous_extractor_free(ex);
174 |     
175 |     return 0;
176 | }
177 | ```
178 | 


--------------------------------------------------------------------------------
/ffi/src/stream.rs:
--------------------------------------------------------------------------------
  1 | use crate::ecore::StreamReader as CoreStreamReader;
  2 | use crate::errors::*;
  3 | use crate::types::*;
  4 | use std::io::Read;
  5 | 
  6 | /// Reads data from a stream into a user-provided buffer.
  7 | ///
  8 | /// Performs one successful read of at most `buffer_size` bytes, so fewer bytes
  9 | /// than requested may be returned. The actual count is reported through the
 10 | /// optional `bytes_read` output parameter, which may be null.
 11 | /// Reaching the end of the stream is indicated by `ERR_OK` and `*bytes_read == 0`.
 12 | ///
 13 | /// Returns `ERR_NULL_POINTER` if `handle` or `buffer` is null, and
 14 | /// `ERR_IO_ERROR` if the underlying read fails (`*bytes_read` is then 0).
 15 | /// A `buffer_size` of 0 returns `ERR_OK` without touching the stream.
 16 | #[unsafe(no_mangle)]
 17 | pub unsafe extern "C" fn extractous_stream_read(
 18 |     handle: *mut CStreamReader,
 19 |     buffer: *mut u8,
 20 |     buffer_size: libc::size_t,
 21 |     bytes_read: *mut libc::size_t,
 22 | ) -> libc::c_int {
 23 |     if handle.is_null() || buffer.is_null() {
 24 |         return ERR_NULL_POINTER;
 25 |     }
 26 |     // Give the output a defined value on every path below.
 27 |     if !bytes_read.is_null() {
 28 |         unsafe { *bytes_read = 0 };
 29 |     }
 30 |     if buffer_size == 0 {
 31 |         return ERR_OK;
 32 |     }
 33 | 
 34 |     let reader = unsafe { &mut *(handle as *mut CoreStreamReader) };
 35 |     let buf_slice = unsafe { std::slice::from_raw_parts_mut(buffer, buffer_size) };
 36 | 
 37 |     loop {
 38 |         match reader.read(buf_slice) {
 39 |             Ok(n) => {
 40 |                 if !bytes_read.is_null() {
 41 |                     unsafe { *bytes_read = n };
 42 |                 }
 43 |                 return ERR_OK;
 44 |             }
 45 |             // An interrupted read is non-fatal and should simply be retried.
 46 |             Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue,
 47 |             Err(_) => return ERR_IO_ERROR,
 48 |         }
 49 |     }
 50 | }
 40 | 
 41 | /// Reads up to `buffer_size` bytes from the stream, looping until the buffer
 42 | /// is full.
 43 | ///
 44 | /// Reading stops early only when the end of the stream is reached or an error
 45 | /// occurs. On success `*bytes_read` holds the number of bytes stored in
 46 | /// `buffer`; a value smaller than `buffer_size` means the stream ended.
 47 | ///
 48 | /// Returns `ERR_NULL_POINTER` if any pointer argument is null, and
 49 | /// `ERR_IO_ERROR` on a non-recoverable read error (`*bytes_read` is then 0).
 50 | /// A `buffer_size` of 0 returns `ERR_OK` with `*bytes_read == 0`.
 51 | #[unsafe(no_mangle)]
 52 | pub unsafe extern "C" fn extractous_stream_read_exact(
 53 |     handle: *mut CStreamReader,
 54 |     buffer: *mut u8,
 55 |     buffer_size: libc::size_t,
 56 |     bytes_read: *mut libc::size_t,
 57 | ) -> libc::c_int {
 58 |     if handle.is_null() || buffer.is_null() || bytes_read.is_null() {
 59 |         return ERR_NULL_POINTER;
 60 |     }
 61 | 
 62 |     // Initialise the output before any early return so the caller never sees
 63 |     // an unwritten value, including for an empty buffer.
 64 |     unsafe { *bytes_read = 0 };
 65 | 
 66 |     if buffer_size == 0 {
 67 |         return ERR_OK;
 68 |     }
 69 | 
 70 |     let reader = unsafe { &mut *(handle as *mut CoreStreamReader) };
 71 |     // Slice view over the caller's buffer.
 72 |     let total_buf_slice = unsafe { std::slice::from_raw_parts_mut(buffer, buffer_size) };
 73 | 
 74 |     let mut total_bytes_read = 0;
 75 |     while total_bytes_read < buffer_size {
 76 |         // Each pass reads into the part of the buffer that is still empty.
 77 |         let remaining_buf = &mut total_buf_slice[total_bytes_read..];
 78 | 
 79 |         match reader.read(remaining_buf) {
 80 |             // A zero-length read marks the end of the stream.
 81 |             Ok(0) => break,
 82 |             Ok(n) => total_bytes_read += n,
 83 |             // Interrupted reads are recoverable: try again.
 84 |             Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue,
 85 |             // Any other I/O error is fatal.
 86 |             Err(_) => return ERR_IO_ERROR,
 87 |         }
 88 |     }
 89 | 
 90 |     unsafe { *bytes_read = total_bytes_read };
 91 |     ERR_OK
 92 | }
 93 | 
 94 | /// Reads the remaining stream into a newly allocated buffer.
 95 | ///
 96 | /// On success `*out_buffer` points to the data and `*out_size` holds its
 97 | /// length in bytes; release the buffer with `extractous_buffer_free`, passing
 98 | /// the same size. On failure `*out_buffer` is null and `*out_size` is 0.
 99 | ///
100 | /// Returns `ERR_NULL_POINTER` if any argument is null and `ERR_IO_ERROR` if
101 | /// reading fails.
102 | // #[must_use]
103 | #[unsafe(no_mangle)]
104 | pub unsafe extern "C" fn extractous_stream_read_all(
105 |     handle: *mut CStreamReader,
106 |     out_buffer: *mut *mut u8,
107 |     out_size: *mut libc::size_t,
108 | ) -> libc::c_int {
109 |     if handle.is_null() || out_buffer.is_null() || out_size.is_null() {
110 |         return ERR_NULL_POINTER;
111 |     }
112 | 
113 |     // Give the outputs defined values for the error path.
114 |     unsafe { *out_buffer = std::ptr::null_mut() };
115 |     unsafe { *out_size = 0 };
116 | 
117 |     let reader = unsafe { &mut *(handle as *mut CoreStreamReader) };
118 |     let mut data_vec = Vec::new();
119 | 
120 |     match reader.read_to_end(&mut data_vec) {
121 |         Ok(_) => {
122 |             // A boxed slice has capacity == length by construction, which is
123 |             // what freeing with (ptr, size, size) relies on; `shrink_to_fit`
124 |             // alone does not guarantee that.
125 |             let boxed: Box<[u8]> = data_vec.into_boxed_slice();
126 |             let size = boxed.len();
127 |             let ptr = Box::into_raw(boxed) as *mut u8;
128 | 
129 |             unsafe { *out_buffer = ptr };
130 |             unsafe { *out_size = size };
131 |             ERR_OK
132 |         }
133 |         Err(_) => ERR_IO_ERROR,
134 |     }
135 | }
124 | 
125 | /// Releases a byte buffer that was returned by `extractous_stream_read_all`.
126 | ///
127 | /// `size` is used as both the length and the capacity of the buffer.
128 | /// A null `buffer` or a `size` of 0 is ignored.
129 | #[unsafe(no_mangle)]
130 | pub unsafe extern "C" fn extractous_buffer_free(buffer: *mut u8, size: libc::size_t) {
131 |     let nothing_to_free = buffer.is_null() || size == 0;
132 |     if !nothing_to_free {
133 |         // Rebuild the Vec so that dropping it releases the memory.
134 |         drop(unsafe { Vec::from_raw_parts(buffer, size, size) });
135 |     }
136 | }
133 | 
134 | /// Destroys a stream reader and releases its resources.
135 | ///
136 | /// Passing a null `handle` is a no-op.
137 | #[unsafe(no_mangle)]
138 | pub unsafe extern "C" fn extractous_stream_free(handle: *mut CStreamReader) {
139 |     if handle.is_null() {
140 |         return;
141 |     }
142 |     // Taking ownership back into a Box lets its drop handler deallocate the reader.
143 |     let owned = unsafe { Box::from_raw(handle.cast::<CoreStreamReader>()) };
144 |     drop(owned);
145 | }
142 | 


--------------------------------------------------------------------------------
/ffi/src/errors.rs:
--------------------------------------------------------------------------------
  1 | use crate::ecore::Error;
  2 | use std::cell::RefCell;
  3 | use std::error::Error as StdError;
  4 | use std::ffi::CString;
  5 | use std::os::raw::{c_char, c_int};
  6 | use std::ptr;
  7 | 
  8 | pub const ERR_OK: c_int = 0; // Operation completed successfully.
  9 | pub const ERR_NULL_POINTER: c_int = -1; // A null pointer was passed as an argument.
 10 | pub const ERR_INVALID_UTF8: c_int = -2; // Invalid UTF-8 string encoding.
 11 | pub const ERR_INVALID_STRING: c_int = -3; // String conversion or allocation failed.
 12 | pub const ERR_EXTRACTION_FAILED: c_int = -4; // Document extraction failed.
 13 | pub const ERR_IO_ERROR: c_int = -5; // File system or network I/O error.
 14 | pub const ERR_INVALID_CONFIG: c_int = -6; // Invalid configuration value.
 15 | pub const ERR_INVALID_ENUM: c_int = -7; // Invalid enumeration value.
 16 | pub const ERR_UNSUPPORTED_FORMAT: c_int = -8; // Unsupported file format.
 17 | pub const ERR_OUT_OF_MEMORY: c_int = -9; // Memory allocation failed.
 18 | pub const ERR_OCR_FAILED: c_int = -10; // OCR operation failed.
 19 | 
 20 | pub(crate) fn extractous_error_to_code(err: &Error) -> c_int {
 21 |     match err {
 22 |         Error::IoError(_) => ERR_IO_ERROR,
 23 |         Error::Utf8Error(_) => ERR_INVALID_UTF8,
 24 | 
 25 |         // For unknown errors, inspect the message content
 26 |         Error::ParseError(msg) | Error::Unknown(msg) => {
 27 |             let lower_msg = msg.to_lowercase();
 28 |             if lower_msg.contains("ocr") {
 29 |                 ERR_OCR_FAILED
 30 |             } else if lower_msg.contains("unsupported") {
 31 |                 ERR_UNSUPPORTED_FORMAT
 32 |             } else if lower_msg.contains("config") {
 33 |                 ERR_INVALID_CONFIG
 34 |             } else {
 35 |                 // Default to general extraction failure
 36 |                 ERR_EXTRACTION_FAILED
 37 |             }
 38 |         }
 39 | 
 40 |         Error::JniError(jni_err) => {
 41 |             let error_string = jni_err.to_string();
 42 |             let lower_error_string = error_string.to_lowercase();
 43 | 
 44 |             if lower_error_string.contains("javaexception") {
 45 |                 // This string appears when the error is due to a Java-side exception,
 46 |                 // which is the case your `jnicallmethodlocal` handles. This is a strong
 47 |                 // indicator of a failure within Tika's processing.
 48 |                 ERR_EXTRACTION_FAILED
 49 |             } else if lower_error_string.contains("nomemory") {
 50 |                 ERR_OUT_OF_MEMORY
 51 |             } else {
 52 |                 ERR_EXTRACTION_FAILED
 53 |             }
 54 |         }
 55 | 
 56 |         Error::JniEnvCall(_) => ERR_EXTRACTION_FAILED,
 57 |     }
 58 | }
 59 | 
 60 | /// Returns a newly allocated, NUL-terminated description of `code`.
 61 | ///
 62 | /// Codes that are not recognised yield "Unknown error code". A null pointer is
 63 | /// returned if the C string cannot be created.
 64 | #[unsafe(no_mangle)]
 65 | pub extern "C" fn extractous_error_message(code: c_int) -> *mut c_char {
 66 |     let description: &'static str = match code {
 67 |         ERR_OK => "Operation completed successfully",
 68 |         ERR_NULL_POINTER => "Null pointer provided as argument",
 69 |         ERR_INVALID_UTF8 => "Invalid UTF-8 string encoding",
 70 |         ERR_INVALID_STRING => "String conversion or allocation failed",
 71 |         ERR_EXTRACTION_FAILED => "Document extraction failed",
 72 |         ERR_IO_ERROR => "File system or network I/O error",
 73 |         ERR_INVALID_CONFIG => "Invalid configuration value",
 74 |         ERR_INVALID_ENUM => "Invalid enumeration value",
 75 |         ERR_UNSUPPORTED_FORMAT => "Unsupported file format",
 76 |         ERR_OUT_OF_MEMORY => "Memory allocation failed",
 77 |         ERR_OCR_FAILED => "OCR operation failed",
 78 |         _ => "Unknown error code",
 79 |     };
 80 | 
 81 |     // Ownership of the allocation passes to the caller through the raw pointer.
 82 |     CString::new(description).map_or(ptr::null_mut(), CString::into_raw)
 83 | }
 81 | 
 82 | thread_local! {
 83 |     /// Per-thread slot for the most recent detailed error; `None` when nothing is stored.
 84 |     static LAST_ERROR: RefCell<Option<Box<dyn StdError + Send>>> = RefCell::new(None);
 85 | }
 86 | 
 87 | pub(crate) fn set_last_error(err: impl StdError + Send + 'static) {
 88 |     LAST_ERROR.with(|cell| {
 89 |         *cell.borrow_mut() = Some(Box::new(err));
 90 |     });
 91 | }
 92 | 
 93 | /// Retrieves a detailed debug report for the last error on this thread
 94 | /// full error chain and a backtrace if RUST_BACKTRACE=1
 95 | #[unsafe(no_mangle)]
 96 | pub extern "C" fn extractous_error_get_last_debug() -> *mut c_char {
 97 |     LAST_ERROR.with(|cell| {
 98 |         if let Some(err) = cell.borrow_mut().take() {
 99 |             let mut debug_output = format!("Error: {}", err);
100 |             let mut source = err.source();
101 |             if source.is_some() {
102 |                 debug_output.push_str("\n\nCaused by:");
103 |             }
104 |             let mut level = 0;
105 |             while let Some(cause) = source {
106 |                 debug_output.push_str(&format!("\n    {}: {}", level, cause));
107 |                 source = cause.source();
108 |                 level += 1;
109 |             }
110 |             debug_output.push_str(&format!("\n\nDebug Representation:\n{:?}", err));
111 |             match CString::new(debug_output) {
112 |                 Ok(s) => s.into_raw(),
113 |                 Err(_) => ptr::null_mut(),
114 |             }
115 |         } else {
116 |             ptr::null_mut()
117 |         }
118 |     })
119 | }
120 | 
121 | /// Checks if debug information is available for the current thread
122 | #[unsafe(no_mangle)]
123 | pub extern "C" fn extractous_error_has_debug() -> c_int {
124 |     LAST_ERROR.with(|cell| if cell.borrow().is_some() { 1 } else { 0 })
125 | }
126 | 
127 | #[unsafe(no_mangle)]
128 | pub extern "C" fn extractous_error_clear_last() {
129 |     LAST_ERROR.with(|cell| {
130 |         *cell.borrow_mut() = None;
131 |     });
132 | }
133 | 


--------------------------------------------------------------------------------
/tests/README.md:
--------------------------------------------------------------------------------
  1 | # Extractous-Go Test Suite
  2 | 
  3 | This directory contains comprehensive tests for the extractous-go library at multiple levels.
  4 | 
  5 | ## Test Structure
  6 | 
  7 | ```
  8 | tests/
  9 | ├── ffi/                    # FFI layer tests (C interface)
 10 | │   ├── test_ffi_interface.c
 11 | │   └── Makefile
 12 | ├── go/                     # Go binding tests
 13 | │   ├── bindings_test.go    # Unit tests for Go API
 14 | │   └── integration_test.go # Integration tests with actual files
 15 | └── testdata/               # Test files (created at runtime)
 16 | ```
 17 | 
 18 | ## Running Tests
 19 | 
 20 | ### 1. FFI Layer Tests (C)
 21 | 
 22 | First, ensure the Rust FFI library is built:
 23 | 
 24 | ```bash
 25 | cd ffi
 26 | cargo build --release
 27 | cd ..
 28 | ```
 29 | 
 30 | Then run the FFI tests:
 31 | 
 32 | ```bash
 33 | cd tests/ffi
 34 | make run
 35 | ```
 36 | 
 37 | Available make targets:
 38 | 
 39 | ```bash
 40 | make run          # Run all tests
 41 | make clean        # Clean build artifacts
 42 | ```
 43 | 
 44 | **What FFI tests validate:**
 45 | - Extractor lifecycle (new, free, double-free safety)
 46 | - Configuration functions (max length, encoding, XML output)
 47 | - PDF/Office/OCR configuration
 48 | - Error handling and null pointer safety
 49 | - URL extraction
 50 | - Memory management
 51 | 
 52 | ### 2. Go Binding Tests
 53 | 
 54 | Run all Go tests:
 55 | 
 56 | ```bash
 57 | cd tests/go
 58 | go test -v
 59 | ```
 60 | 
 61 | Run specific test files:
 62 | 
 63 | ```bash
 64 | go test -v -run TestExtractor    # Run extractor tests
 65 | go test -v -run TestPdfConfig    # Run PDF config tests
 66 | go test -v -run TestIntegration  # Run integration tests
 67 | ```
 68 | 
 69 | Run with race detection:
 70 | 
 71 | ```bash
 72 | go test -race -v
 73 | ```
 74 | 
 75 | **What Go binding tests validate:**
 76 | 
 77 | #### `bindings_test.go` - Unit Tests
 78 | - Extractor lifecycle and nil-safety
 79 | - Configuration methods (max length, encoding, XML output)
 80 | - PDF/Office/OCR configuration
 81 | - Builder pattern and method chaining
 82 | - Error handling for nil extractors
 83 | - Metadata API (Get, GetAll, Has, Keys)
 84 | - CharSet constants
 85 | 
 86 | #### `integration_test.go` - Integration Tests
 87 | - Plain text file extraction
 88 | - Byte array extraction (string and stream)
 89 | - Configuration effects (max length, encoding, XML output)
 90 | - Metadata extraction and parsing
 91 | - Error handling (nonexistent files, empty files)
 92 | - Concurrent extraction (multiple goroutines)
 93 | - Multiple extractors on same file
 94 | 
 95 | ## Test Data
 96 | 
 97 | Integration tests create temporary test files in `tests/testdata/` directory. These files are:
 98 | - Created at test runtime
 99 | - Cleaned up after each test
100 | - Simple text files for validation
101 | 
102 | ## Memory Management
103 | 
104 | **Important:** All config objects (PdfConfig, OfficeConfig, OcrConfig) use Go finalizers for automatic cleanup. You should **NOT** call any `Free()` method manually in Go code - they don't exist in the public API.
105 | 
106 | The FFI layer tests validate that the underlying C functions properly manage memory.
107 | 
108 | ## Prerequisites
109 | 
110 | ### For FFI Tests:
111 | - GCC or compatible C compiler
112 | - libextractous_ffi.so (built from Rust FFI layer)
113 | - extractous.h header file
114 | 
115 | ### For Go Tests:
116 | - Go 1.25.1 or later
117 | - CGo enabled
118 | - libextractous_ffi.so in library path or proper LD_LIBRARY_PATH
119 | 
120 | ## Troubleshooting
121 | 
122 | ### FFI Tests
123 | 
124 | **Error: `libextractous_ffi.so: cannot open shared object file`**
125 | ```bash
126 | # Ensure the library is built and in the right location
127 | cd ffi && cargo build --release
128 | # Check native/ directory for the compiled library
129 | ls -la native/*/
130 | ```
131 | 
132 | **Error: `extractous.h: No such file or directory`**
133 | ```bash
134 | # Regenerate the header with cbindgen
135 | cd ffi
136 | cbindgen --config cbindgen.toml --crate extractous-ffi --output ../extractous.h
137 | ```
138 | 
139 | ### Go Tests
140 | 
141 | **Error: `undefined reference to extractous_*`**
142 | - Ensure the FFI library is built: `cd ffi && cargo build --release`
143 | - Check that CGo can find the library (see `cgo.go` for paths)
144 | 
145 | **Error: Package import issues**
146 | - Ensure you're running tests from the `tests/go/` directory
147 | - Module path should be `extractous-go` (check `go.mod`)
148 | 
149 | **Segmentation fault**
150 | - This usually indicates a problem at the FFI boundary
151 | - Run FFI tests first to validate the C interface
152 | - Check that all CGo calls handle nil pointers correctly
153 | 
154 | ## Continuous Integration
155 | 
156 | For CI pipelines, run tests in this order:
157 | 
158 | ```bash
159 | # 1. Build FFI library
160 | cd ffi && cargo build --release && cd ..
161 | 
162 | # 2. Run FFI tests
163 | cd tests/ffi && make run && cd ../..
164 | 
165 | # 3. Run Go tests
166 | cd tests/go && go test -v -race && cd ../..
167 | ```
168 | 
169 | ## Test Coverage
170 | 
171 | To generate coverage reports for Go tests:
172 | 
173 | ```bash
174 | cd tests/go
175 | go test -coverprofile=coverage.out
176 | go tool cover -html=coverage.out
177 | ```
178 | 
179 | ## Contributing
180 | 
181 | When adding new features:
182 | 
183 | 1. **Add FFI tests first** - Validate the C interface in `tests/ffi/test_ffi_interface.c`
184 | 2. **Add Go unit tests** - Test the Go wrapper in `tests/go/bindings_test.go`
185 | 3. **Add integration tests** - Test end-to-end functionality in `tests/go/integration_test.go`
186 | 
187 | This ensures full validation from the C boundary up through the Go API.
188 | 


--------------------------------------------------------------------------------
/ffi/src/lib.rs:
--------------------------------------------------------------------------------
  1 | //! This crate provides a **C-compatible Foreign Function Interface (FFI)** for the
  2 | //! Extractous library. Extractous is a fast and efficient solution for extracting
  3 | //! content and metadata from various document formats including PDF, Word, Excel, and more.
  4 | //!
  5 | //! This FFI layer is meticulously designed for safety and performance, featuring:
  6 | //! - Opaque pointers to prevent unsafe access to internal data structures.
  7 | //! - A robust, thread-safe error handling mechanism with on-demand debug info.
  8 | //! - A clear memory ownership model with explicit `_new` and `_free` functions.
  9 | //!
 10 | //! ## Quick Start
 11 | //!
 12 | //! ```c
 13 | //! // 1. Create an extractor instance.
 14 | //! CExtractor* extractor = extractous_extractor_new();
 15 | //!
 16 | //! // 2. Configure the extractor. Setters modify the object in-place.
 17 | //! //    DO NOT re-assign the pointer.
 18 | //! extractous_extractor_set_xml_output(extractor, true);
 19 | //!
 20 | //! // 3. Extract content and metadata from a file.
 21 | //! char* content = NULL;
 22 | //! CMetadata* metadata = NULL;
 23 | //! int result = extractous_extractor_extract_file_to_string(
 24 | //!     extractor, "document.pdf", &content, &metadata
 25 | //! );
 26 | //!
 27 | //! // 4. Check for errors and handle them.
 28 | //! if (result != ERR_OK) {
 29 | //!     // Handle the error (see Error Handling section).
 30 | //!     fprintf(stderr, "Extraction failed with code: %d\n", result);
 31 | //! } else {
 32 | //!     // 5. Use the results.
 33 | //!     printf("Content: %s\n", content);
 34 | //! }
 35 | //!
 36 | //! // 6. Clean up all allocated resources in reverse order.
 37 | //! extractous_string_free(content);
 38 | //! extractous_metadata_free(metadata);
 39 | //! extractous_extractor_free(extractor);
 40 | //! ```
 41 | //!
 42 | //! ## Thread Safety
 43 | //!
 44 | //! - **Extractor Instances**: `CExtractor` and its associated config/stream objects are
 45 | //!   **NOT thread-safe**. Do not share a handle across threads. The recommended pattern is
 46 | //!   to create one `CExtractor` instance per thread that needs it.
 47 | //! - **Error Handling**: The error reporting system **IS thread-safe**. Each thread stores
 48 | //!   its own last error information independently, preventing race conditions. You can safely
 49 | //!   call error-handling functions from any thread.
 50 | //!
 51 | //! # Advanced Error Handling
 52 | //!
 53 | //! This library uses a powerful two-tier error system for maximum performance and diagnostics.
 54 | //!
 55 | //! ### Tier 1: Fast Path (Error Codes)
 56 | //!
 57 | //! Fallible FFI functions return an integer error code. `ERR_OK` (0) signifies success. This allows
 58 | //! for a very fast check without any overhead.
 59 | //!
 60 | //! ### Tier 2: Slow Path (On-Demand Detailed Info)
 61 | //!
 62 | //! When an error occurs, you can request more information on demand.
 63 | //!
 64 | //! **1. Get the Error Category:**
 65 | //! Use `extractous_error_category()` to get a stable, machine-readable string
 66 | //! representing the *type* of error. This is perfect for building idiomatic Go error wrappers.
 67 | //! The returned pointer is static and **must not be freed**.
 68 | //!
 69 | //! **2. Get a Simple Message:**
 70 | //! Use `extractous_error_message()` to get a simple, human-readable description.
 71 | //! The returned string **must be freed** with `extractous_string_free()`.
 72 | //!
 73 | //! **3. Get a Full Debug Report:**
 74 | //! If `extractous_error_has_debug()` returns `1`, you can call `extractous_error_get_last_debug()`
 75 | //! to get a detailed report, including the full error chain and a backtrace (if enabled with `RUST_BACKTRACE=1`).
 76 | //! The returned string **must be freed**.
 77 | //!
 78 | //! ### Go Usage Pattern
 79 | //!
 80 | //! ```go
 81 | //! // (Inside a function that calls the FFI)
 82 | //! resultCode := C.some_extractous_function(...)
 83 | //! if resultCode != C.ERR_OK {
 84 | //!     // Get stable category for idiomatic error wrapping.
 85 | //!     category := C.GoString(C.extractous_error_category(resultCode))
 86 | //!
 87 | //!     // Get the simple message for the error string.
 88 | //!     msgCStr := C.extractous_error_message(resultCode)
 89 | //!     defer C.extractous_string_free(msgCStr)
 90 | //!     message := C.GoString(msgCStr)
 91 | //!
 92 | //!     var baseError error
 93 | //!     switch category {
 94 | //!     case "io_error": baseError = ErrIO
 95 | //!     default: baseError = ErrUnknown
 96 | //!     }
 97 | //!
 98 | //!     // Optionally log the full debug info for developers.
 99 | //!     if C.extractous_error_has_debug() != 0 {
100 | //!         debugCStr := C.extractous_error_get_last_debug()
101 | //!         defer C.extractous_string_free(debugCStr)
102 | //!         log.Printf("Full debug details: %s", C.GoString(debugCStr))
103 | //!     }
104 | //!
105 | //!     return fmt.Errorf("%w: %s", baseError, message)
106 | //! }
107 | //! ```
108 | #![warn(clippy::all)]
109 | #![allow(clippy::missing_safety_doc)]
110 | 
111 | // Re-export the core library under a short, consistent alias (`ecore`).
112 | pub use extractous as ecore;
113 | 
114 | // Module declarations.
115 | mod config;
116 | mod errors;
117 | mod extractor;
118 | mod metadata;
119 | mod stream;
120 | mod types;
121 | 
122 | // Publicly re-export all FFI-safe functions and types for C header generation.
123 | pub use config::*;
124 | pub use errors::*;
125 | pub use extractor::*;
126 | pub use metadata::*;
127 | pub use stream::*;
128 | pub use types::*;
129 | 
130 | /// Reports the version of this FFI wrapper crate as a NUL-terminated UTF-8 C string.
131 | /// The pointer refers to static data compiled into the library; callers must never free it.
132 | #[unsafe(no_mangle)]
133 | pub extern "C" fn extractous_ffi_version() -> *const libc::c_char {
134 |     // The crate version is captured at compile time with an explicit trailing NUL appended.
135 |     const VERSION_WITH_NUL: &str = concat!(env!("CARGO_PKG_VERSION"), "\0");
136 |     VERSION_WITH_NUL.as_ptr().cast::<libc::c_char>()
137 | }
138 | 
139 | /// Reports the version of the underlying Extractous core library as a NUL-terminated C string.
140 | /// The value is a literal compiled into this crate; the pointer is static and must not be freed.
141 | #[unsafe(no_mangle)]
142 | pub extern "C" fn extractous_core_version() -> *const libc::c_char {
143 |     const CORE_VERSION: &std::ffi::CStr = c"0.3.0";
144 |     CORE_VERSION.as_ptr()
145 | }
146 | 


--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
  1 | name: Release
  2 | 
  3 | on:
  4 |   push:
  5 |     tags:
  6 |       - 'v*.*.*'  # Trigger on semantic version tags (v1.0.0, v0.2.1, etc.)
  7 |   workflow_dispatch:
  8 |     inputs:
  9 |       tag_name:
 10 |         description: 'Release tag name (e.g., v1.0.0)'
 11 |         required: true
 12 |         type: string
 13 | 
 14 | # Prevent concurrent releases
 15 | concurrency:
 16 |   group: release-${{ github.ref }}
 17 |   cancel-in-progress: false
 18 | 
 19 | permissions:
 20 |   contents: write
 21 |   actions: read
 22 | 
 23 | jobs:
 24 |   # Build artifacts for all platforms
 25 |   build:
 26 |     name: Build Release Artifacts
 27 |     uses: ./.github/workflows/build.yml
 28 |     secrets: inherit
 29 | 
 30 |   # Create GitHub Release with built artifacts
 31 |   create-release:
 32 |     name: Create GitHub Release
 33 |     runs-on: ubuntu-latest
 34 |     needs: [build]
 35 |     
 36 |     steps:
 37 |       - name: Checkout repository
 38 |         uses: actions/checkout@v4
 39 |       
 40 |       - name: Determine tag name
 41 |         id: tag
 42 |         shell: bash
 43 |         run: |
 44 |           if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
 45 |             echo "tag_name=${{ github.event.inputs.tag_name }}" >> $GITHUB_OUTPUT
 46 |           else
 47 |             echo "tag_name=${{ github.ref_name }}" >> $GITHUB_OUTPUT
 48 |           fi
 49 |       
 50 |       - name: Download all artifacts
 51 |         uses: actions/download-artifact@v4
 52 |         with:
 53 |           path: artifacts
 54 |       
 55 |       - name: List downloaded artifacts
 56 |         shell: bash
 57 |         run: |
 58 |           echo "Downloaded artifacts:"
 59 |           ls -R artifacts
 60 |       
 61 |       - name: Package per-platform archives
 62 |         id: package
 63 |         shell: bash
 64 |         run: |
 65 |           set -e
 66 |           mkdir -p release
 67 |           
 68 |           echo "Creating per-platform release archives..."
 69 |           
 70 |           # Package each platform separately
 71 |           for platform_dir in artifacts/extractous-ffi-*; do
 72 |             if [ -d "$platform_dir" ]; then
 73 |               platform=$(basename "$platform_dir" | sed 's/extractous-ffi-//')
 74 |               
 75 |               echo "Packaging $platform..."
 76 |               
 77 |               # Determine archive extension based on platform
 78 |               if [[ "$platform" == windows_* ]]; then
 79 |                 archive_name="extractous-ffi-${platform}.zip"
 80 |                 (cd "$platform_dir" && zip -r "../../release/$archive_name" .)
 81 |               else
 82 |                 archive_name="extractous-ffi-${platform}.tar.gz"
 83 |                 tar -czf "release/$archive_name" -C "$platform_dir" .
 84 |               fi
 85 |               
 86 |               # Generate checksum
 87 |               (cd release && sha256sum "$archive_name" > "${archive_name}.sha256")
 88 |               
 89 |               echo "  ✓ Created $archive_name"
 90 |             fi
 91 |           done
 92 |           
 93 |           echo ""
 94 |           echo "Release assets:"
 95 |           ls -lh release/
 96 |       
 97 |       - name: Generate release notes
 98 |         id: release-notes
 99 |         shell: bash
100 |         run: |
101 |           TAG_NAME="${{ steps.tag.outputs.tag_name }}"
102 |           # The heredoc is quoted so Markdown backticks stay literal; __BUILD_DATE__ is filled in afterwards.
103 |           cat > release_notes.md <<'EOF'
104 |           # Extractous Go FFI ${{ steps.tag.outputs.tag_name }}
105 |           
106 |           This release contains the native library binaries for extractous-go.
107 |           
108 |           ## Installation
109 |           
110 |           Use the installation command from your Go project:
111 |           
112 |           ```bash
113 |           go run github.com/rahulpoonia29/extractous-go/cmd/install@latest
114 |           ```
115 |           
116 |           The installer will automatically download the correct platform libraries for you.
117 |           
118 |           ## Available Platforms
119 |           
120 |           - **linux_amd64**: Linux x86_64
121 |           - **windows_amd64**: Windows x86_64
122 |           - **darwin_arm64**: macOS Apple Silicon (M1/M2)
123 |           
124 |           ## Manual Installation
125 |           
126 |           If you prefer to install manually:
127 |           
128 |           1. Download the archive for your platform
129 |           2. Extract it to `./native/{platform}/` in your project
130 |           3. Verify the checksum using the `.sha256` file
131 |           
132 |           ## Archive Contents
133 |           
134 |           Each archive contains:
135 |           - `include/extractous.h` - C header file
136 |           - `lib/` - Platform-specific shared libraries
137 |           
138 |           ## Build Information
139 |           
140 |           - Rust version: 1.90.0
141 |           - Built with GraalVM Native Image
142 |           - Build date: __BUILD_DATE__
143 |           
144 |           ---
145 |           
146 |           **Note:** This is a draft release. Please review and edit these notes before publishing.
147 |           EOF
148 |           sed -i "s/__BUILD_DATE__/$(date -u +%Y-%m-%d)/" release_notes.md
149 |           cat release_notes.md
150 |       
151 |       - name: Create GitHub Release (Draft)
152 |         uses: softprops/action-gh-release@v2
153 |         with:
154 |           tag_name: ${{ steps.tag.outputs.tag_name }}
155 |           name: Release ${{ steps.tag.outputs.tag_name }}
156 |           body_path: release_notes.md
157 |           draft: true
158 |           prerelease: ${{ contains(steps.tag.outputs.tag_name, '-') }}
159 |           files: release/*
160 |           fail_on_unmatched_files: true
161 |         env:
162 |           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
163 |       
164 |       - name: Release summary
165 |         shell: bash
166 |         run: |
167 |           TAG_NAME="${{ steps.tag.outputs.tag_name }}"
168 |           
169 |           echo "## 📦 Draft Release Created: ${TAG_NAME}" >> $GITHUB_STEP_SUMMARY
170 |           echo "" >> $GITHUB_STEP_SUMMARY
171 |           echo "The release has been created as a **draft**. Please review it before publishing:" >> $GITHUB_STEP_SUMMARY
172 |           echo "" >> $GITHUB_STEP_SUMMARY
173 |           echo "https://github.com/${{ github.repository }}/releases/tag/${TAG_NAME}" >> $GITHUB_STEP_SUMMARY
174 |           echo "" >> $GITHUB_STEP_SUMMARY
175 |           echo "### Release Assets" >> $GITHUB_STEP_SUMMARY
176 |           echo "" >> $GITHUB_STEP_SUMMARY
177 |           
178 |           for file in release/*.{tar.gz,zip}; do
179 |             if [ -f "$file" ]; then
180 |               size=$(du -h "$file" | cut -f1)
181 |               filename=$(basename "$file")
182 |               echo "- \`$filename\` ($size)" >> $GITHUB_STEP_SUMMARY
183 |             fi
184 |           done
185 | 


--------------------------------------------------------------------------------
/.github/workflows/scripts/collect-libs.sh:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env bash
  2 | set -e
  3 | 
  4 | # Collect all native libraries for distribution
  5 | # Usage: ./scripts/collect-libs.sh <platform> <target> <lib_ext>
  6 | 
  7 | PLATFORM=$1 # e.g., linux_amd64
  8 | TARGET=$2   # e.g., x86_64-unknown-linux-gnu
  9 | LIB_EXT=$3  # e.g., so, dll, dylib
 10 | 
 11 | echo "=== Collecting Libraries ==="
 12 | echo "Platform: $PLATFORM | Target: $TARGET | Extension: $LIB_EXT"
 13 | 
 14 | # Create distribution structure
 15 | DIST_DIR="dist/$PLATFORM"
 16 | mkdir -p "$DIST_DIR"
 17 | # Determine OS-specific naming
 18 | if [[ "$PLATFORM" == *"windows"* ]]; then
 19 | 	OS="Windows"
 20 | 	MAIN_LIB_PREFIX=""
 21 | 	TIKA_LIB_PREFIX=""
 22 | elif [[ "$PLATFORM" == *"darwin"* ]]; then
 23 | 	OS="macOS"
 24 | 	MAIN_LIB_PREFIX="lib"
 25 | 	TIKA_LIB_PREFIX="lib"
 26 | else
 27 | 	OS="Linux"
 28 | 	MAIN_LIB_PREFIX="lib"
 29 | 	TIKA_LIB_PREFIX="lib"
 30 | fi
 31 | 
 32 | # 1. Find and copy main FFI library
 33 | RELEASE_DIR="ffi/target/$TARGET/release"
 34 | MAIN_LIB="$RELEASE_DIR/${MAIN_LIB_PREFIX}extractous_ffi.$LIB_EXT"
 35 | 
 36 | if [ ! -f "$MAIN_LIB" ]; then
 37 | 	echo "✗ Error: Main FFI library not found: $MAIN_LIB"
 38 | 	exit 1
 39 | fi
 40 | 
 41 | echo "✓ Found main FFI library: $MAIN_LIB"
 42 | cp "$MAIN_LIB" "$DIST_DIR"
 43 | 
 44 | # 2. Find extractous build output directory
 45 | # Look for the canonical libs directory created by extractous build.rs
 46 | echo ""
 47 | echo "Searching for extractous dependencies..."
 48 | 
 49 | BUILD_BASE="$RELEASE_DIR/build"
 50 | 
 51 | # Newest extractous-*/out/libs directory by mtime; -f2- keeps paths that contain spaces intact
 52 | LIBS_DIR=$(find "$BUILD_BASE" -type d -path "*/extractous-*/out/libs" -printf "%T@ %p\n" 2>/dev/null |
 53 | 	sort -rn |
 54 | 	head -1 |
 55 | 	cut -d' ' -f2-)
 56 | 
 57 | # Fallback for macOS (BSD find has no -printf support)
 58 | if [ -z "$LIBS_DIR" ]; then
 59 | 	# "|| true" keeps set -e from aborting here so the diagnostics further down can still run
 60 | 	FOUND_DIRS=$(find "$BUILD_BASE" -type d -path "*/extractous-*/out/libs" 2>/dev/null || true)
 61 | 
 62 | 	if [ -n "$FOUND_DIRS" ]; then
 63 | 		# Prefix each directory with its mtime (BSD stat syntax) and keep the newest
 64 | 		LIBS_DIR=$(echo "$FOUND_DIRS" | while IFS= read -r dir; do
 65 | 			echo "$(stat -f "%m" "$dir") $dir"
 66 | 		done | sort -rn | head -1 | cut -d' ' -f2-)
 67 | 	fi
 68 | fi
 69 | 
 70 | if [ -z "$LIBS_DIR" ]; then
 71 | 	echo "✗ Error: Could not find extractous out/libs directory"
 72 | 	echo "Searched in: $BUILD_BASE/extractous-*/out/libs"
 73 | 	echo ""
 74 | 	echo "Available build directories:"
 75 | 	find "$BUILD_BASE" -type d -name "extractous-*" 2>/dev/null || echo "None found"
 76 | 	echo ""
 77 | 	echo "Checking for out directories:"
 78 | 	find "$BUILD_BASE" -type d -name "out" 2>/dev/null || echo "None found"
 79 | 	exit 1
 80 | fi
 81 | 
 82 | echo "✓ Found libs directory: $LIBS_DIR"
 83 | 
 84 | # 3. Verify libtika_native exists
 85 | # Try both with and without prefix for Windows compatibility
 86 | TIKA_LIB="$LIBS_DIR/${TIKA_LIB_PREFIX}tika_native.$LIB_EXT"
 87 | if [ ! -f "$TIKA_LIB" ] && [ "$OS" = "Windows" ]; then
 88 | 	# Try with lib prefix on Windows as fallback
 89 | 	TIKA_LIB_ALT="$LIBS_DIR/libtika_native.$LIB_EXT"
 90 | 	if [ -f "$TIKA_LIB_ALT" ]; then
 91 | 		TIKA_LIB="$TIKA_LIB_ALT"
 92 | 	fi
 93 | fi
 94 | 
 95 | if [ ! -f "$TIKA_LIB" ]; then
 96 | 	echo "✗ Error: tika_native.$LIB_EXT not found in $LIBS_DIR"
 97 | 	echo "Directory contents:"
 98 | 	ls -lh "$LIBS_DIR" || echo "Directory not accessible"
 99 | 
100 | 	# Show all DLL/SO/DYLIB files to help debug
101 | 	echo ""
102 | 	echo "All native libraries found:"
103 | 	find "$LIBS_DIR" -name "*.$LIB_EXT" -o -name "*.dll" -o -name "*.so" -o -name "*.dylib" 2>/dev/null || echo "None found"
104 | 	exit 1
105 | fi
106 | 
107 | echo "✓ Found libtika_native: $TIKA_LIB"
108 | 
109 | # 4. Copy ALL libraries from out/libs/
110 | # These are all required dependencies bundled by GraalVM
111 | echo ""
112 | echo "Copying all native dependencies..."
113 | cp "$LIBS_DIR"/*."$LIB_EXT" "$DIST_DIR" || {
114 | 	echo "✗ Error: Failed to copy libraries"
115 | 	exit 1
116 | }
117 | 
118 | # 5. Patch libextractous_ffi on macOS: replace the absolute libtika_native path with @loader_path
119 | # https://github.com/rahulpoonia29/extractous-go/issues/5
120 | if [ "$OS" = "macOS" ]; then
121 | 	echo ""
122 | 	echo "Verify XCode tools"    
123 | 	# XCode tools are present on github macOS runners by default, but verify anyway
124 | 	command -v otool || { echo "✗ otool not found"; exit 1; }
125 | 	command -v install_name_tool || { echo "✗ install_name_tool not found"; exit 1; }
126 | 	otool -L "$DIST_DIR/libextractous_ffi.dylib" || { echo "✗ otool test failed"; exit 1; }
127 | 
128 | 	echo "Patching libextractous_ffi.dylib to use @loader_path for tika"
129 | 	OLD_PATH=$(otool -L "$DIST_DIR/libextractous_ffi.dylib" | grep libtika_native.dylib | awk '{print $1}')
130 | 	echo "  Old tika_native path: $OLD_PATH"
131 | 	[ -n "$OLD_PATH" ] || { echo "✗ libtika_native.dylib reference not found in libextractous_ffi.dylib"; exit 1; }
132 | 	# Replace with @loader_path (directory of the main library)
133 | 	install_name_tool -change "$OLD_PATH" "@loader_path/libtika_native.dylib" "$DIST_DIR/libextractous_ffi.dylib"
134 | 
135 | 	echo "Debug: New Path"
136 | 	echo "!! Should be @loader_path/libtika_native.dylib"
137 | 	otool -L "$DIST_DIR/libextractous_ffi.dylib" | sed 's/^/    /'
138 | fi
139 | 
140 | # Count copied libraries
141 | LIB_COUNT=$(find "$DIST_DIR" -name "*.$LIB_EXT" | wc -l)
142 | echo "✓ Copied $LIB_COUNT libraries"
143 | 
144 | # 6. Display distribution contents
145 | echo ""
146 | echo "=== Distribution Contents ==="
147 | echo "Libraries ($LIB_COUNT total):"
148 | ls -lh "$DIST_DIR" | tail -n +2
149 | 
150 | if [ -d "$DIST_DIR/include" ] && [ -n "$(ls -A "$DIST_DIR/include" 2>/dev/null)" ]; then
151 | 	echo ""
152 | 	echo "Headers:"
153 | 	ls -lh "$DIST_DIR/include/"
154 | fi
155 | 
156 | # 7. Verify dependencies (platform-specific); every library copied into $DIST_DIR is inspected
157 | echo ""
158 | echo "=== Dependency Verification ==="
159 | 
160 | if [ "$OS" = "Linux" ]; then
161 | 	echo "Checking RPATH configuration..."
162 | 	for lib in "$DIST_DIR"/*."$LIB_EXT"; do
163 | 		LIB_NAME=$(basename "$lib")
164 | 		echo "  $LIB_NAME:"
165 | 		readelf -d "$lib" | grep -E "RPATH|RUNPATH" | sed 's/^/    /' || echo "    No RPATH set"
166 | 	done
167 | 
168 | 	echo ""
169 | 	echo "Checking main FFI dependencies..."
170 | 	ldd "$DIST_DIR/${MAIN_LIB_PREFIX}extractous_ffi.$LIB_EXT" || true
171 | 
172 | elif [ "$OS" = "macOS" ]; then
173 | 	echo "Checking install names..."
174 | 	for lib in "$DIST_DIR"/*."$LIB_EXT"; do
175 | 		LIB_NAME=$(basename "$lib")
176 | 		echo "  $LIB_NAME:"
177 | 		otool -L "$lib" | grep -v "$LIB_NAME:" | sed 's/^/    /'
178 | 	done
179 | 
180 | elif [ "$OS" = "Windows" ]; then
181 | 	echo "Windows DLL validation..."
182 | 	file "$DIST_DIR"/*.dll 2>/dev/null || echo "  DLLs present"
183 | fi
184 | 
185 | # 8. Calculate total size
186 | echo ""
187 | echo "=== Distribution Size ==="
188 | echo "Total library size: $(du -sh "$DIST_DIR" | cut -f1)"
189 | echo "Distribution complete: $DIST_DIR"
190 | 


--------------------------------------------------------------------------------
/.github/workflows/build.yml:
--------------------------------------------------------------------------------
  1 | name: Build Cross-Platform
  2 | 
  3 | on:
  4 |   push:
  5 |     branches: [main]
  6 |     paths:
  7 |       - "ffi/**"
  8 |       - ".github/workflows/build.yml"
  9 |       - ".github/workflows/scripts/**"
 10 |   pull_request:
 11 |     branches: [main]
 12 |     paths:
 13 |       - "ffi/**"
 14 |       - ".github/workflows/build.yml"
 15 |       - ".github/workflows/scripts/**"
 16 |   workflow_dispatch:
 17 |   workflow_call:
 18 |     outputs:
 19 |       run_id:
 20 |         description: "The run ID of this workflow"
 21 |         value: ${{ github.run_id }}
 22 | 
 23 | concurrency:
 24 |   group: ${{ github.workflow }}-${{ github.ref }}
 25 |   cancel-in-progress: true
 26 | 
 27 | env:
 28 |   RUST_VERSION: "1.90.0"
 29 |   CARGO_TERM_COLOR: always
 30 | 
 31 | jobs:
 32 |   build:
 33 |     name: Build ${{ matrix.platform }}
 34 |     runs-on: ${{ matrix.os }}
 35 |     strategy:
 36 |       fail-fast: false
 37 |       matrix:
 38 |         include:
 39 |           - os: ubuntu-latest
 40 |             platform: linux_amd64
 41 |             target: x86_64-unknown-linux-gnu
 42 |             lib_ext: so
 43 | 
 44 |           - os: windows-latest
 45 |             platform: windows_amd64
 46 |             target: x86_64-pc-windows-msvc
 47 |             lib_ext: dll
 48 | 
 49 |           - os: macos-latest
 50 |             platform: darwin_arm64
 51 |             target: aarch64-apple-darwin
 52 |             lib_ext: dylib
 53 | 
 54 |     steps:
 55 |       - name: Checkout
 56 |         uses: actions/checkout@v4
 57 | 
 58 |       # Platform Setup
 59 |       - name: Setup MSVC (Windows)
 60 |         if: runner.os == 'Windows'
 61 |         uses: ilammy/msvc-dev-cmd@v1
 62 |         with:
 63 |           arch: x64
 64 | 
 65 |       - name: Install build dependencies (Linux)
 66 |         if: runner.os == 'Linux'
 67 |         run: |
 68 |           sudo apt-get update
 69 |           sudo apt-get install -y build-essential pkg-config libssl-dev tree
 70 | 
 71 |       # GraalVM Setup - Platform specific
 72 |       # https://github.com/oracle/graal/issues/4921
 73 |       - name: Setup GraalVM (macOS)
 74 |         if: runner.os == 'macOS'
 75 |         uses: graalvm/setup-graalvm@v1
 76 |         with:
 77 |           java-version: "23"
 78 |           distribution: "liberica"  # Use Liberica NIK for macOS for AWT metadata support
 79 |           github-token: ${{ secrets.GITHUB_TOKEN }}
 80 |           native-image-job-reports: "false"
 81 |           set-java-home: "true"
 82 | 
 83 |       - name: Setup GraalVM (Linux/Windows)
 84 |         if: runner.os != 'macOS'
 85 |         uses: graalvm/setup-graalvm@v1
 86 |         with:
 87 |           java-version: "23"
 88 |           distribution: "graalvm-community"  # Keep GraalVM CE for Linux/Windows
 89 |           github-token: ${{ secrets.GITHUB_TOKEN }}
 90 |           native-image-job-reports: "false"
 91 |           set-java-home: "true"
 92 | 
 93 |       - name: Verify GraalVM
 94 |         shell: bash
 95 |         run: |
 96 |           if [ "${{ runner.os }}" = "Windows" ]; then
 97 |             # On Windows, convert path and use .cmd extension
 98 |             GRAALVM_HOME_UNIX=$(cygpath "$GRAALVM_HOME")
 99 |             JAVA_HOME_UNIX=$(cygpath "$JAVA_HOME")
100 |             echo "GRAALVM_HOME (Windows): $GRAALVM_HOME"
101 |             echo "GRAALVM_HOME (Unix): $GRAALVM_HOME_UNIX"
102 |             echo "JAVA_HOME (Windows): $JAVA_HOME"
103 |             echo "JAVA_HOME (Unix): $JAVA_HOME_UNIX"
104 |             
105 |             # Call native-image with Windows path
106 |             "$GRAALVM_HOME/bin/native-image.cmd" --version
107 |           else
108 |             # Unix systems work normally
109 |             echo "GRAALVM_HOME: $GRAALVM_HOME"
110 |             echo "JAVA_HOME: $JAVA_HOME"
111 |             java --version
112 |             native-image --version
113 |           fi
114 | 
115 |       # Rust Setup
116 |       - name: Setup Rust
117 |         uses: dtolnay/rust-toolchain@stable
118 |         with:
119 |           toolchain: ${{ env.RUST_VERSION }}
120 |           targets: ${{ matrix.target }}
121 | 
122 |       # Aggressive Caching Strategy
123 |       - name: Cache Rust dependencies
124 |         uses: Swatinem/rust-cache@v2
125 |         with:
126 |           prefix-key: "v2-rust"
127 |           shared-key: ${{ matrix.target }}
128 |           workspaces: "./ffi"
129 |           cache-on-failure: true
130 |           save-if: ${{ github.ref == 'refs/heads/main' }}
131 | 
132 |       - name: Cache GraalVM Native Image
133 |         id: cache-graalvm
134 |         uses: actions/cache@v4
135 |         with:
136 |           path: |
137 |             ffi/target/${{ matrix.target }}/release/build/extractous-*/out
138 |           key: graalvm-v5-${{ runner.os }}-${{ matrix.target }}-${{ hashFiles('ffi/Cargo.toml') }}
139 |           restore-keys: |
140 |             graalvm-v5-${{ runner.os }}-${{ matrix.target }}-
141 | 
142 |       - name: Build FFI library (Unix)
143 |         if: runner.os != 'Windows'
144 |         working-directory: ./ffi
145 |         shell: bash
146 |         run: |
147 |           cargo build --release --target ${{ matrix.target }}
148 |         env:
149 |           RUST_BACKTRACE: 1
150 | 
151 |       # Build on Windows using PowerShell to avoid PATH issues with link.exe
152 |       - name: Build FFI library (Windows)
153 |         if: runner.os == 'Windows'
154 |         working-directory: ./ffi
155 |         shell: pwsh
156 |         run: |
157 |           cargo build --release --target ${{ matrix.target }}
158 |         env:
159 |           RUST_BACKTRACE: 1
160 | 
161 |       # Collect Dependencies
162 |       - name: Collect libraries
163 |         shell: bash
164 |         run: |
165 |           chmod +x .github/workflows/scripts/collect-libs.sh
166 |           .github/workflows/scripts/collect-libs.sh ${{ matrix.platform }} ${{ matrix.target }} ${{ matrix.lib_ext }}
167 | 
168 |       # Strip debug symbols; libraries sit directly in dist/<platform> (collect-libs.sh creates no lib/ subdirectory)
169 |       - name: Optimize libraries (Unix)
170 |         if: runner.os != 'Windows'
171 |         shell: bash
172 |         run: |
173 |           if [ "${{ runner.os }}" = "Linux" ]; then
174 |             find dist/${{ matrix.platform }} -name "*.${{ matrix.lib_ext }}" -exec strip --strip-debug {} \;
175 |           elif [ "${{ runner.os }}" = "macOS" ]; then
176 |             find dist/${{ matrix.platform }} -name "*.${{ matrix.lib_ext }}" -exec strip -x {} \;
177 |           fi
178 |           echo "Optimized size: $(du -sh dist/${{ matrix.platform }} | cut -f1)"
179 | 
180 |       # Upload Artifacts
181 |       - name: Upload artifacts
182 |         uses: actions/upload-artifact@v4
183 |         with:
184 |           name: extractous-ffi-${{ matrix.platform }}
185 |           path: dist/${{ matrix.platform }}
186 |           retention-days: 7
187 |           compression-level: 9
188 |           if-no-files-found: error
189 | 


--------------------------------------------------------------------------------
/stream.go:
--------------------------------------------------------------------------------
  1 | package extractous
  2 | 
  3 | /*
  4 | #cgo CFLAGS: -I${SRCDIR}/../include
  5 | 
  6 | #include <extractous.h>
  7 | #include <stdlib.h>
  8 | */
  9 | import "C"
 10 | import (
 11 | 	"io"
 12 | 	"runtime"
 13 | 	"unsafe"
 14 | )
 15 | 
 16 | // StreamReader implements io.Reader for streaming document content.
 17 | //
 18 | // StreamReader provides efficient streaming access to extracted document content,
 19 | // allowing you to process large documents without loading everything into memory.
 20 | // It implements the standard io.Reader interface and can be used with any Go code
 21 | // that works with readers.
 22 | //
 23 | // # Interface Compliance
 24 | //
 25 | // StreamReader implements:
 26 | //   - io.Reader: Read(p []byte) (n int, err error)
 27 | //   - io.Closer: Close() error
 28 | //
 29 | // This means it can be used with:
 30 | //   - io.Copy, io.ReadAll, io.ReadFull
 31 | //   - bufio.NewReader, bufio.NewScanner
 32 | //   - io.TeeReader, io.LimitReader
 33 | //   - Any function accepting io.Reader or io.ReadCloser
 34 | //
 35 | // # Resource Management
 36 | //
 37 | // StreamReaders must be closed when done to free underlying resources. While they
 38 | // use finalizers for automatic cleanup, calling Close() explicitly is strongly
 39 | // recommended:
 40 | //
 41 | //	reader, metadata, err := extractor.ExtractFile("document.pdf")
 42 | //	if err != nil {
 43 | //	    log.Fatal(err)
 44 | //	}
 45 | //	defer reader.Close() // Always close
 46 | //
 47 | // # Usage Patterns
 48 | //
 49 | // Copy to stdout:
 50 | //
 51 | //	reader, _, _ := extractor.ExtractFile("document.pdf")
 52 | //	defer reader.Close()
 53 | //	io.Copy(os.Stdout, reader)
 54 | //
 55 | // Read in chunks:
 56 | //
 57 | //	reader, _, _ := extractor.ExtractFile("document.pdf")
 58 | //	defer reader.Close()
 59 | //	buf := make([]byte, 4096)
 60 | //	for {
 61 | //	    n, err := reader.Read(buf)
 62 | //	    if err == io.EOF {
 63 | //	        break
 64 | //	    }
 65 | //	    if err != nil {
 66 | //	        log.Fatal(err)
 67 | //	    }
 68 | //	    process(buf[:n])
 69 | //	}
 70 | //
 71 | // Use with bufio.Scanner:
 72 | //
 73 | //	reader, _, _ := extractor.ExtractFile("document.pdf")
 74 | //	defer reader.Close()
 75 | //	scanner := bufio.NewScanner(reader)
 76 | //	for scanner.Scan() {
 77 | //	    line := scanner.Text()
 78 | //	    fmt.Println(line)
 79 | //	}
 80 | //
 81 | // Read all at once (for moderate-sized documents):
 82 | //
 83 | //	reader, _, _ := extractor.ExtractFile("document.pdf")
 84 | //	defer reader.Close()
 85 | //	content, err := io.ReadAll(reader)
 86 | //	if err != nil {
 87 | //	    log.Fatal(err)
 88 | //	}
 89 | //	fmt.Println(string(content))
 90 | //
 91 | // # Performance Considerations
 92 | //
 93 | // StreamReader is buffered at the FFI layer, so you don't need to wrap it with
 94 | // bufio.Reader for basic read operations. However, bufio.Scanner can still be
 95 | // useful for line-oriented processing.
 96 | //
 97 | // Typical buffer sizes:
 98 | //   - Small reads (< 512 bytes): May have overhead, prefer larger reads
 99 | //   - Medium reads (4KB - 64KB): Optimal for most use cases
100 | //   - Large reads (> 1MB): Generally no advantage over medium reads
101 | //
102 | // # Thread Safety
103 | //
104 | // StreamReader is NOT safe for concurrent use. Do not call Read() from multiple
105 | // goroutines simultaneously.
106 | type StreamReader struct {
107 | 	ptr    *C.struct_CStreamReader // native stream handle; reset to nil by Close
108 | 	closed bool                    // true once Close has released the handle
109 | }
110 | 
111 | // newStreamReader wraps a native stream handle in a StreamReader.
112 | //
113 | // The returned reader has a finalizer registered that invokes Close, so the
114 | // native handle is still released if the caller never closes it explicitly.
115 | //
116 | // A nil handle yields a nil *StreamReader.
117 | //
118 | // Internal use only.
119 | func newStreamReader(ptr *C.struct_CStreamReader) *StreamReader {
120 | 	if ptr == nil {
121 | 		return nil
122 | 	}
123 | 	sr := new(StreamReader)
124 | 	sr.ptr = ptr
125 | 	runtime.SetFinalizer(sr, (*StreamReader).Close)
126 | 	return sr
127 | }
128 | 
129 | // Read reads up to len(p) bytes into p.
130 | //
131 | // This implements the io.Reader interface. It reads extracted content from the
132 | // document into the provided byte slice.
133 | //
134 | // Parameters:
135 | //   - p: Byte slice to read into (must not be nil or empty for meaningful reads)
136 | //
137 | // Returns:
138 | //   - n: Number of bytes read (0 <= n <= len(p))
139 | //   - err: Error if read failed, or io.EOF when stream is exhausted
140 | //
141 | // # Behavior
142 | //
143 | // Read may return fewer bytes than requested (0 < n < len(p)) without error.
144 | // This is normal io.Reader behavior and does not indicate an error or EOF.
145 | //
146 | // Read returns io.EOF when no more data is available. After receiving io.EOF,
147 | // all subsequent calls will return (0, io.EOF).
148 | //
149 | // If the reader has been closed, Read returns (0, io.EOF).
150 | //
151 | // # Example
152 | //
153 | //	reader, _, err := extractor.ExtractFile("document.pdf")
154 | //	if err != nil {
155 | //	    log.Fatal(err)
156 | //	}
157 | //	defer reader.Close()
158 | //
159 | //	buf := make([]byte, 4096)
160 | //	for {
161 | //	    n, err := reader.Read(buf)
162 | //	    if n > 0 {
163 | //	        // Process buf[:n]
164 | //	        fmt.Print(string(buf[:n]))
165 | //	    }
166 | //	    if err == io.EOF {
167 | //	        break
168 | //	    }
169 | //	    if err != nil {
170 | //	        log.Fatal(err)
171 | //	    }
172 | //	}
173 | //
174 | // # Error Handling
175 | //
176 | // Errors other than io.EOF indicate actual read failures and should be handled:
177 | //
178 | //	n, err := reader.Read(buf)
179 | //	if err != nil && err != io.EOF {
180 | //	    log.Printf("Read error: %v", err)
181 | //	    return
182 | //	}
183 | func (r *StreamReader) Read(p []byte) (n int, err error) {
184 | 	if r.closed || r.ptr == nil {
185 | 		return 0, io.EOF
186 | 	}
187 | 	// A zero-length destination is a no-op; this also keeps &p[0] below in bounds.
188 | 	if len(p) == 0 {
189 | 		return 0, nil
190 | 	}
191 | 
192 | 	var bytesRead C.size_t
193 | 	code := C.extractous_stream_read(
194 | 		r.ptr,
195 | 		(*C.uint8_t)(unsafe.Pointer(&p[0])),
196 | 		C.size_t(len(p)),
197 | 		&bytesRead,
198 | 	)
199 | 	runtime.KeepAlive(r) // r must stay reachable until the C call returns, or its finalizer could free r.ptr mid-read
200 | 	if code != errOK {
201 | 		return 0, newError(code)
202 | 	}
203 | 	// A successful zero-byte read is treated as end of stream.
204 | 	if bytesRead == 0 {
205 | 		return 0, io.EOF
206 | 	}
207 | 
208 | 	return int(bytesRead), nil
209 | }
210 | 
211 | // Close closes the stream and releases underlying resources.
212 | //
213 | // This implements the io.Closer interface. After calling Close, the StreamReader
214 | // should not be used. Subsequent calls to Read will return (0, io.EOF).
215 | //
216 | // Calling Close multiple times is safe - subsequent calls are no-ops and return
217 | // nil.
218 | //
219 | // While StreamReaders use finalizers for automatic cleanup, calling Close
220 | // explicitly is strongly recommended for deterministic resource management,
221 | // especially when processing many documents.
222 | //
223 | // Returns:
224 | //   - Always returns nil (implements io.Closer)
225 | //
226 | // Example:
227 | //
228 | //	reader, _, err := extractor.ExtractFile("document.pdf")
229 | //	if err != nil {
230 | //	    log.Fatal(err)
231 | //	}
232 | //	defer reader.Close() // Ensure cleanup
233 | //
234 | //	// Use reader...
235 | //	io.Copy(os.Stdout, reader)
236 | //
237 | //	// Explicit close (defer will call again, which is safe)
238 | //	reader.Close()
239 | //
240 | // # Resource Management Best Practices
241 | //
242 | // Always use defer:
243 | //
244 | //	reader, _, err := extractor.ExtractFile("doc.pdf")
245 | //	if err != nil {
246 | //	    return err
247 | //	}
248 | //	defer reader.Close() // Cleanup even if function panics
249 | //
250 | // For long-running processes, close explicitly in loops:
251 | //
252 | //	for _, file := range files {
253 | //	    reader, _, err := extractor.ExtractFile(file)
254 | //	    if err != nil {
255 | //	        log.Printf("Error: %v", err)
256 | //	        continue
257 | //	    }
258 | //
259 | //	    processStream(reader)
260 | //	    reader.Close() // Don't wait for defer in loop
261 | //	}
262 | func (r *StreamReader) Close() error {
263 | 	if r.closed || r.ptr == nil {
264 | 		return nil
265 | 	}
266 | 	runtime.SetFinalizer(r, nil) // the handle is released right below, so the finalizer has nothing left to do
267 | 	C.extractous_stream_free(r.ptr)
268 | 	r.ptr = nil
269 | 	r.closed = true
270 | 	return nil
271 | }
272 | 


--------------------------------------------------------------------------------
/ffi/src/config.rs:
--------------------------------------------------------------------------------
  1 | use crate::ecore::{
  2 |     OfficeParserConfig as CoreOfficeConfig, PdfOcrStrategy, PdfParserConfig as CorePdfConfig,
  3 |     TesseractOcrConfig as CoreOcrConfig,
  4 | };
  5 | use crate::types::*;
  6 | use std::ffi::CStr;
  7 | use std::os::raw::c_char;
  8 | use std::ptr;
  9 | 
 10 | /// Macro to safely update a config instance behind a raw pointer.
 11 | macro_rules! update_config {
 12 |     ($handle:expr, $T:ty, |$config_val:ident| $body:block) => {
 13 |         if $handle.is_null() {
 14 |             return;
 15 |         }
 16 |         unsafe {
 17 |             let config_ptr = $handle as *mut $T;
 18 |             let old_config = ptr::read(config_ptr);
 19 |             let new_config = {
 20 |                 let $config_val = old_config;
 21 |                 $body
 22 |             };
 23 |             ptr::write(config_ptr, new_config);
 24 |         }
 25 |     };
 26 | }
 27 | 
 28 | /// Creates a new PDF parser configuration with default settings.
 29 | /// The returned handle must be freed with `extractous_pdf_config_free()`
 30 | /// unless passed to an extractor, which will take ownership.
 31 | // #[must_use]
 32 | #[unsafe(no_mangle)]
 33 | pub extern "C" fn extractous_pdf_config_new() -> *mut CPdfParserConfig {
 34 |     let config = Box::new(CorePdfConfig::new());
 35 |     Box::into_raw(config) as *mut CPdfParserConfig
 36 | }
 37 | 
 38 | /// Frees the memory associated with a PDF parser configuration.
 39 | /// Do not call this if the config has been attached to an extractor.
 40 | #[unsafe(no_mangle)]
 41 | pub unsafe extern "C" fn extractous_pdf_config_free(handle: *mut CPdfParserConfig) {
 42 |     if !handle.is_null() {
 43 |         drop(unsafe { Box::from_raw(handle as *mut CorePdfConfig) });
 44 |     }
 45 | }
 46 | 
 47 | /// Sets the OCR strategy for PDF parsing. Modifies the config in-place.
 48 | #[unsafe(no_mangle)]
 49 | pub unsafe extern "C" fn extractous_pdf_config_set_ocr_strategy(
 50 |     handle: *mut CPdfParserConfig,
 51 |     strategy: libc::c_int,
 52 | ) {
 53 |     let ocr_strategy = match strategy {
 54 |         PDF_OCR_STRATEGY_NO_OCR => PdfOcrStrategy::NO_OCR,
 55 |         PDF_OCR_STRATEGY_OCR_ONLY => PdfOcrStrategy::OCR_ONLY,
 56 |         PDF_OCR_STRATEGY_OCR_AND_TEXT_EXTRACTION => PdfOcrStrategy::OCR_AND_TEXT_EXTRACTION,
 57 |         PDF_OCR_STRATEGY_AUTO => PdfOcrStrategy::AUTO,
 58 |         _ => return, // Invalid strategy, do nothing.
 59 |     };
 60 |     update_config!(handle, CorePdfConfig, |config| {
 61 |         config.set_ocr_strategy(ocr_strategy)
 62 |     });
 63 | }
 64 | 
 65 | /// Enables or disables extraction of inline images. Modifies the config in-place.
 66 | #[unsafe(no_mangle)]
 67 | pub unsafe extern "C" fn extractous_pdf_config_set_extract_inline_images(
 68 |     handle: *mut CPdfParserConfig,
 69 |     value: bool,
 70 | ) {
 71 |     update_config!(handle, CorePdfConfig, |config| {
 72 |         config.set_extract_inline_images(value)
 73 |     });
 74 | }
 75 | 
 76 | /// If enabled, only unique inline images (by digest) will be extracted.
 77 | #[unsafe(no_mangle)]
 78 | pub unsafe extern "C" fn extractous_pdf_config_set_extract_unique_inline_images_only(
 79 |     handle: *mut CPdfParserConfig,
 80 |     value: bool,
 81 | ) {
 82 |     update_config!(handle, CorePdfConfig, |config| {
 83 |         config.set_extract_unique_inline_images_only(value)
 84 |     });
 85 | }
 86 | 
 87 | /// Enables or disables extraction of text from marked content sections.
 88 | #[unsafe(no_mangle)]
 89 | pub unsafe extern "C" fn extractous_pdf_config_set_extract_marked_content(
 90 |     handle: *mut CPdfParserConfig,
 91 |     value: bool,
 92 | ) {
 93 |     update_config!(handle, CorePdfConfig, |config| {
 94 |         config.set_extract_marked_content(value)
 95 |     });
 96 | }
 97 | 
 98 | /// Enables or disables extraction of text from annotations.
 99 | #[unsafe(no_mangle)]
100 | pub unsafe extern "C" fn extractous_pdf_config_set_extract_annotation_text(
101 |     handle: *mut CPdfParserConfig,
102 |     value: bool,
103 | ) {
104 |     update_config!(handle, CorePdfConfig, |config| {
105 |         config.set_extract_annotation_text(value)
106 |     });
107 | }
108 | 
109 | /// Creates a new Office parser configuration with default settings.
110 | // #[must_use]
111 | #[unsafe(no_mangle)]
112 | pub extern "C" fn extractous_office_config_new() -> *mut COfficeParserConfig {
113 |     let config = Box::new(CoreOfficeConfig::new());
114 |     Box::into_raw(config) as *mut COfficeParserConfig
115 | }
116 | 
117 | /// Frees the memory associated with an Office parser configuration.
118 | #[unsafe(no_mangle)]
119 | pub unsafe extern "C" fn extractous_office_config_free(handle: *mut COfficeParserConfig) {
120 |     if !handle.is_null() {
121 |         drop(unsafe { Box::from_raw(handle as *mut CoreOfficeConfig) });
122 |     }
123 | }
124 | 
125 | /// Enables or disables macro extraction. Modifies the config in-place.
126 | #[unsafe(no_mangle)]
127 | pub unsafe extern "C" fn extractous_office_config_set_extract_macros(
128 |     handle: *mut COfficeParserConfig,
129 |     value: bool,
130 | ) {
131 |     update_config!(handle, CoreOfficeConfig, |config| {
132 |         config.set_extract_macros(value)
133 |     });
134 | }
135 | 
136 | /// Enables or disables inclusion of deleted content (track changes).
137 | #[unsafe(no_mangle)]
138 | pub unsafe extern "C" fn extractous_office_config_set_include_deleted_content(
139 |     handle: *mut COfficeParserConfig,
140 |     value: bool,
141 | ) {
142 |     update_config!(handle, CoreOfficeConfig, |config| {
143 |         config.set_include_deleted_content(value)
144 |     });
145 | }
146 | 
147 | /// Enables or disables inclusion of moved-from content (track changes).
148 | #[unsafe(no_mangle)]
149 | pub unsafe extern "C" fn extractous_office_config_set_include_move_from_content(
150 |     handle: *mut COfficeParserConfig,
151 |     value: bool,
152 | ) {
153 |     update_config!(handle, CoreOfficeConfig, |config| {
154 |         config.set_include_move_from_content(value)
155 |     });
156 | }
157 | 
158 | /// Enables or disables inclusion of content from shapes.
159 | #[unsafe(no_mangle)]
160 | pub unsafe extern "C" fn extractous_office_config_set_include_shape_based_content(
161 |     handle: *mut COfficeParserConfig,
162 |     value: bool,
163 | ) {
164 |     update_config!(handle, CoreOfficeConfig, |config| {
165 |         config.set_include_shape_based_content(value)
166 |     });
167 | }
168 | 
169 | /// Creates a new Tesseract OCR configuration with default settings.
170 | // #[must_use]
171 | #[unsafe(no_mangle)]
172 | pub extern "C" fn extractous_ocr_config_new() -> *mut CTesseractOcrConfig {
173 |     let config = Box::new(CoreOcrConfig::new());
174 |     Box::into_raw(config) as *mut CTesseractOcrConfig
175 | }
176 | 
177 | /// Frees the memory associated with a Tesseract OCR configuration.
178 | #[unsafe(no_mangle)]
179 | pub unsafe extern "C" fn extractous_ocr_config_free(handle: *mut CTesseractOcrConfig) {
180 |     if !handle.is_null() {
181 |         drop(unsafe { Box::from_raw(handle as *mut CoreOcrConfig) });
182 |     }
183 | }
184 | 
185 | /// Sets the OCR language. Modifies the config in-place.
186 | #[unsafe(no_mangle)]
187 | pub unsafe extern "C" fn extractous_ocr_config_set_language(
188 |     handle: *mut CTesseractOcrConfig,
189 |     language: *const c_char,
190 | ) {
191 |     if language.is_null() {
192 |         return;
193 |     }
194 |     let lang_str = match unsafe { CStr::from_ptr(language).to_str() } {
195 |         Ok(s) => s,
196 |         Err(_) => return, // Invalid UTF-8, do nothing.
197 |     };
198 |     update_config!(handle, CoreOcrConfig, |config| {
199 |         config.set_language(lang_str)
200 |     });
201 | }
202 | 
203 | /// Sets the DPI for OCR processing. Modifies the config in-place.
204 | #[unsafe(no_mangle)]
205 | pub unsafe extern "C" fn extractous_ocr_config_set_density(
206 |     handle: *mut CTesseractOcrConfig,
207 |     density: i32,
208 | ) {
209 |     update_config!(handle, CoreOcrConfig, |config| {
210 |         config.set_density(density)
211 |     });
212 | }
213 | 
214 | /// Sets the bit depth for OCR processing.
215 | #[unsafe(no_mangle)]
216 | pub unsafe extern "C" fn extractous_ocr_config_set_depth(
217 |     handle: *mut CTesseractOcrConfig,
218 |     depth: i32,
219 | ) {
220 |     update_config!(handle, CoreOcrConfig, |config| { config.set_depth(depth) });
221 | }
222 | 
223 | /// Enables or disables image preprocessing for OCR.
224 | #[unsafe(no_mangle)]
225 | pub unsafe extern "C" fn extractous_ocr_config_set_enable_image_preprocessing(
226 |     handle: *mut CTesseractOcrConfig,
227 |     value: bool,
228 | ) {
229 |     update_config!(handle, CoreOcrConfig, |config| {
230 |         config.set_enable_image_preprocessing(value)
231 |     });
232 | }
233 | 
234 | /// Sets the timeout for the Tesseract process in seconds.
235 | #[unsafe(no_mangle)]
236 | pub unsafe extern "C" fn extractous_ocr_config_set_timeout_seconds(
237 |     handle: *mut CTesseractOcrConfig,
238 |     seconds: i32,
239 | ) {
240 |     update_config!(handle, CoreOcrConfig, |config| {
241 |         config.set_timeout_seconds(seconds)
242 |     });
243 | }
244 | 


--------------------------------------------------------------------------------
/metadata.go:
--------------------------------------------------------------------------------
  1 | package extractous
  2 | 
  3 | /*
  4 | #include <extractous.h>
  5 | #include <stdlib.h>
  6 | */
  7 | import "C"
  8 | import (
  9 | 	"runtime"
 10 | 	"strings"
 11 | 	"unsafe"
 12 | )
 13 | 
 14 | // Metadata represents document metadata as key-value pairs.
 15 | //
 16 | // Metadata contains information about a document such as author, title, creation
 17 | // date, modification date, and other document properties. Each metadata field can
 18 | // have multiple values, so values are stored as string slices.
 19 | //
 20 | // # Common Metadata Fields
 21 | //
 22 | // Different document formats provide different metadata fields. Common fields
 23 | // include:
 24 | //
 25 | //   - "title" - Document title
 26 | //   - "author" - Document author(s)
 27 | //   - "creator" - Application that created the document
 28 | //   - "producer" - Application that produced the PDF (for PDFs)
 29 | //   - "subject" - Document subject/description
 30 | //   - "keywords" - Document keywords
 31 | //   - "created" - Creation date/time
 32 | //   - "modified" - Last modification date/time
 33 | //   - "Content-Type" - MIME type of the document
 34 | //   - "dc:title" - Dublin Core title (some formats)
 35 | //   - "dc:creator" - Dublin Core creator (some formats)
 36 | //
 37 | // # Multi-valued Fields
 38 | //
 39 | // Some fields can have multiple values, particularly "author" and "keywords":
 40 | //
 41 | //	metadata := Metadata{
 42 | //	    "author": []string{"Alice", "Bob"},
 43 | //	    "keywords": []string{"report", "quarterly", "finance"},
 44 | //	}
 45 | //
 46 | // # Case Sensitivity
 47 | //
 48 | // Metadata keys are case-sensitive. Some formats use lowercase keys ("author"),
 49 | // others use mixed case ("Author" or "dc:creator"). Always check the actual keys
 50 | // returned from extraction.
 51 | //
 52 | // # Usage Examples
 53 | //
 54 | // Basic access:
 55 | //
 56 | //	content, metadata, err := extractor.ExtractFileToString("document.pdf")
 57 | //	if err != nil {
 58 | //	    log.Fatal(err)
 59 | //	}
 60 | //
 61 | //	// Get single value
 62 | //	title := metadata.Get("title")
 63 | //	author := metadata.Get("author")
 64 | //
 65 | //	// Get all values
 66 | //	allAuthors := metadata.GetAll("author")
 67 | //	for _, author := range allAuthors {
 68 | //	    fmt.Println("Author:", author)
 69 | //	}
 70 | //
 71 | //	// Check existence
 72 | //	if metadata.Has("keywords") {
 73 | //	    keywords := metadata.GetAll("keywords")
 74 | //	    fmt.Println("Keywords:", keywords)
 75 | //	}
 76 | //
 77 | // Iterate all metadata:
 78 | //
 79 | //	for _, key := range metadata.Keys() {
 80 | //	    values := metadata.GetAll(key)
 81 | //	    fmt.Printf("%s: %v\n", key, values)
 82 | //	}
 83 | //
// # Empty Metadata
//
// If a document has no metadata, an empty Metadata map is returned (not nil).
// The accessor methods (Get, GetAll, Has, Keys) only read the map, so they are
// always safe to call on an empty Metadata.
type Metadata map[string][]string
 89 | 
// metadataWrapper wraps C metadata for proper cleanup.
//
// This is an internal type used to manage the lifecycle of C metadata pointers.
// Its free method releases the underlying C structure through
// extractous_metadata_free; newMetadata registers that method as a finalizer so
// the release happens once the wrapper is garbage collected.
//
// Internal use only.
type metadataWrapper struct {
	// ptr is the C metadata structure owned by this wrapper. free resets it to
	// nil after releasing it, so a repeated call is a no-op.
	ptr *C.struct_CMetadata
}
100 | 
101 | // newMetadata converts C metadata to Go and sets up cleanup.
102 | //
103 | // This function:
104 | //  1. Converts C metadata structure to Go map
105 | //  2. Sets up a finalizer to free C resources
106 | //  3. Splits comma-separated values into slices
107 | //  4. Trims whitespace from values
108 | //
109 | // Returns an empty Metadata map if the C pointer is nil.
110 | //
111 | // Internal use only.
112 | func newMetadata(cMeta *C.struct_CMetadata) Metadata {
113 | 	if cMeta == nil {
114 | 		return make(Metadata)
115 | 	}
116 | 
117 | 	// Create wrapper for proper cleanup
118 | 	wrapper := &metadataWrapper{ptr: cMeta}
119 | 	runtime.SetFinalizer(wrapper, (*metadataWrapper).free)
120 | 
121 | 	// Convert to Go map
122 | 	result := make(Metadata, int(cMeta.len))
123 | 
124 | 	if cMeta.len == 0 {
125 | 		return result
126 | 	}
127 | 
128 | 	// Convert C arrays to Go slices
129 | 	keys := unsafe.Slice(cMeta.keys, cMeta.len)
130 | 	values := unsafe.Slice(cMeta.values, cMeta.len)
131 | 
132 | 	for i := 0; i < int(cMeta.len); i++ {
133 | 		key := C.GoString(keys[i])
134 | 		value := C.GoString(values[i])
135 | 
136 | 		// Values are comma-separated in C, split them into individual values
137 | 		valueSlice := strings.Split(value, ",")
138 | 		// Trim whitespace from each value
139 | 		for j := range valueSlice {
140 | 			valueSlice[j] = strings.TrimSpace(valueSlice[j])
141 | 		}
142 | 		result[key] = valueSlice
143 | 	}
144 | 
145 | 	return result
146 | }
147 | 
148 | // free releases C metadata resources.
149 | //
150 | // This is called automatically by the garbage collector via the finalizer.
151 | // Application code should not call this directly.
152 | //
153 | // Internal use only.
154 | func (m *metadataWrapper) free() {
155 | 	if m.ptr != nil {
156 | 		C.extractous_metadata_free(m.ptr)
157 | 		m.ptr = nil
158 | 	}
159 | }
160 | 
161 | // Get returns the first value for a metadata key.
162 | //
163 | // If the key exists and has one or more values, the first value is returned.
164 | // If the key doesn't exist or has no values, an empty string is returned.
165 | //
166 | // This is the most convenient method for metadata fields that typically have
167 | // a single value (like "title", "author", "created").
168 | //
169 | // Parameters:
170 | //   - key: Metadata field name (case-sensitive)
171 | //
172 | // Returns:
173 | //   - First value as a string, or "" if not found
174 | //
175 | // Example:
176 | //
177 | //	title := metadata.Get("title")
178 | //	if title == "" {
179 | //	    fmt.Println("No title")
180 | //	} else {
181 | //	    fmt.Println("Title:", title)
182 | //	}
183 | //
184 | //	// For potentially multi-valued fields, Get returns first value
185 | //	firstAuthor := metadata.Get("author")
186 | func (m Metadata) Get(key string) string {
187 | 	if vals, ok := m[key]; ok && len(vals) > 0 {
188 | 		return vals[0]
189 | 	}
190 | 	return ""
191 | }
192 | 
193 | // GetAll returns all values for a metadata key.
194 | //
195 | // Some metadata fields can have multiple values (particularly "author" and
196 | // "keywords"). This method returns all values as a slice.
197 | //
198 | // If the key doesn't exist, nil is returned (not an empty slice).
199 | //
200 | // Parameters:
201 | //   - key: Metadata field name (case-sensitive)
202 | //
203 | // Returns:
204 | //   - Slice of all values, or nil if key not found
205 | //
206 | // Example:
207 | //
208 | //	// Get all authors
209 | //	authors := metadata.GetAll("author")
210 | //	if authors != nil {
211 | //	    for _, author := range authors {
212 | //	        fmt.Println("Author:", author)
213 | //	    }
214 | //	}
215 | //
216 | //	// Get all keywords
217 | //	keywords := metadata.GetAll("keywords")
218 | //	if keywords != nil {
219 | //	    fmt.Println("Keywords:", strings.Join(keywords, ", "))
220 | //	}
221 | //
222 | //	// Check for nil vs empty
223 | //	if metadata.GetAll("nonexistent") == nil {
224 | //	    fmt.Println("Key not found")
225 | //	}
226 | func (m Metadata) GetAll(key string) []string {
227 | 	return m[key]
228 | }
229 | 
230 | // Has checks if a metadata key exists.
231 | //
232 | // Returns true if the key exists in the metadata, even if it has empty values.
233 | // Returns false if the key doesn't exist.
234 | //
235 | // This is useful for distinguishing between a missing key and a key with an
236 | // empty value.
237 | //
238 | // Parameters:
239 | //   - key: Metadata field name (case-sensitive)
240 | //
241 | // Returns:
242 | //   - true if key exists, false otherwise
243 | //
244 | // Example:
245 | //
246 | //	if metadata.Has("author") {
247 | //	    author := metadata.Get("author")
248 | //	    if author == "" {
249 | //	        fmt.Println("Author field exists but is empty")
250 | //	    } else {
251 | //	        fmt.Println("Author:", author)
252 | //	    }
253 | //	} else {
254 | //	    fmt.Println("No author field")
255 | //	}
256 | //
257 | //	// Conditional processing
258 | //	if metadata.Has("keywords") {
259 | //	    processKeywords(metadata.GetAll("keywords"))
260 | //	}
261 | func (m Metadata) Has(key string) bool {
262 | 	_, ok := m[key]
263 | 	return ok
264 | }
265 | 
266 | // Keys returns all metadata keys.
267 | //
268 | // Returns a slice of all keys present in the metadata. The order is not
269 | // guaranteed and may vary between calls.
270 | //
271 | // This is useful for iterating over all metadata fields without knowing the
272 | // specific keys in advance.
273 | //
274 | // Returns:
275 | //   - Slice of all metadata keys (may be empty but never nil)
276 | //
277 | // Example:
278 | //
279 | //	// Print all metadata
280 | //	for _, key := range metadata.Keys() {
281 | //	    values := metadata.GetAll(key)
282 | //	    fmt.Printf("%s: %v\n", key, values)
283 | //	}
284 | //
285 | //	// Count metadata fields
286 | //	fmt.Printf("Found %d metadata fields\n", len(metadata.Keys()))
287 | //
288 | //	// Filter specific fields
289 | //	for _, key := range metadata.Keys() {
290 | //	    if strings.HasPrefix(key, "dc:") {
291 | //	        // Process Dublin Core fields
292 | //	        fmt.Printf("%s = %s\n", key, metadata.Get(key))
293 | //	    }
294 | //	}
295 | //
296 | // Note: The returned slice is a new allocation and can be modified without
297 | // affecting the Metadata.
298 | func (m Metadata) Keys() []string {
299 | 	keys := make([]string, 0, len(m))
300 | 	for k := range m {
301 | 		keys = append(keys, k)
302 | 	}
303 | 	return keys
304 | }
305 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Extractous Go
  2 | 
  3 | Go bindings for [Extractous](https://github.com/yobix-ai/extractous) - fast, high-performance, Rust-powered document extraction built on Apache Tika and Tesseract OCR.
  4 | 
  5 | [![Go Reference](https://pkg.go.dev/badge/github.com/rahulpoonia29/extractous-go.svg)](https://pkg.go.dev/github.com/rahulpoonia29/extractous-go)
  6 | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
  7 | [![Build Status](https://github.com/rahulpoonia29/extractous-go/actions/workflows/build.yml/badge.svg?branch=main)](https://github.com/rahulpoonia29/extractous-go/actions/workflows/build.yml)
  8 | 
  9 | ---
 10 | 
 11 | ## Features
 12 | 
 13 | - **High Performance**: Built on Rust for maximum throughput and minimal memory overhead
 14 | - **60+ File Formats**: PDF, Office documents (DOCX, XLSX, PPTX), HTML, XML, and more
 15 | - **OCR Support**: Extract text from scanned documents and images using Tesseract
 16 | - **Streaming API**: Process large files with minimal memory usage
 17 | 
 18 | ---
 19 | 
 20 | ## Installation
 21 | 
 22 | ### Step 1: Install the Go Package
 23 | 
 24 | ```bash
 25 | go get github.com/rahulpoonia29/extractous-go
 26 | ```
 27 | 
 28 | ### Step 2: Download Native Libraries
 29 | 
 30 | ```bash
 31 | # Download libraries for your current platform
 32 | go run github.com/rahulpoonia29/extractous-go/cmd/install@latest
 33 | 
 34 | # Download for a specific platform
 35 | go run github.com/rahulpoonia29/extractous-go/cmd/install@latest --platform linux-amd64
 36 | 
 37 | # Download for all platforms (useful for cross-compilation)
 38 | go run github.com/rahulpoonia29/extractous-go/cmd/install@latest --all
 39 | ```
 40 | 
 41 | This creates a `native/` directory containing the libraries for each platform you downloaded (only your current platform by default).
 42 | 
 43 | ---
 44 | 
 45 | ## Quick Start
 46 | 
 47 | ### Basic Text Extraction
 48 | 
 49 | ```go
 50 | package main
 51 | 
 52 | import (
 53 |     "fmt"
 54 |     "log"
 55 | 
 56 |     "github.com/rahulpoonia29/extractous-go"
 57 | )
 58 | 
 59 | func main() {
 60 |     // Create a new extractor
 61 |     extractor := extractous.New()
 62 |     if extractor == nil {
 63 |         log.Fatal("Failed to create extractor")
 64 |     }
 65 |     defer extractor.Close()
 66 | 
 67 |     // Extract text and metadata from file
 68 |     content, metadata, err := extractor.ExtractFileToString("document.pdf")
 69 |     if err != nil {
 70 |         log.Fatalf("Extraction failed: %v", err)
 71 |     }
 72 | 
 73 |     // Results
 74 |     fmt.Println("Content:", content)
 75 |     fmt.Println("Metadata:", metadata)
 76 | }
 77 | ```
 78 | 
 79 | ### Streaming Large Files
 80 | 
 81 | For memory-efficient processing of large documents:
 82 | 
 83 | ```go
 84 | package main
 85 | 
 86 | import (
 87 |     "fmt"
 88 |     "io"
 89 |     "log"
 90 | 
 91 |     "github.com/rahulpoonia29/extractous-go"
 92 | )
 93 | 
 94 | func main() {
 95 |     extractor := extractous.New()
 96 |     if extractor == nil {
 97 |         log.Fatal("Failed to create extractor")
 98 |     }
 99 |     defer extractor.Close()
100 | 
101 |     // Get a streaming reader for the document
102 |     reader, _, err := extractor.ExtractFile("large_document.pdf")
103 |     if err != nil {
104 |         log.Fatal(err)
105 |     }
106 |     defer reader.Close()
107 | 
108 |     // Process the document in chunks
109 |     buffer := make([]byte, 8192)
110 |     for {
111 |         n, err := reader.Read(buffer)
112 |         if err == io.EOF {
113 |             break
114 |         }
115 |         if err != nil {
116 |             log.Fatal(err)
117 |         }
118 | 
119 |         // Process buffer[:n]
120 |         fmt.Printf("Read %d bytes\n", n)
121 |     }
122 | }
123 | ```
124 | 
125 | ### Configuration
126 | 
127 | ```go
128 | package main
129 | 
130 | import (
131 |     "log"
132 | 
133 |     "github.com/rahulpoonia29/extractous-go"
134 | )
135 | 
136 | func main() {
137 |     // Configure PDF extraction with OCR
138 |     pdfConfig := extractous.NewPdfConfig()
139 |     pdfConfig.SetOcrStrategy(extractous.PdfOcrAuto)
140 |     pdfConfig.SetExtractInlineImages(true)
141 |     pdfConfig.SetExtractAnnotationText(true)
142 | 
143 |     // Configure OCR settings
144 |     ocrConfig := extractous.NewTesseractOcrConfig()
145 |     ocrConfig.SetLanguage("eng")
146 |     ocrConfig.SetDensity(300)
147 | 
148 |     // Apply configurations to extractor
149 |     extractor := extractous.New()
150 |     extractor.SetPdfConfig(pdfConfig)
151 |     extractor.SetTesseractOcrConfig(ocrConfig)
152 |     extractor.SetXmlOutput(true) // Enable structured output
153 | 
154 |     defer extractor.Close()
155 | 
156 |     content, _, err := extractor.ExtractFileToString("scanned_document.pdf")
157 |     if err != nil {
158 |         log.Fatal(err)
159 |     }
160 | 
161 |     log.Println(content)
162 | }
163 | ```
164 | 
165 | ---
166 | 
167 | ## Building
168 | 
169 | The library uses CGO to interface with native libraries. Below are platform-specific build instructions.
170 | 
171 | ### Prerequisites
172 | 
173 | - CGO enabled
174 | - Native libraries
175 | - Platform-specific C compiler
176 | 
177 | ### Linux and macOS
178 | 
179 | ```bash
180 | # Set up environment
181 | export CGO_ENABLED=1
182 | export CC=gcc
183 | export CXX=g++
184 | 
185 | # Set library path for the build
186 | export CGO_LDFLAGS="-L$(pwd)/native/$(go env GOOS)_$(go env GOARCH) -lextractous_ffi"
187 | 
188 | # Build the application
189 | go build -o myapp main.go
190 | 
191 | # Set the library path for runtime before executing
192 | export LD_LIBRARY_PATH="$(pwd)/native/$(go env GOOS)_$(go env GOARCH):$LD_LIBRARY_PATH" # For Linux
193 | export DYLD_LIBRARY_PATH="$(pwd)/native/$(go env GOOS)_$(go env GOARCH):$DYLD_LIBRARY_PATH" # For macOS
194 | 
195 | ./myapp
196 | 
197 | ```
198 | 
199 | ### Windows (PowerShell)
200 | 
201 | ```powershell
202 | # Set up environment
203 | $env:CGO_ENABLED = "1"
204 | $env:CC = "gcc"
205 | $env:CXX = "g++"
206 | 
207 | # Set library path for the build
208 | $env:CGO_LDFLAGS = "-L$pwd\native\windows_amd64 -lextractous_ffi" # Only x86-64 is supported
209 | 
210 | # Build the application
211 | go build -o myapp.exe main.go
212 | 
213 | # Add the DLL to the system's path
214 | $env:Path = "$pwd\native\windows_amd64;" + $env:Path
215 | .\myapp.exe
216 | ```
217 | 
218 | ---
219 | 
220 | ## Error Handling
221 | 
222 | ### Basic Error Handling
223 | 
224 | ```go
225 | content, metadata, err := extractor.ExtractFileToString("document.pdf")
226 | if err != nil {
227 |     // Check error type
228 |     if errors.Is(err, extractous.ErrIO) {
229 |         log.Println("File I/O error")
230 |     } else if errors.Is(err, extractous.ErrExtraction) {
231 |         log.Println("Document extraction failed")
232 |     }
233 | 
234 |     log.Fatal(err)
235 | }
236 | ```
237 | 
238 | ### Error Handling with Debug Info
239 | 
240 | ```go
241 | content, metadata, err := extractor.ExtractFileToString("document.pdf")
242 | if err != nil {
243 |     // Get structured error information
244 |     var extractErr *extractous.ExtractError
245 |     if errors.As(err, &extractErr) {
246 |         fmt.Printf("Error code: %d\n", extractErr.Code)
247 |         fmt.Printf("Message: %s\n", extractErr.Message)
248 | 
249 |         // Optionally get detailed debug information
250 |         // (includes full error chain and backtrace if available)
251 |         if debug := extractErr.Debug(); debug != "" {
252 |             fmt.Printf("Debug info:\n%s\n", debug)
253 |         }
254 |     }
255 | }
256 | ```
257 | ---
258 | 
259 | ## Performance
260 | 
261 | | Operation          | Throughput (MB/s) | Memory (MB) | Accuracy (%) |
262 | | ------------------ | ----------------- | ----------- | ------------ |
263 | | String Extraction  | 36.70             | 15.78       | 86.95        |
264 | | Stream Extraction  | 14.16             | 21.83       | 87.74        |
265 | | Reference (Go PDF) | 79.38             | 44.67       | 82.02        |
266 | 
267 | ---
268 | 
269 | ## Supported Formats
270 | 
271 | Extractous Go supports PDF, Microsoft Office, OpenDocument, HTML/XML, plain text, images (with OCR) and more.
272 | 
273 | For the full list of supported formats, see [Apache Tika Supported Formats](https://tika.apache.org/2.0.0/formats.html).
274 | 
275 | ---
276 | 
277 | ## Requirements
278 | 
279 | ### Runtime Requirements
280 | 
281 | - Go 1.19 or later
282 | - CGO enabled (`CGO_ENABLED=1`)
283 | - Platform-specific native libraries (provided by installer)
284 | - **Tesseract OCR**: Required only for OCR functionality on images and scanned PDFs
285 |   - Ubuntu/Debian: `sudo apt-get install tesseract-ocr`
286 |   - macOS: `brew install tesseract`
287 |   - Windows: Download from [Tesseract at UB Mannheim](https://github.com/UB-Mannheim/tesseract/wiki)
288 | 
289 | ---
290 | 
291 | ## Distribution
292 | 
293 | When distributing applications built with extractous-go:
294 | 
295 | 1. **Bundle Native Libraries**: Include the platform-specific `.so`, `.dylib`, or `.dll` files with your application.
296 | 
297 | 2. **Set Library Search Path**:
298 |    - **Linux**: Set `LD_LIBRARY_PATH` or install to `/usr/local/lib`
299 |    - **macOS**: Set `DYLD_LIBRARY_PATH` or use `@rpath`
300 |    - **Windows**: Place DLL in the same directory as the executable or in `System32`
301 | 
302 | 3. **Cross-Platform Builds**: Download libraries for all target platforms using:
303 |    ```bash
304 |    go run github.com/rahulpoonia29/extractous-go/cmd/install@latest --all
305 |    ```
306 | 
307 | ---
308 | 
309 | ## Acknowledgments
310 | 
311 | - [Extractous](https://github.com/yobix-ai/extractous) - The underlying Rust library
312 | - [Apache Tika](https://tika.apache.org/) - Document extraction engine
313 | - [Tesseract OCR](https://github.com/tesseract-ocr/tesseract) - OCR engine
314 | 


--------------------------------------------------------------------------------
/errors.go:
--------------------------------------------------------------------------------
  1 | package extractous
  2 | 
  3 | /*
  4 | #include <extractous.h>
  5 | #include <stdlib.h>
  6 | */
  7 | import "C"
  8 | import (
  9 | 	"errors"
 10 | 	"fmt"
 11 | )
 12 | 
 13 | // Sentinel errors that can be checked with errors.Is.
 14 | //
 15 | // These errors represent common failure modes in document extraction. They can
 16 | // be used with errors.Is() and errors.As() for error handling and classification.
 17 | //
 18 | // # Error Handling Pattern
 19 | //
 20 | //	content, _, err := extractor.ExtractFileToString("document.pdf")
 21 | //	if err != nil {
 22 | //	    if errors.Is(err, extractous.ErrIO) {
 23 | //	        // File not found or not readable
 24 | //	        log.Printf("File error: %v", err)
 25 | //	    } else if errors.Is(err, extractous.ErrExtraction) {
 26 | //	        // Document format issue or corruption
 27 | //	        log.Printf("Extraction error: %v", err)
 28 | //	    } else {
 29 | //	        // Other error
 30 | //	        log.Printf("Unknown error: %v", err)
 31 | //	    }
 32 | //	    return
 33 | //	}
 34 | //
 35 | // # Error Unwrapping
 36 | //
 37 | // All errors returned by this package can be unwrapped to get the sentinel error:
 38 | //
 39 | //	var extractErr *extractous.ExtractError
 40 | //	if errors.As(err, &extractErr) {
 41 | //	    fmt.Printf("Error code: %d\n", extractErr.Code)
 42 | //	    fmt.Printf("Message: %s\n", extractErr.Message)
 43 | //	    fmt.Printf("Type: %v\n", errors.Unwrap(extractErr))
 44 | //	}
 45 | var (
 46 | 	// ErrNullPointer indicates a null pointer was encountered internally.
 47 | 	//
 48 | 	// This error typically indicates a programming error or corrupted internal
 49 | 	// state. It should not occur during normal operation.
 50 | 	//
 51 | 	// Common causes:
 52 | 	//   - Using an extractor after calling Close()
 53 | 	//   - Passing a nil configuration to a setter
 54 | 	//   - Internal FFI layer issues
 55 | 	//
 56 | 	// Example:
 57 | 	//
 58 | 	//	extractor := extractous.New()
 59 | 	//	extractor.Close()
 60 | 	//	_, _, err := extractor.ExtractFileToString("doc.pdf")
 61 | 	//	if errors.Is(err, extractous.ErrNullPointer) {
 62 | 	//	    // Extractor was already closed
 63 | 	//	}
 64 | 	ErrNullPointer = errors.New("null pointer provided")
 65 | 
 66 | 	// ErrInvalidUTF8 indicates a string contains invalid UTF-8 sequences.
 67 | 	//
 68 | 	// This can occur when:
 69 | 	//   - Extracting documents with corrupt character encoding
 70 | 	//   - Document claims to be UTF-8 but contains invalid sequences
 71 | 	//   - Character set mismatch between document and configuration
 72 | 	//
 73 | 	// Example handling:
 74 | 	//
 75 | 	//	if errors.Is(err, extractous.ErrInvalidUTF8) {
 76 | 	//	    // Try with a different encoding
 77 | 	//	    extractor = extractor.SetEncoding(extractous.CharSetUSASCII)
 78 | 	//	    content, _, err = extractor.ExtractFileToString(path)
 79 | 	//	}
 80 | 	ErrInvalidUTF8 = errors.New("invalid UTF-8 string")
 81 | 
 82 | 	// ErrInvalidString indicates string conversion or processing failed.
 83 | 	//
 84 | 	// This error occurs when:
 85 | 	//   - String parameters cannot be converted properly
 86 | 	//   - Extracted text contains characters that cannot be represented
 87 | 	//   - Internal string buffer operations fail
 88 | 	//
 89 | 	// This is less common than ErrInvalidUTF8 and typically indicates a more
 90 | 	// fundamental issue with the document or extraction process.
 91 | 	ErrInvalidString = errors.New("string conversion failed")
 92 | 
 93 | 	// ErrExtraction indicates document extraction failed.
 94 | 	//
 95 | 	// This is the most common error and can have many causes:
 96 | 	//   - Unsupported document format
 97 | 	//   - Corrupted or malformed document
 98 | 	//   - Document is encrypted or password-protected
 99 | 	//   - Document uses unsupported features
100 | 	//   - OCR processing failed
101 | 	//
102 | 	// The ExtractError.Message field usually contains specific details about
103 | 	// what went wrong.
104 | 	//
105 | 	// Example handling:
106 | 	//
107 | 	//	if errors.Is(err, extractous.ErrExtraction) {
108 | 	//	    var extractErr *extractous.ExtractError
109 | 	//	    if errors.As(err, &extractErr) {
110 | 	//	        log.Printf("Extraction failed: %s", extractErr.Message)
111 | 	//	        // Try alternative extraction method or skip document
112 | 	//	    }
113 | 	//	}
114 | 	ErrExtraction = errors.New("extraction failed")
115 | 
116 | 	// ErrIO indicates an I/O operation failed.
117 | 	//
118 | 	// Common causes:
119 | 	//   - File not found
120 | 	//   - Permission denied
121 | 	//   - Disk read/write error
122 | 	//   - Network error (for URL extraction)
123 | 	//   - Out of disk space
124 | 	//
125 | 	// Example handling:
126 | 	//
127 | 	//	if errors.Is(err, extractous.ErrIO) {
128 | 	//	    if os.IsNotExist(err) {
129 | 	//	        log.Println("File not found")
130 | 	//	    } else if os.IsPermission(err) {
131 | 	//	        log.Println("Permission denied")
132 | 	//	    } else {
133 | 	//	        log.Printf("I/O error: %v", err)
134 | 	//	    }
135 | 	//	}
136 | 	ErrIO = errors.New("IO error")
137 | 
138 | 	// ErrInvalidConfig indicates the provided configuration is invalid.
139 | 	//
140 | 	// This error occurs when:
141 | 	//   - Configuration parameter is out of valid range
142 | 	//   - Conflicting configuration options
143 | 	//   - Required configuration is missing
144 | 	//
145 | 	// Example:
146 | 	//
147 | 	//	config := extractous.NewOcrConfig().SetDensity(-100) // Invalid DPI
148 | 	//	extractor := extractous.New().SetOcrConfig(config)
149 | 	//	// May return ErrInvalidConfig when used
150 | 	ErrInvalidConfig = errors.New("invalid configuration")
151 | 
152 | 	// ErrInvalidEnum indicates an invalid enum value was provided.
153 | 	//
154 | 	// This typically indicates a programming error where an enum constant
155 | 	// is used incorrectly or has an unexpected value.
156 | 	//
157 | 	// Example:
158 | 	//
159 | 	//	strategy := extractous.PdfOcrStrategy(999) // Invalid value
160 | 	//	config := extractous.NewPdfConfig().SetOcrStrategy(strategy)
161 | 	//	// May return ErrInvalidEnum
162 | 	ErrInvalidEnum = errors.New("invalid enum value")
163 | )
164 | 
// ExtractError is the structured error value produced for failed FFI calls.
//
// It bundles the numeric code reported by the FFI layer, a human-readable
// message, and one of the package's sentinel errors (ErrNullPointer, ErrIO,
// and so on). The type implements the error interface and provides Unwrap,
// so it works with both errors.Is and errors.As.
//
// Classify a failure with errors.Is:
//
//	if errors.Is(err, extractous.ErrExtraction) {
//	    // handle the extraction failure
//	}
//
// Reach the details with errors.As:
//
//	content, _, err := extractor.ExtractFileToString("corrupt.pdf")
//	if err != nil {
//	    var extractErr *extractous.ExtractError
//	    if errors.As(err, &extractErr) {
//	        switch {
//	        case errors.Is(extractErr, extractous.ErrExtraction):
//	            log.Printf("Extraction failed: %s", extractErr.Message)
//	        case errors.Is(extractErr, extractous.ErrIO):
//	            log.Printf("I/O error: %s", extractErr.Message)
//	        default:
//	            log.Printf("Error %d: %s", extractErr.Code, extractErr.Message)
//	        }
//	    }
//	}
type ExtractError struct {
	// Code is the numeric error code handed back by the FFI layer.
	Code int
	// Message is the user-facing description of what went wrong.
	Message string
	// Err is the wrapped sentinel error that errors.Is matches against.
	Err error
}
218 | 
219 | // newError creates an ExtractError from a C error code.
220 | // Debug details are NOT fetched here to avoid memory overhead.
221 | // Users must explicitly call Debug() to get detailed information.
222 | func newError(code C.int) error {
223 |     if code == errOK {
224 |         return nil
225 |     }
226 | 
227 |     var sentinelErr error
228 |     switch int(code) {
229 |     case errNullPointer:
230 |         sentinelErr = ErrNullPointer
231 |     case errInvalidUTF8:
232 |         sentinelErr = ErrInvalidUTF8
233 |     case errInvalidString:
234 |         sentinelErr = ErrInvalidString
235 |     case errExtractionFailed:
236 |         sentinelErr = ErrExtraction
237 |     case errIOError:
238 |         sentinelErr = ErrIO
239 |     case errInvalidConfig:
240 |         sentinelErr = ErrInvalidConfig
241 |     case errInvalidEnum:
242 |         sentinelErr = ErrInvalidEnum
243 |     default:
244 |         sentinelErr = fmt.Errorf("unknown error code: %d", code)
245 |     }
246 | 
247 |     // Get user-facing error message (fast, small string)
248 |     cMsg := C.extractous_error_message(code)
249 |     var msg string
250 |     if cMsg != nil {
251 |         msg = goString(cMsg)
252 |         C.extractous_string_free(cMsg)
253 |     }
254 | 
255 |     return &ExtractError{
256 |         Code:    int(code),
257 |         Message: msg,
258 |         Err:     sentinelErr,
259 |     }
260 | }
261 | 
262 | // Error implements the error interface.
263 | // Returns user-friendly error message.
264 | func (e *ExtractError) Error() string {
265 |     if e.Message != "" {
266 |         return fmt.Sprintf("extractous error (code %d): %s", e.Code, e.Message)
267 |     }
268 |     return fmt.Sprintf("extractous error (code %d)", e.Code)
269 | }
270 | 
271 | // Unwrap returns the underlying sentinel error for errors.Is() support
272 | func (e *ExtractError) Unwrap() error {
273 |     return e.Err
274 | }
275 | 
276 | // Debug retrieves detailed debug information for the last error
277 | // that occurred on the current thread.
278 | //
279 | // This function is EXPENSIVE - it formats the full error chain with
280 | // backtrace (if RUST_BACKTRACE=1). Only call it when you actually
281 | // need detailed debugging information.
282 | //
283 | // **Important**: This clears the stored error. Subsequent calls to
284 | // Debug() will return empty string unless a new error occurs.
285 | //
286 | // Example:
287 | //
288 | //  _, _, err := extractor.ExtractFileToString("corrupt.pdf")
289 | //  if err != nil {
290 | //      var extractErr *extractous.ExtractError
291 | //      if errors.As(err, &extractErr) {
292 | //          // Show user-facing error
293 | //          fmt.Printf("Error: %s\n", extractErr.Error())
294 | //
295 | //          // Optionally get debug details (for developers only)
296 | //          if debug := extractErr.Debug(); debug != "" {
297 | //              log.Printf("DEBUG:\n%s", debug)
298 | //          }
299 | //      }
300 | //  }
301 | func (e *ExtractError) Debug() string {
302 |     cDebug := C.extractous_error_get_last_debug()
303 |     if cDebug == nil {
304 |         return ""
305 |     }
306 |     defer C.extractous_string_free(cDebug)
307 |     return goString(cDebug)
308 | }
309 | 
310 | // HasDebug checks if debug information is available for the last error
311 | // on the current thread without retrieving it.
312 | //
313 | // This is useful to avoid the overhead of Debug() when no error is stored.
314 | func (e *ExtractError) HasDebug() bool {
315 |     return C.extractous_error_has_debug() != 0
316 | }
317 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/types.go:
--------------------------------------------------------------------------------
  1 | package extractous
  2 | 
  3 | // CharSet represents character encoding for text extraction.
  4 | //
  5 | // Character sets determine how bytes are interpreted as text characters. Most
  6 | // modern documents use UTF-8, which supports all Unicode characters. Other
  7 | // encodings are provided for legacy compatibility.
  8 | //
  9 | // # Supported Encodings
 10 | //
 11 | //   - CharSetUTF8: Unicode UTF-8 (default, recommended for all modern uses)
 12 | //   - CharSetUSASCII: US-ASCII (7-bit, legacy systems only)
 13 | //   - CharSetUTF16BE: UTF-16 Big Endian (some legacy systems)
 14 | //
 15 | // # When to Use Different Encodings
 16 | //
 17 | // UTF-8 (default): Use for all modern applications. It's the universal standard
 18 | // and supports all languages and symbols.
 19 | //
 20 | // US-ASCII: Only for legacy systems that cannot handle Unicode. This encoding
 21 | // only supports basic English characters (a-z, A-Z, 0-9, and basic punctuation).
 22 | // Any characters outside this range will be lost or corrupted.
 23 | //
 24 | // UTF-16BE: Rare. Only needed for specific legacy systems that require this
 25 | // encoding. Modern systems should use UTF-8.
 26 | //
 27 | // Example:
 28 | //
 29 | //	// Default UTF-8 (recommended)
 30 | //	extractor := extractous.New()
 31 | //
 32 | //	// Explicitly set UTF-8
 33 | //	extractor := extractous.New().
 34 | //	    SetEncoding(extractous.CharSetUTF8)
 35 | //
 36 | //	// Legacy ASCII (not recommended)
 37 | //	extractor := extractous.New().
 38 | //	    SetEncoding(extractous.CharSetUSASCII)
 39 | type CharSet int
 40 | 
 41 | const (
 42 | 	// CharSetUTF8 is UTF-8 encoding (default, recommended).
 43 | 	//
 44 | 	// UTF-8 is the universal standard character encoding that supports all
 45 | 	// Unicode characters, including:
 46 | 	//   - All human languages (Latin, Cyrillic, Arabic, CJK, etc.)
 47 | 	//   - Emojis and symbols
 48 | 	//   - Mathematical notation
 49 | 	//   - Technical symbols
 50 | 	//
 51 | 	// UTF-8 is backward compatible with ASCII, space-efficient, and the
 52 | 	// default encoding for modern systems.
 53 | 	//
 54 | 	// Use this for all new applications unless you have a specific requirement
 55 | 	// for another encoding.
 56 | 	CharSetUTF8 CharSet = 0
 57 | 
 58 | 	// CharSetUSASCII is US-ASCII encoding (7-bit, legacy only).
 59 | 	//
 60 | 	// US-ASCII supports only 128 characters:
 61 | 	//   - Uppercase letters: A-Z
 62 | 	//   - Lowercase letters: a-z
 63 | 	//   - Digits: 0-9
 64 | 	//   - Basic punctuation and symbols
 65 | 	//
 66 | 	// Characters outside this range (accented letters, non-Latin scripts, etc.)
 67 | 	// will be lost or converted to placeholders.
 68 | 	//
 69 | 	// Only use this encoding if:
 70 | 	//   - You need compatibility with very old systems
 71 | 	//   - Your documents contain only basic English text
 72 | 	//   - You have explicit requirements for ASCII-only output
 73 | 	//
 74 | 	// Warning: This encoding cannot represent most international text.
 75 | 	CharSetUSASCII CharSet = 1
 76 | 
 77 | 	// CharSetUTF16BE is UTF-16 Big Endian encoding (legacy).
 78 | 	//
 79 | 	// UTF-16 uses 16-bit code units and can represent all Unicode characters.
 80 | 	// The "Big Endian" variant stores the most significant byte first.
 81 | 	//
 82 | 	// This encoding is less space-efficient than UTF-8 for most text and is
 83 | 	// primarily used for:
 84 | 	//   - Windows internal APIs (UTF-16LE, not UTF-16BE)
 85 | 	//   - Java internal string representation
 86 | 	//   - Some legacy systems
 87 | 	//
 88 | 	// Modern systems should use UTF-8 instead. Only use UTF-16BE if you have
 89 | 	// explicit requirements for this encoding.
 90 | 	CharSetUTF16BE CharSet = 2
 91 | )
 92 | 
 93 | // String returns the human-readable name of the character set.
 94 | //
 95 | // This is useful for logging, debugging, and displaying the current encoding
 96 | // configuration to users.
 97 | //
 98 | // Example:
 99 | //
100 | //	charset := extractous.CharSetUTF8
101 | //	fmt.Println(charset.String()) // Output: "UTF-8"
102 | func (c CharSet) String() string {
103 | 	switch c {
104 | 	case CharSetUTF8:
105 | 		return "UTF-8"
106 | 	case CharSetUSASCII:
107 | 		return "US-ASCII"
108 | 	case CharSetUTF16BE:
109 | 		return "UTF-16BE"
110 | 	default:
111 | 		return "Unknown"
112 | 	}
113 | }
114 | 
// PdfOcrStrategy selects how OCR is applied when extracting PDF documents.
//
// A PDF can carry text in two ways:
//   - embedded text, stored directly in the file and selectable in a viewer
//   - image-based text, visible only as pixels and therefore not selectable
//
// The strategy decides which of the two sources the extractor reads.
//
// # Strategies
//
// PdfOcrNoOcr (fastest):
//   - reads embedded text only and never runs OCR
//   - suits PDFs with selectable text, e-books and other digital documents
//   - unsuitable for scanned documents or photos of text
//
// PdfOcrAuto (recommended):
//   - detects pages without embedded text and runs OCR on those pages only
//   - balances speed and completeness
//   - suits mixed documents and documents of unknown type
//
// PdfOcrOcrOnly (specialized):
//   - runs OCR on every page and ignores embedded text
//   - helps when the embedded text layer is corrupt or incorrect
//   - slow, and a poor fit for general-purpose extraction
//
// PdfOcrOcrAndTextExtraction (comprehensive):
//   - reads embedded text AND runs OCR
//   - most complete but also the slowest; may yield duplicate content
//   - suits maximum-content extraction and forensic analysis
//   - a poor fit for production systems because of its cost
//
// # Performance
//
// OCR is computationally expensive. PdfOcrNoOcr is described as roughly
// 100-1000x faster than the OCR-based strategies (a rough estimate, not a
// measured guarantee). The cost of PdfOcrAuto depends on how many pages
// need OCR. PdfOcrOcrOnly processes every page with OCR, and
// PdfOcrOcrAndTextExtraction does the most work overall.
//
// Example:
//
//	// Digital PDF with embedded text (fast)
//	pdfConfig := extractous.NewPdfConfig().
//	    SetOcrStrategy(extractous.PdfOcrNoOcr)
//
//	// Scanned or mixed document (OCR only where needed)
//	pdfConfig = extractous.NewPdfConfig().
//	    SetOcrStrategy(extractous.PdfOcrAuto)
//
//	// Embedded text plus OCR on everything (slow but comprehensive)
//	pdfConfig = extractous.NewPdfConfig().
//	    SetOcrStrategy(extractous.PdfOcrOcrAndTextExtraction)
type PdfOcrStrategy int
176 | 
177 | const (
178 | 	// PdfOcrNoOcr extracts only embedded text, no OCR processing (fastest).
179 | 	//
180 | 	// This strategy only extracts selectable text that is directly embedded in
181 | 	// the PDF. It does NOT perform OCR on images or scanned pages.
182 | 	//
183 | 	// Use when:
184 | 	//   - You know the PDFs contain selectable text
185 | 	//   - Performance is critical
186 | 	//   - Processing digital documents (not scans)
187 | 	//
188 | 	// This is the fastest strategy, typically 100-1000x faster than OCR-based
189 | 	// strategies.
190 | 	//
191 | 	// Example:
192 | 	//
193 | 	//	// Fast extraction for digital PDFs
194 | 	//	config := extractous.NewPdfConfig().
195 | 	//	    SetOcrStrategy(extractous.PdfOcrNoOcr)
196 | 	//
197 | 	// Note: Scanned documents and images will produce little or no text with
198 | 	// this strategy.
199 | 	PdfOcrNoOcr PdfOcrStrategy = 0
200 | 
201 | 	// PdfOcrOcrOnly performs OCR on all pages, ignores embedded text.
202 | 	//
203 | 	// This strategy applies OCR to all pages regardless of whether they contain
204 | 	// embedded text. It's useful when the embedded text is corrupt, incorrect,
205 | 	// or lower quality than what OCR would produce.
206 | 	//
207 | 	// Use when:
208 | 	//   - The PDF has a broken or incorrect text layer
209 | 	//   - You want consistent OCR output across all pages
210 | 	//   - You need to extract from pure image PDFs
211 | 	//
212 | 	// Warning: This is very slow as it performs OCR on every page, even pages
213 | 	// that already have good embedded text.
214 | 	//
215 | 	// Example:
216 | 	//
217 | 	//	// Force OCR for PDFs with broken text layers
218 | 	//	config := extractous.NewPdfConfig().
219 | 	//	    SetOcrStrategy(extractous.PdfOcrOcrOnly)
220 | 	//
221 | 	// Note: Requires Tesseract OCR to be installed on the system.
222 | 	PdfOcrOcrOnly PdfOcrStrategy = 1
223 | 
224 | 	// PdfOcrOcrAndTextExtraction extracts both embedded text AND performs OCR.
225 | 	//
226 | 	// This strategy is the most comprehensive, extracting both:
227 | 	//   1. Embedded text from the PDF text layer
228 | 	//   2. Text via OCR from images and visual content
229 | 	//
230 | 	// This can produce duplicate content if the same text appears both as
231 | 	// embedded text and in images.
232 | 	//
233 | 	// Use when:
234 | 	//   - Maximum content extraction is required
235 | 	//   - You need both text layers for comparison
236 | 	//   - Forensic analysis or complete document preservation
237 | 	//
238 | 	// Warning: This is the slowest strategy, combining all extraction methods.
239 | 	// It may also produce duplicate or redundant content.
240 | 	//
241 | 	// Example:
242 | 	//
243 | 	//	// Maximum extraction for forensic analysis
244 | 	//	config := extractous.NewPdfConfig().
245 | 	//	    SetOcrStrategy(extractous.PdfOcrOcrAndTextExtraction)
246 | 	//
247 | 	// Note: Best for offline processing where completeness matters more than
248 | 	// speed or deduplication.
249 | 	PdfOcrOcrAndTextExtraction PdfOcrStrategy = 2
250 | 
251 | 	// PdfOcrAuto automatically decides based on page content (recommended).
252 | 	//
253 | 	// This strategy intelligently detects whether each page has embedded text:
254 | 	//   - Pages WITH embedded text: Extract directly (fast)
255 | 	//   - Pages WITHOUT embedded text: Apply OCR (slower)
256 | 	//
257 | 	// This provides the best balance of performance and completeness for
258 | 	// documents of unknown type or mixed content.
259 | 	//
260 | 	// Use when:
261 | 	//   - Document type is unknown
262 | 	//   - Handling mixed documents (some pages scanned, some digital)
263 | 	//   - You want good default behavior
264 | 	//
265 | 	// This is the recommended strategy for general-purpose PDF extraction.
266 | 	//
267 | 	// Example:
268 | 	//
269 | 	//	// Smart extraction that adapts to content
270 | 	//	config := extractous.NewPdfConfig().
271 | 	//	    SetOcrStrategy(extractous.PdfOcrAuto)
272 | 	//
273 | 	// Performance: Variable depending on content. Pages with embedded text are
274 | 	// processed quickly; only scanned pages incur OCR overhead.
275 | 	PdfOcrAuto PdfOcrStrategy = 3
276 | )
277 | 
278 | // String returns the human-readable name of the OCR strategy.
279 | //
280 | // This is useful for logging, debugging, and displaying the current OCR
281 | // configuration to users.
282 | //
283 | // Example:
284 | //
285 | //	strategy := extractous.PdfOcrAuto
286 | //	fmt.Println(strategy.String()) // Output: "Auto"
287 | func (s PdfOcrStrategy) String() string {
288 | 	switch s {
289 | 	case PdfOcrNoOcr:
290 | 		return "NoOCR"
291 | 	case PdfOcrOcrOnly:
292 | 		return "OCROnly"
293 | 	case PdfOcrOcrAndTextExtraction:
294 | 		return "OCRAndTextExtraction"
295 | 	case PdfOcrAuto:
296 | 		return "Auto"
297 | 	default:
298 | 		return "Unknown"
299 | 	}
300 | }
301 | 
// Error codes from the FFI layer.
//
// These constants mirror the ERR_* values declared in extractous.h and
// returned by the underlying C FFI library. They are used internally to
// construct Go error values. Application code should use the exported error
// variables (ErrNullPointer, ErrIO, etc.) instead of checking these raw codes.
//
// Keep this list in sync with the header: every ERR_* define must have a
// counterpart here so that no native code is treated as unknown.
//
// Internal use only.
const (
	errOK                = 0   // No error
	errNullPointer       = -1  // Null pointer passed to FFI
	errInvalidUTF8       = -2  // String is not valid UTF-8
	errInvalidString     = -3  // String parameter is invalid
	errExtractionFailed  = -4  // Document extraction failed
	errIOError           = -5  // File I/O error
	errInvalidConfig     = -6  // Configuration is invalid
	errInvalidEnum       = -7  // Enum value is invalid
	errUnsupportedFormat = -8  // ERR_UNSUPPORTED_FORMAT in extractous.h
	errOutOfMemory       = -9  // ERR_OUT_OF_MEMORY in extractous.h
	errOCRFailed         = -10 // ERR_OCR_FAILED in extractous.h
)
320 | 


--------------------------------------------------------------------------------
/ffi/src/extractor.rs:
--------------------------------------------------------------------------------
  1 | use crate::ecore::{CharSet, Extractor as CoreExtractor};
  2 | use crate::errors::*;
  3 | use crate::metadata::metadata_to_c;
  4 | use crate::types::*;
  5 | use std::ffi::{CStr, CString};
  6 | use std::os::raw::c_char;
  7 | use std::ptr;
  8 | 
  9 | /// Creates a new `Extractor` with a default configuration.
 10 | /// The returned handle must be freed with `extractous_extractor_free`.
 11 | // #[must_use]
 12 | #[unsafe(no_mangle)]
 13 | pub extern "C" fn extractous_extractor_new() -> *mut CExtractor {
 14 |     let extractor = Box::new(CoreExtractor::new());
 15 |     Box::into_raw(extractor) as *mut CExtractor
 16 | }
 17 | 
 18 | /// Frees the memory associated with an `Extractor` handle.
 19 | #[unsafe(no_mangle)]
 20 | pub unsafe extern "C" fn extractous_extractor_free(handle: *mut CExtractor) {
 21 |     if !handle.is_null() {
 22 |         unsafe {
 23 |             drop(Box::from_raw(handle as *mut CoreExtractor));
 24 |         }
 25 |     }
 26 | }
 27 | 
/// A macro to update an Extractor instance behind a raw pointer.
///
/// Expands to: return from the enclosing function if `$handle` is null;
/// otherwise move the extractor out of the pointee with `ptr::read`, bind it
/// to `$extractor_val`, evaluate `$body` (which must produce the replacement
/// extractor), and store the result back with `ptr::write`.
///
/// CAUTION: between the `ptr::read` and the `ptr::write` the pointee is
/// logically moved-from. `$body` must therefore not `return` early or panic:
/// doing so drops the moved-out extractor while the handle still refers to
/// it, leaving a dangling handle. Perform all validation before invoking
/// this macro.
macro_rules! update_extractor {
    ($handle:expr, |$extractor_val:ident| $body:block) => {
        // Null handles are ignored: the enclosing function returns immediately.
        if $handle.is_null() {
            return;
        }
        unsafe {
            let extractor_ptr = $handle as *mut CoreExtractor;
            // Bitwise move out of the pointee; it must be written back below.
            let old_extractor = ptr::read(extractor_ptr);
            let new_extractor = {
                let $extractor_val = old_extractor;
                $body
            };
            // Store the replacement without dropping the (moved-from) old value.
            ptr::write(extractor_ptr, new_extractor);
        }
    };
}
 45 | 
 46 | /// Sets the maximum length for extracted string content.
 47 | #[unsafe(no_mangle)]
 48 | pub unsafe extern "C" fn extractous_extractor_set_extract_string_max_length_mut(
 49 |     handle: *mut CExtractor,
 50 |     max_length: libc::c_int,
 51 | ) {
 52 |     update_extractor!(handle, |extractor| {
 53 |         extractor.set_extract_string_max_length(max_length as i32)
 54 |     });
 55 | }
 56 | 
 57 | /// Sets the character encoding for the extracted text.
 58 | #[unsafe(no_mangle)]
 59 | pub unsafe extern "C" fn extractous_extractor_set_encoding_mut(
 60 |     handle: *mut CExtractor,
 61 |     encoding: libc::c_int,
 62 | ) {
 63 |     update_extractor!(handle, |extractor| {
 64 |         let charset = match encoding {
 65 |             CHARSET_UTF_8 => CharSet::UTF_8,
 66 |             CHARSET_US_ASCII => CharSet::US_ASCII,
 67 |             CHARSET_UTF_16BE => CharSet::UTF_16BE,
 68 |             _ => return,
 69 |         };
 70 |         extractor.set_encoding(charset)
 71 |     });
 72 | }
 73 | 
 74 | /// Sets the configuration for the PDF parser.
 75 | #[unsafe(no_mangle)]
 76 | pub unsafe extern "C" fn extractous_extractor_set_pdf_config_mut(
 77 |     handle: *mut CExtractor,
 78 |     config: *const CPdfParserConfig,
 79 | ) {
 80 |     if config.is_null() {
 81 |         return;
 82 |     }
 83 |     update_extractor!(handle, |extractor| {
 84 |         let pdf_config = &*(config as *const crate::ecore::PdfParserConfig);
 85 |         extractor.set_pdf_config(pdf_config.clone())
 86 |     });
 87 | }
 88 | 
 89 | /// Sets the configuration for the Office document parser.
 90 | #[unsafe(no_mangle)]
 91 | pub unsafe extern "C" fn extractous_extractor_set_office_config_mut(
 92 |     handle: *mut CExtractor,
 93 |     config: *const COfficeParserConfig,
 94 | ) {
 95 |     if config.is_null() {
 96 |         return;
 97 |     }
 98 |     update_extractor!(handle, |extractor| {
 99 |         let office_config = &*(config as *const crate::ecore::OfficeParserConfig);
100 |         extractor.set_office_config(office_config.clone())
101 |     });
102 | }
103 | 
104 | /// Sets the configuration for Tesseract OCR.
105 | #[unsafe(no_mangle)]
106 | pub unsafe extern "C" fn extractous_extractor_set_ocr_config_mut(
107 |     handle: *mut CExtractor,
108 |     config: *const CTesseractOcrConfig,
109 | ) {
110 |     if config.is_null() {
111 |         return;
112 |     }
113 |     update_extractor!(handle, |extractor| {
114 |         let ocr_config = &*(config as *const crate::ecore::TesseractOcrConfig);
115 |         extractor.set_ocr_config(ocr_config.clone())
116 |     });
117 | }
118 | 
119 | /// Sets whether to output structured XML instead of plain text.
120 | #[unsafe(no_mangle)]
121 | pub unsafe extern "C" fn extractous_extractor_set_xml_output_mut(
122 |     handle: *mut CExtractor,
123 |     xml_output: bool,
124 | ) {
125 |     update_extractor!(handle, |extractor| { extractor.set_xml_output(xml_output) });
126 | }
127 | 
// Macro to handle the common extraction logic and error wrapping.
//
// Arguments:
//   $handle          - raw extractor handle (`*mut CExtractor`)
//   $out_ptr1/2      - out-parameters that receive the two results
//   $extractor_call  - closure taking `&CoreExtractor` and returning
//                      `Result<(A, B), E>`
//   $success_handler - closure `(out1, out2, A, B)` that stores the results
//
// Evaluates to `ERR_OK` on success. Returns `ERR_NULL_POINTER` from the
// enclosing function when the handle or either out-parameter is null. On an
// extraction error, records it with `set_last_error` and evaluates to the
// matching error code.
//
// Both out-parameters are reset to null before the extraction runs, so a
// caller that inspects them after a failure never sees uninitialized memory.
macro_rules! perform_extraction {
    (
        $handle:expr,
        $out_ptr1:expr,
        $out_ptr2:expr,
        $extractor_call:expr,
        $success_handler:expr
    ) => {{
        if $handle.is_null() || $out_ptr1.is_null() || $out_ptr2.is_null() {
            return ERR_NULL_POINTER;
        }

        // Give the outputs a defined value for every error path below.
        unsafe {
            *$out_ptr1 = ptr::null_mut();
            *$out_ptr2 = ptr::null_mut();
        }

        // Extraction only needs shared access to the extractor.
        let extractor = unsafe { &*($handle as *const CoreExtractor) };

        match $extractor_call(extractor) {
            Ok((res1, res2)) => {
                $success_handler($out_ptr1, $out_ptr2, res1, res2);
                ERR_OK
            }
            Err(e) => {
                let code = extractous_error_to_code(&e);
                set_last_error(e);
                code
            }
        }
    }};
}
157 | 
158 | /// Extracts content and metadata from a local file path into a string.
159 | ///
160 | /// Output strings must be freed with `extractous_string_free`.
161 | /// Output metadata must be freed with `extractous_metadata_free`.
162 | #[unsafe(no_mangle)]
163 | pub unsafe extern "C" fn extractous_extractor_extract_file_to_string(
164 |     handle: *mut CExtractor,
165 |     path: *const c_char,
166 |     out_content: *mut *mut c_char,
167 |     out_metadata: *mut *mut CMetadata,
168 | ) -> libc::c_int {
169 |     if path.is_null() {
170 |         return ERR_NULL_POINTER;
171 |     }
172 |     let path_str = match unsafe { CStr::from_ptr(path).to_str() } {
173 |         Ok(s) => s,
174 |         Err(_) => return ERR_INVALID_UTF8,
175 |     };
176 | 
177 |     perform_extraction!(
178 |         handle,
179 |         out_content,
180 |         out_metadata,
181 |         |extractor: &CoreExtractor| extractor.extract_file_to_string(path_str),
182 |         |out_c: *mut *mut c_char, out_m: *mut *mut CMetadata, content, metadata| {
183 |             unsafe {
184 |                 *out_c = CString::new(content).map_or(ptr::null_mut(), |s| s.into_raw());
185 |                 *out_m = metadata_to_c(metadata);
186 |             }
187 |         }
188 |     )
189 | }
190 | 
191 | /// Extracts content and metadata from a local file path into a stream.
192 | #[unsafe(no_mangle)]
193 | pub unsafe extern "C" fn extractous_extractor_extract_file(
194 |     handle: *mut CExtractor,
195 |     path: *const c_char,
196 |     out_reader: *mut *mut CStreamReader,
197 |     out_metadata: *mut *mut CMetadata,
198 | ) -> libc::c_int {
199 |     if path.is_null() {
200 |         return ERR_NULL_POINTER;
201 |     }
202 |     let path_str = match unsafe { CStr::from_ptr(path).to_str() } {
203 |         Ok(s) => s,
204 |         Err(_) => return ERR_INVALID_UTF8,
205 |     };
206 | 
207 |     perform_extraction!(
208 |         handle,
209 |         out_reader,
210 |         out_metadata,
211 |         |extractor: &CoreExtractor| extractor.extract_file(path_str),
212 |         |out_r: *mut *mut CStreamReader, out_m: *mut *mut CMetadata, reader, metadata| {
213 |             unsafe {
214 |                 *out_r = Box::into_raw(Box::new(reader)) as *mut CStreamReader;
215 |                 *out_m = metadata_to_c(metadata);
216 |             }
217 |         }
218 |     )
219 | }
220 | 
221 | /// Extracts content and metadata from a byte slice into a string.
222 | #[unsafe(no_mangle)]
223 | pub unsafe extern "C" fn extractous_extractor_extract_bytes_to_string(
224 |     handle: *mut CExtractor,
225 |     data: *const u8,
226 |     data_len: libc::size_t,
227 |     out_content: *mut *mut c_char,
228 |     out_metadata: *mut *mut CMetadata,
229 | ) -> libc::c_int {
230 |     if data.is_null() {
231 |         return ERR_NULL_POINTER;
232 |     }
233 |     let bytes = unsafe { std::slice::from_raw_parts(data, data_len) };
234 | 
235 |     perform_extraction!(
236 |         handle,
237 |         out_content,
238 |         out_metadata,
239 |         |extractor: &CoreExtractor| extractor.extract_bytes_to_string(bytes),
240 |         |out_c: *mut *mut c_char, out_m: *mut *mut CMetadata, content, metadata| {
241 |             unsafe {
242 |                 *out_c = CString::new(content).map_or(ptr::null_mut(), |s| s.into_raw());
243 |                 *out_m = metadata_to_c(metadata);
244 |             }
245 |         }
246 |     )
247 | }
248 | 
249 | /// Extracts content and metadata from a byte slice into a stream.
250 | #[unsafe(no_mangle)]
251 | pub unsafe extern "C" fn extractous_extractor_extract_bytes(
252 |     handle: *mut CExtractor,
253 |     data: *const u8,
254 |     data_len: libc::size_t,
255 |     out_reader: *mut *mut CStreamReader,
256 |     out_metadata: *mut *mut CMetadata,
257 | ) -> libc::c_int {
258 |     if data.is_null() {
259 |         return ERR_NULL_POINTER;
260 |     }
261 |     let bytes = unsafe { std::slice::from_raw_parts(data, data_len) };
262 | 
263 |     perform_extraction!(
264 |         handle,
265 |         out_reader,
266 |         out_metadata,
267 |         |extractor: &CoreExtractor| extractor.extract_bytes(bytes),
268 |         |out_r: *mut *mut CStreamReader, out_m: *mut *mut CMetadata, reader, metadata| {
269 |             unsafe {
270 |                 *out_r = Box::into_raw(Box::new(reader)) as *mut CStreamReader;
271 |                 *out_m = metadata_to_c(metadata);
272 |             }
273 |         }
274 |     )
275 | }
276 | 
277 | /// Extracts content and metadata from a URL into a string.
278 | #[unsafe(no_mangle)]
279 | pub unsafe extern "C" fn extractous_extractor_extract_url_to_string(
280 |     handle: *mut CExtractor,
281 |     url: *const c_char,
282 |     out_content: *mut *mut c_char,
283 |     out_metadata: *mut *mut CMetadata,
284 | ) -> libc::c_int {
285 |     if url.is_null() {
286 |         return ERR_NULL_POINTER;
287 |     }
288 |     let url_str = match unsafe { CStr::from_ptr(url).to_str() } {
289 |         Ok(s) => s,
290 |         Err(_) => return ERR_INVALID_UTF8,
291 |     };
292 | 
293 |     perform_extraction!(
294 |         handle,
295 |         out_content,
296 |         out_metadata,
297 |         |extractor: &CoreExtractor| extractor.extract_url_to_string(url_str),
298 |         |out_c: *mut *mut c_char, out_m: *mut *mut CMetadata, content, metadata| {
299 |             unsafe {
300 |                 *out_c = CString::new(content).map_or(ptr::null_mut(), |s| s.into_raw());
301 |                 *out_m = metadata_to_c(metadata);
302 |             }
303 |         }
304 |     )
305 | }
306 | 
307 | /// Extracts content and metadata from a URL into a stream.
308 | #[unsafe(no_mangle)]
309 | pub unsafe extern "C" fn extractous_extractor_extract_url(
310 |     handle: *mut CExtractor,
311 |     url: *const c_char,
312 |     out_reader: *mut *mut CStreamReader,
313 |     out_metadata: *mut *mut CMetadata,
314 | ) -> libc::c_int {
315 |     if url.is_null() {
316 |         return ERR_NULL_POINTER;
317 |     }
318 |     let url_str = match unsafe { CStr::from_ptr(url).to_str() } {
319 |         Ok(s) => s,
320 |         Err(_) => return ERR_INVALID_UTF8,
321 |     };
322 | 
323 |     perform_extraction!(
324 |         handle,
325 |         out_reader,
326 |         out_metadata,
327 |         |extractor: &CoreExtractor| extractor.extract_url(url_str),
328 |         |out_r: *mut *mut CStreamReader, out_m: *mut *mut CMetadata, reader, metadata| {
329 |             unsafe {
330 |                 *out_r = Box::into_raw(Box::new(reader)) as *mut CStreamReader;
331 |                 *out_m = metadata_to_c(metadata);
332 |             }
333 |         }
334 |     )
335 | }
336 | 
/// Releases a C string previously handed out by this library.
///
/// A null pointer is accepted and ignored.
#[unsafe(no_mangle)]
pub unsafe extern "C" fn extractous_string_free(s: *mut c_char) {
    if s.is_null() {
        return;
    }
    // Reconstitute the CString so its buffer is deallocated on drop.
    let owned = unsafe { CString::from_raw(s) };
    drop(owned);
}
344 | 


--------------------------------------------------------------------------------
/tests/go/integration_test.go:
--------------------------------------------------------------------------------
  1 | package extractous_test
  2 | 
  3 | import (
  4 | 	"os"
  5 | 	"path/filepath"
  6 | 	"strings"
  7 | 	"testing"
  8 | 
  9 | 	extractous "github.com/rahulpoonia29/extractous-go"
 10 | )
 11 | 
 12 | // ============================================================================
 13 | // Test Setup
 14 | // ============================================================================
 15 | 
// testDataDir is the directory, relative to this test package, in which the
// helpers below create their temporary fixture files.
const (
	testDataDir = "../testdata"
)
 19 | 
 20 | func setupTestDir(t *testing.T) string {
 21 | 	dir := filepath.Join(testDataDir)
 22 | 	if err := os.MkdirAll(dir, 0755); err != nil {
 23 | 		t.Fatalf("Failed to create test data directory: %v", err)
 24 | 	}
 25 | 	return dir
 26 | }
 27 | 
 28 | func createTestFile(t *testing.T, filename, content string) string {
 29 | 	dir := setupTestDir(t)
 30 | 	filePath := filepath.Join(dir, filename)
 31 | 
 32 | 	if err := os.WriteFile(filePath, []byte(content), 0644); err != nil {
 33 | 		t.Fatalf("Failed to create test file: %v", err)
 34 | 	}
 35 | 
 36 | 	return filePath
 37 | }
 38 | 
 39 | // ============================================================================
 40 | // Text File Tests
 41 | // ============================================================================
 42 | 
 43 | func TestIntegration_ExtractPlainText(t *testing.T) {
 44 | 	content := "Hello, World!\nThis is a test file."
 45 | 	filePath := createTestFile(t, "test.txt", content)
 46 | 	defer os.Remove(filePath)
 47 | 
 48 | 	extractor := extractous.New()
 49 | 	if extractor == nil {
 50 | 		t.Fatal("Failed to create extractor")
 51 | 	}
 52 | 	defer extractor.Close()
 53 | 
 54 | 	extracted, metadata, err := extractor.ExtractFileToString(filePath)
 55 | 	if err != nil {
 56 | 		t.Fatalf("ExtractFileToString failed: %v", err)
 57 | 	}
 58 | 
 59 | 	if !strings.Contains(extracted, "Hello, World!") {
 60 | 		t.Errorf("Expected content not found in extracted text: %s", extracted)
 61 | 	}
 62 | 
 63 | 	if !strings.Contains(extracted, "This is a test file") {
 64 | 		t.Errorf("Expected content not found in extracted text: %s", extracted)
 65 | 	}
 66 | 
 67 | 	if metadata == nil {
 68 | 		t.Error("Expected non-nil metadata")
 69 | 	}
 70 | 
 71 | 	// Check for common metadata
 72 | 	if contentType := metadata.Get("Content-Type"); contentType == "" {
 73 | 		t.Log("Warning: Content-Type not found in metadata")
 74 | 	}
 75 | }
 76 | 
 77 | func TestIntegration_ExtractBytes(t *testing.T) {
 78 | 	content := "Test content for bytes extraction"
 79 | 
 80 | 	extractor := extractous.New()
 81 | 	if extractor == nil {
 82 | 		t.Fatal("Failed to create extractor")
 83 | 	}
 84 | 	defer extractor.Close()
 85 | 
 86 | 	extracted, metadata, err := extractor.ExtractBytesToString([]byte(content))
 87 | 	if err != nil {
 88 | 		t.Fatalf("ExtractBytesToString failed: %v", err)
 89 | 	}
 90 | 
 91 | 	if !strings.Contains(extracted, content) {
 92 | 		t.Errorf("Expected content not found. Got: %s", extracted)
 93 | 	}
 94 | 
 95 | 	if metadata == nil {
 96 | 		t.Error("Expected non-nil metadata")
 97 | 	}
 98 | }
 99 | 
100 | func TestIntegration_ExtractBytesStream(t *testing.T) {
101 | 	content := "Test content for streaming bytes extraction"
102 | 
103 | 	extractor := extractous.New()
104 | 	if extractor == nil {
105 | 		t.Fatal("Failed to create extractor")
106 | 	}
107 | 	defer extractor.Close()
108 | 
109 | 	stream, metadata, err := extractor.ExtractBytes([]byte(content))
110 | 	if err != nil {
111 | 		t.Fatalf("ExtractBytes failed: %v", err)
112 | 	}
113 | 
114 | 	if stream == nil {
115 | 		t.Fatal("Expected non-nil stream")
116 | 	}
117 | 
118 | 	if metadata == nil {
119 | 		t.Error("Expected non-nil metadata")
120 | 	}
121 | 
122 | 	// Read content from stream
123 | 	extractedBytes := make([]byte, 1024)
124 | 	n, _ := stream.Read(extractedBytes)
125 | 
126 | 	extracted := string(extractedBytes[:n])
127 | 	if !strings.Contains(extracted, content) {
128 | 		t.Errorf("Expected content not found in stream. Got: %s", extracted)
129 | 	}
130 | }
131 | 
132 | // ============================================================================
133 | // Configuration Tests
134 | // ============================================================================
135 | 
136 | func TestIntegration_MaxLengthConfiguration(t *testing.T) {
137 | 	// Create a long text file
138 | 	longContent := strings.Repeat("A", 10000)
139 | 	filePath := createTestFile(t, "long_test.txt", longContent)
140 | 	defer os.Remove(filePath)
141 | 
142 | 	// Extract with small max length
143 | 	extractor := extractous.New().SetExtractStringMaxLength(100)
144 | 	if extractor == nil {
145 | 		t.Fatal("Failed to create extractor")
146 | 	}
147 | 	defer extractor.Close()
148 | 
149 | 	extracted, _, err := extractor.ExtractFileToString(filePath)
150 | 	if err != nil {
151 | 		t.Fatalf("ExtractFileToString failed: %v", err)
152 | 	}
153 | 
154 | 	// Extracted content should be truncated (or close to max length)
155 | 	if len(extracted) > 200 { // Some overhead is allowed
156 | 		t.Logf("Warning: Extracted %d chars, expected ~100", len(extracted))
157 | 	}
158 | }
159 | 
160 | func TestIntegration_EncodingConfiguration(t *testing.T) {
161 | 	content := "UTF-8 content: こんにちは"
162 | 	filePath := createTestFile(t, "utf8_test.txt", content)
163 | 	defer os.Remove(filePath)
164 | 
165 | 	extractor := extractous.New().SetEncoding(extractous.CharSetUTF8)
166 | 	if extractor == nil {
167 | 		t.Fatal("Failed to create extractor")
168 | 	}
169 | 	defer extractor.Close()
170 | 
171 | 	extracted, _, err := extractor.ExtractFileToString(filePath)
172 | 	if err != nil {
173 | 		t.Fatalf("ExtractFileToString failed: %v", err)
174 | 	}
175 | 
176 | 	if !strings.Contains(extracted, "UTF-8") {
177 | 		t.Logf("Extracted content: %s", extracted)
178 | 	}
179 | }
180 | 
181 | func TestIntegration_XmlOutputConfiguration(t *testing.T) {
182 | 	content := "Test content for XML output"
183 | 	filePath := createTestFile(t, "xml_test.txt", content)
184 | 	defer os.Remove(filePath)
185 | 
186 | 	// Test with XML output enabled
187 | 	extractor := extractous.New().SetXmlOutput(true)
188 | 	if extractor == nil {
189 | 		t.Fatal("Failed to create extractor")
190 | 	}
191 | 	defer extractor.Close()
192 | 
193 | 	extracted, _, err := extractor.ExtractFileToString(filePath)
194 | 	if err != nil {
195 | 		t.Fatalf("ExtractFileToString failed: %v", err)
196 | 	}
197 | 
198 | 	// XML output should contain XML tags
199 | 	if !strings.Contains(extracted, "<") {
200 | 		t.Logf("Warning: XML output doesn't seem to contain XML tags: %s", extracted[:min(100, len(extracted))])
201 | 	}
202 | 
203 | 	// Test with XML output disabled
204 | 	extractor2 := extractous.New().SetXmlOutput(false)
205 | 	if extractor2 == nil {
206 | 		t.Fatal("Failed to create extractor")
207 | 	}
208 | 	defer extractor2.Close()
209 | 
210 | 	extracted2, _, err := extractor2.ExtractFileToString(filePath)
211 | 	if err != nil {
212 | 		t.Fatalf("ExtractFileToString failed: %v", err)
213 | 	}
214 | 
215 | 	if strings.Contains(extracted2, "Test content") {
216 | 		t.Log("Plain text extraction successful")
217 | 	}
218 | }
219 | 
220 | // ============================================================================
221 | // Metadata Tests
222 | // ============================================================================
223 | 
224 | func TestIntegration_MetadataExtraction(t *testing.T) {
225 | 	content := "Test content"
226 | 	filePath := createTestFile(t, "metadata_test.txt", content)
227 | 	defer os.Remove(filePath)
228 | 
229 | 	extractor := extractous.New()
230 | 	if extractor == nil {
231 | 		t.Fatal("Failed to create extractor")
232 | 	}
233 | 	defer extractor.Close()
234 | 
235 | 	_, metadata, err := extractor.ExtractFileToString(filePath)
236 | 	if err != nil {
237 | 		t.Fatalf("ExtractFileToString failed: %v", err)
238 | 	}
239 | 
240 | 	if metadata == nil {
241 | 		t.Fatal("Expected non-nil metadata")
242 | 	}
243 | 
244 | 	// Test metadata methods
245 | 	if !metadata.Has("Content-Type") {
246 | 		t.Log("Warning: Content-Type not found in metadata")
247 | 	}
248 | 
249 | 	keys := metadata.Keys()
250 | 	if len(keys) == 0 {
251 | 		t.Error("Expected some metadata keys")
252 | 	}
253 | 
254 | 	t.Logf("Metadata keys: %v", keys)
255 | 
256 | 	// Test Get method
257 | 	for _, key := range keys {
258 | 		value := metadata.Get(key)
259 | 		if value == "" {
260 | 			t.Errorf("Get returned empty string for existing key: %s", key)
261 | 		}
262 | 		t.Logf("%s: %s", key, value)
263 | 	}
264 | 
265 | 	// Test GetAll method
266 | 	for _, key := range keys {
267 | 		values := metadata.GetAll(key)
268 | 		if len(values) == 0 {
269 | 			t.Errorf("GetAll returned nil/empty for existing key: %s", key)
270 | 		}
271 | 	}
272 | }
273 | 
274 | func TestIntegration_MetadataWithMultipleValues(t *testing.T) {
275 | 	// Some metadata fields can have multiple values (comma-separated)
276 | 	content := "Test content"
277 | 	filePath := createTestFile(t, "multi_meta_test.txt", content)
278 | 	defer os.Remove(filePath)
279 | 
280 | 	extractor := extractous.New()
281 | 	if extractor == nil {
282 | 		t.Fatal("Failed to create extractor")
283 | 	}
284 | 	defer extractor.Close()
285 | 
286 | 	_, metadata, err := extractor.ExtractFileToString(filePath)
287 | 	if err != nil {
288 | 		t.Fatalf("ExtractFileToString failed: %v", err)
289 | 	}
290 | 
291 | 	// Check if any metadata has multiple values
292 | 	for _, key := range metadata.Keys() {
293 | 		values := metadata.GetAll(key)
294 | 		if len(values) > 1 {
295 | 			t.Logf("Key '%s' has multiple values: %v", key, values)
296 | 		}
297 | 	}
298 | }
299 | 
300 | // ============================================================================
301 | // Error Handling Tests
302 | // ============================================================================
303 | 
304 | func TestIntegration_NonexistentFile(t *testing.T) {
305 | 	extractor := extractous.New()
306 | 	if extractor == nil {
307 | 		t.Fatal("Failed to create extractor")
308 | 	}
309 | 	defer extractor.Close()
310 | 
311 | 	_, _, err := extractor.ExtractFileToString("/nonexistent/file.txt")
312 | 	if err == nil {
313 | 		t.Error("Expected error for nonexistent file")
314 | 	}
315 | }
316 | 
317 | func TestIntegration_EmptyFile(t *testing.T) {
318 | 	filePath := createTestFile(t, "empty.txt", "")
319 | 	defer os.Remove(filePath)
320 | 
321 | 	extractor := extractous.New()
322 | 	if extractor == nil {
323 | 		t.Fatal("Failed to create extractor")
324 | 	}
325 | 	defer extractor.Close()
326 | 
327 | 	extracted, metadata, err := extractor.ExtractFileToString(filePath)
328 | 	if err != nil {
329 | 		t.Fatalf("ExtractFileToString failed: %v", err)
330 | 	}
331 | 
332 | 	if extracted != "" {
333 | 		t.Logf("Note: Empty file produced content: %s", extracted)
334 | 	}
335 | 
336 | 	if metadata == nil {
337 | 		t.Error("Expected non-nil metadata even for empty file")
338 | 	}
339 | }
340 | 
341 | // ============================================================================
342 | // Concurrency Tests
343 | // ============================================================================
344 | 
345 | func TestIntegration_ConcurrentExtraction(t *testing.T) {
346 | 	content := "Concurrent test content"
347 | 	filePath := createTestFile(t, "concurrent_test.txt", content)
348 | 	defer os.Remove(filePath)
349 | 
350 | 	const numGoroutines = 10
351 | 	errors := make(chan error, numGoroutines)
352 | 
353 | 	for i := range numGoroutines {
354 | 		go func(id int) {
355 | 			extractor := extractous.New()
356 | 			if extractor == nil {
357 | 				errors <- nil // Signal completion even on nil
358 | 				return
359 | 			}
360 | 			defer extractor.Close()
361 | 
362 | 			extracted, _, err := extractor.ExtractFileToString(filePath)
363 | 			if err != nil {
364 | 				errors <- err
365 | 				return
366 | 			}
367 | 
368 | 			if !strings.Contains(extracted, content) {
369 | 				errors <- nil
370 | 				return
371 | 			}
372 | 
373 | 			errors <- nil // Success
374 | 		}(i)
375 | 	}
376 | 
377 | 	// Wait for all goroutines
378 | 	for i := 0; i < numGoroutines; i++ {
379 | 		err := <-errors
380 | 		if err != nil {
381 | 			t.Errorf("Goroutine failed: %v", err)
382 | 		}
383 | 	}
384 | }
385 | 
386 | func TestIntegration_MultipleExtractorsSameFile(t *testing.T) {
387 | 	content := "Multiple extractors test"
388 | 	filePath := createTestFile(t, "multi_ext_test.txt", content)
389 | 	defer os.Remove(filePath)
390 | 
391 | 	extractors := make([]*extractous.Extractor, 5)
392 | 	for i := range extractors {
393 | 		extractors[i] = extractous.New()
394 | 		if extractors[i] == nil {
395 | 			t.Fatal("Failed to create extractor")
396 | 		}
397 | 		defer extractors[i].Close()
398 | 	}
399 | 
400 | 	// All extractors extract the same file
401 | 	for i, ext := range extractors {
402 | 		extracted, _, err := ext.ExtractFileToString(filePath)
403 | 		if err != nil {
404 | 			t.Errorf("Extractor %d failed: %v", i, err)
405 | 		}
406 | 		if !strings.Contains(extracted, content) {
407 | 			t.Errorf("Extractor %d didn't extract correct content", i)
408 | 		}
409 | 	}
410 | }
411 | 
412 | // ============================================================================
413 | // Helper Functions
414 | // ============================================================================
415 | 
// min reports the lesser of two ints; when they are equal, that shared value
// is returned.
func min(a, b int) int {
	smallest := b
	if a < b {
		smallest = a
	}
	return smallest
}
422 | 


--------------------------------------------------------------------------------
/extractous.h:
--------------------------------------------------------------------------------
  1 | /* 
  2 |  * Extractous FFI - C Interface
  3 |  * 
  4 |  * This header file provides a C-compatible interface to the Extractous
  5 |  * document extraction library. It is safe for use with Go via cgo or any
  6 |  * C-compatible FFI system.
  7 |  *
  8 |  * License: Apache-2.0
 * Repository: https://github.com/rahulpoonia29/extractous-go
 10 |  *
 11 |  * MEMORY MANAGEMENT:
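 * Every resource returned by an Extractous function must be released with its
 * matching free function (for example, strings with extractous_string_free and
 * metadata with extractous_metadata_free), unless the function's documentation
 * says the result must not be freed. Failure to do so will result in memory leaks.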
 14 |  *
 15 |  *
 16 |  * CGO USAGE:
 17 |  *   // #cgo CFLAGS: -I${SRCDIR}/include
 18 |  *   // #cgo LDFLAGS: -L${SRCDIR}/lib -lextractous_ffi
 19 |  *   // #cgo linux LDFLAGS: -Wl,-rpath,$ORIGIN
 20 |  *   // #cgo darwin LDFLAGS: -Wl,-rpath,@loader_path
 21 |  *   // #include "extractous.h"
 22 |  *   import "C"
 23 |  */
 24 | 
 25 | 
 26 | #ifndef EXTRACTOUS_H
 27 | #define EXTRACTOUS_H
 28 | 
 29 | #include <stdarg.h>
 30 | #include <stdbool.h>
 31 | #include <stdint.h>
 32 | #include <stdlib.h>
 33 | 
 34 | #define ERR_OK 0
 35 | 
 36 | #define ERR_NULL_POINTER -1
 37 | 
 38 | #define ERR_INVALID_UTF8 -2
 39 | 
 40 | #define ERR_INVALID_STRING -3
 41 | 
 42 | #define ERR_EXTRACTION_FAILED -4
 43 | 
 44 | #define ERR_IO_ERROR -5
 45 | 
 46 | #define ERR_INVALID_CONFIG -6
 47 | 
 48 | #define ERR_INVALID_ENUM -7
 49 | 
 50 | #define ERR_UNSUPPORTED_FORMAT -8
 51 | 
 52 | #define ERR_OUT_OF_MEMORY -9
 53 | 
 54 | #define ERR_OCR_FAILED -10
 55 | 
 56 | #define CHARSET_UTF_8 0
 57 | 
 58 | #define CHARSET_US_ASCII 1
 59 | 
 60 | #define CHARSET_UTF_16BE 3
 61 | 
 62 | #define PDF_OCR_STRATEGY_NO_OCR 0
 63 | 
 64 | #define PDF_OCR_STRATEGY_OCR_ONLY 1
 65 | 
 66 | #define PDF_OCR_STRATEGY_OCR_AND_TEXT_EXTRACTION 2
 67 | 
 68 | #define PDF_OCR_STRATEGY_AUTO 3
 69 | 
 70 | typedef struct CPdfParserConfig {
 71 |   uint8_t _private[0];
 72 | } CPdfParserConfig;
 73 | 
 74 | typedef struct COfficeParserConfig {
 75 |   uint8_t _private[0];
 76 | } COfficeParserConfig;
 77 | 
 78 | typedef struct CTesseractOcrConfig {
 79 |   uint8_t _private[0];
 80 | } CTesseractOcrConfig;
 81 | 
 82 | typedef struct CExtractor {
 83 |   uint8_t _private[0];
 84 | } CExtractor;
 85 | 
 86 | typedef struct CMetadata {
 87 |   /*
 88 |    Array of pointers to null-terminated key strings
 89 |    */
 90 |   char **keys;
 91 |   /*
 92 |    Array of pointers to null-terminated value strings
 93 |    */
 94 |   char **values;
 95 |   /*
 96 |    The number of key-value pairs in the arrays
 97 |    */
 98 |   size_t len;
 99 | } CMetadata;
100 | 
101 | typedef struct CStreamReader {
102 |   uint8_t _private[0];
103 | } CStreamReader;
104 | 
105 | /*
106 |  Returns the FFI wrapper version as a null-terminated UTF-8 string.
107 |  The returned pointer is to a static string and must not be freed.
108 |  */
109 | const char *extractous_ffi_version(void);
110 | 
111 | /*
112 |  Returns the underlying Extractous core library version.
113 |  The returned pointer is to a static string and must not be freed.
114 |  */
115 | const char *extractous_core_version(void);
116 | 
117 | /*
118 |  Creates a new PDF parser configuration with default settings.
119 |  The returned handle must be freed with `extractous_pdf_config_free()`
120 |  unless passed to an extractor, which will take ownership.
121 |  */
122 | struct CPdfParserConfig *extractous_pdf_config_new(void);
123 | 
124 | /*
125 |  Frees the memory associated with a PDF parser configuration.
126 |  Do not call this if the config has been attached to an extractor.
127 |  */
128 | void extractous_pdf_config_free(struct CPdfParserConfig *handle);
129 | 
130 | /*
131 |  Sets the OCR strategy for PDF parsing. Modifies the config in-place.
132 |  */
133 | void extractous_pdf_config_set_ocr_strategy(struct CPdfParserConfig *handle, int strategy);
134 | 
135 | /*
136 |  Enables or disables extraction of inline images. Modifies the config in-place.
137 |  */
138 | void extractous_pdf_config_set_extract_inline_images(struct CPdfParserConfig *handle, bool value);
139 | 
140 | /*
141 |  If enabled, only unique inline images (by digest) will be extracted.
142 |  */
143 | void extractous_pdf_config_set_extract_unique_inline_images_only(struct CPdfParserConfig *handle,
144 |                                                                  bool value);
145 | 
146 | /*
147 |  Enables or disables extraction of text from marked content sections.
148 |  */
149 | void extractous_pdf_config_set_extract_marked_content(struct CPdfParserConfig *handle, bool value);
150 | 
151 | /*
152 |  Enables or disables extraction of text from annotations.
153 |  */
154 | void extractous_pdf_config_set_extract_annotation_text(struct CPdfParserConfig *handle, bool value);
155 | 
156 | /*
157 |  Creates a new Office parser configuration with default settings.
158 |  */
159 | struct COfficeParserConfig *extractous_office_config_new(void);
160 | 
161 | /*
162 |  Frees the memory associated with an Office parser configuration.
163 |  */
164 | void extractous_office_config_free(struct COfficeParserConfig *handle);
165 | 
166 | /*
167 |  Enables or disables macro extraction. Modifies the config in-place.
168 |  */
169 | void extractous_office_config_set_extract_macros(struct COfficeParserConfig *handle, bool value);
170 | 
171 | /*
172 |  Enables or disables inclusion of deleted content (track changes).
173 |  */
174 | void extractous_office_config_set_include_deleted_content(struct COfficeParserConfig *handle,
175 |                                                           bool value);
176 | 
177 | /*
178 |  Enables or disables inclusion of moved-from content (track changes).
179 |  */
180 | void extractous_office_config_set_include_move_from_content(struct COfficeParserConfig *handle,
181 |                                                             bool value);
182 | 
183 | /*
184 |  Enables or disables inclusion of content from shapes.
185 |  */
186 | void extractous_office_config_set_include_shape_based_content(struct COfficeParserConfig *handle,
187 |                                                               bool value);
188 | 
189 | /*
190 |  Creates a new Tesseract OCR configuration with default settings.
191 |  */
192 | struct CTesseractOcrConfig *extractous_ocr_config_new(void);
193 | 
194 | /*
195 |  Frees the memory associated with a Tesseract OCR configuration.
196 |  */
197 | void extractous_ocr_config_free(struct CTesseractOcrConfig *handle);
198 | 
199 | /*
200 |  Sets the OCR language. Modifies the config in-place.
201 |  */
202 | void extractous_ocr_config_set_language(struct CTesseractOcrConfig *handle, const char *language);
203 | 
204 | /*
205 |  Sets the DPI for OCR processing. Modifies the config in-place.
206 |  */
207 | void extractous_ocr_config_set_density(struct CTesseractOcrConfig *handle, int32_t density);
208 | 
209 | /*
210 |  Sets the bit depth for OCR processing.
211 |  */
212 | void extractous_ocr_config_set_depth(struct CTesseractOcrConfig *handle, int32_t depth);
213 | 
214 | /*
215 |  Enables or disables image preprocessing for OCR.
216 |  */
217 | void extractous_ocr_config_set_enable_image_preprocessing(struct CTesseractOcrConfig *handle,
218 |                                                           bool value);
219 | 
220 | /*
221 |  Sets the timeout for the Tesseract process in seconds.
222 |  */
223 | void extractous_ocr_config_set_timeout_seconds(struct CTesseractOcrConfig *handle, int32_t seconds);
224 | 
225 | char *extractous_error_message(int code);
226 | 
227 | /*
228 |  Retrieves a detailed debug report for the last error on this thread,
229 |  including the full error chain and a backtrace if RUST_BACKTRACE=1.
230 |  */
231 | char *extractous_error_get_last_debug(void);
232 | 
233 | /*
234 |  Checks if debug information is available for the current thread
235 |  */
236 | int extractous_error_has_debug(void);
237 | 
238 | void extractous_error_clear_last(void);
239 | 
240 | /*
241 |  Creates a new `Extractor` with a default configuration.
242 |  The returned handle must be freed with `extractous_extractor_free`.
243 |  */
244 | struct CExtractor *extractous_extractor_new(void);
245 | 
246 | /*
247 |  Frees the memory associated with an `Extractor` handle.
248 |  */
249 | void extractous_extractor_free(struct CExtractor *handle);
250 | 
251 | /*
252 |  Sets the maximum length for extracted string content.
253 |  */
254 | void extractous_extractor_set_extract_string_max_length_mut(struct CExtractor *handle,
255 |                                                             int max_length);
256 | 
257 | /*
258 |  Sets the character encoding for the extracted text.
259 |  */
260 | void extractous_extractor_set_encoding_mut(struct CExtractor *handle, int encoding);
261 | 
262 | /*
263 |  Sets the configuration for the PDF parser.
264 |  */
265 | void extractous_extractor_set_pdf_config_mut(struct CExtractor *handle,
266 |                                              const struct CPdfParserConfig *config);
267 | 
268 | /*
269 |  Sets the configuration for the Office document parser.
270 |  */
271 | void extractous_extractor_set_office_config_mut(struct CExtractor *handle,
272 |                                                 const struct COfficeParserConfig *config);
273 | 
274 | /*
275 |  Sets the configuration for Tesseract OCR.
276 |  */
277 | void extractous_extractor_set_ocr_config_mut(struct CExtractor *handle,
278 |                                              const struct CTesseractOcrConfig *config);
279 | 
280 | /*
281 |  Sets whether to output structured XML instead of plain text.
282 |  */
283 | void extractous_extractor_set_xml_output_mut(struct CExtractor *handle, bool xml_output);
284 | 
285 | /*
286 |  Extracts content and metadata from a local file path into a string.
287 | 
288 |  Output strings must be freed with `extractous_string_free`.
289 |  Output metadata must be freed with `extractous_metadata_free`.
290 |  */
291 | int extractous_extractor_extract_file_to_string(struct CExtractor *handle,
292 |                                                 const char *path,
293 |                                                 char **out_content,
294 |                                                 struct CMetadata **out_metadata);
295 | 
296 | /*
297 |  Extracts content and metadata from a local file path into a stream.
298 |  */
299 | int extractous_extractor_extract_file(struct CExtractor *handle,
300 |                                       const char *path,
301 |                                       struct CStreamReader **out_reader,
302 |                                       struct CMetadata **out_metadata);
303 | 
304 | /*
305 |  Extracts content and metadata from a byte slice into a string.
306 |  */
307 | int extractous_extractor_extract_bytes_to_string(struct CExtractor *handle,
308 |                                                  const uint8_t *data,
309 |                                                  size_t data_len,
310 |                                                  char **out_content,
311 |                                                  struct CMetadata **out_metadata);
312 | 
313 | /*
314 |  Extracts content and metadata from a byte slice into a stream.
315 |  */
316 | int extractous_extractor_extract_bytes(struct CExtractor *handle,
317 |                                        const uint8_t *data,
318 |                                        size_t data_len,
319 |                                        struct CStreamReader **out_reader,
320 |                                        struct CMetadata **out_metadata);
321 | 
322 | /*
323 |  Extracts content and metadata from a URL into a string.
324 |  */
325 | int extractous_extractor_extract_url_to_string(struct CExtractor *handle,
326 |                                                const char *url,
327 |                                                char **out_content,
328 |                                                struct CMetadata **out_metadata);
329 | 
330 | /*
331 |  Extracts content and metadata from a URL into a stream.
332 |  */
333 | int extractous_extractor_extract_url(struct CExtractor *handle,
334 |                                      const char *url,
335 |                                      struct CStreamReader **out_reader,
336 |                                      struct CMetadata **out_metadata);
337 | 
338 | /*
339 |  Frees a C-style string that was allocated by this library.
340 |  */
341 | void extractous_string_free(char *s);
342 | 
343 | /*
344 |  Frees a metadata structure and all associated memory.
345 |  */
346 | void extractous_metadata_free(struct CMetadata *metadata);
347 | 
348 | /*
349 |  Reads data from a stream into a user-provided buffer.
350 | 
351 |  Returns the actual number of bytes read via the `bytes_read` output parameter.
352 |  Reaching the end of the stream is indicated by `ERR_OK` and `*bytes_read == 0`.
353 |  */
354 | int extractous_stream_read(struct CStreamReader *handle,
355 |                            uint8_t *buffer,
356 |                            size_t buffer_size,
357 |                            size_t *bytes_read);
358 | 
359 | /*
360 |  Reads exactly `buffer_size` bytes from the stream.
361 | 
362 |  Function will continue reading until the buffer is full, or the end of
363 |  the stream is reached, or an error occurs.
364 |  */
365 | int extractous_stream_read_exact(struct CStreamReader *handle,
366 |                                  uint8_t *buffer,
367 |                                  size_t buffer_size,
368 |                                  size_t *bytes_read);
369 | 
370 | /*
371 |  Reads the remaining stream into a newly allocated buffer.
372 |  */
373 | int extractous_stream_read_all(struct CStreamReader *handle,
374 |                                uint8_t **out_buffer,
375 |                                size_t *out_size);
376 | 
377 | /*
378 |  Frees a buffer allocated by `extractous_stream_read_all`.
379 |  */
380 | void extractous_buffer_free(uint8_t *buffer, size_t size);
381 | 
382 | /*
383 |  Frees a stream reader and releases its resources.
384 |  */
385 | void extractous_stream_free(struct CStreamReader *handle);
386 | 
387 | #endif  /* EXTRACTOUS_H */
388 | 


--------------------------------------------------------------------------------
/cmd/install/main.go:
--------------------------------------------------------------------------------
  1 | // go run github.com/rahulpoonia29/extractous-go/cmd/install@latest
  2 | package main
  3 | 
  4 | import (
  5 | 	"archive/tar"
  6 | 	"archive/zip"
  7 | 	"compress/gzip"
  8 | 	"context"
  9 | 	"encoding/json"
 10 | 	"flag"
 11 | 	"fmt"
 12 | 	"io"
 13 | 	"log"
 14 | 	"math"
 15 | 	"net/http"
 16 | 	"os"
 17 | 	"path/filepath"
 18 | 	"runtime"
 19 | 	"sort"
 20 | 	"strconv"
 21 | 	"strings"
 22 | 	"time"
 23 | 
 24 | 	"github.com/schollz/progressbar/v3"
 25 | )
 26 | 
 27 | const (
 28 | 	repoOwner = "rahulpoonia29"
 29 | 	repoName  = "extractous-go"
 30 | 	nativeDir = "native"
 31 | )
 32 | 
 33 | type platformList []string
 34 | 
 35 | func (p *platformList) String() string {
 36 | 	return strings.Join(*p, ", ")
 37 | }
 38 | 
 39 | func (p *platformList) Set(value string) error {
 40 | 	*p = append(*p, value)
 41 | 	return nil
 42 | }
 43 | 
 44 | var (
 45 | 	verbose bool
 46 | 	client  = http.DefaultClient
 47 | )
 48 | 
 49 | func main() {
 50 | 	var platforms platformList
 51 | 	var listPlatforms, downloadAll bool
 52 | 
 53 | 	flag.Var(&platforms, "platform", "Specify a platform to download (e.g., linux_amd64). Can be used multiple times.")
 54 | 	flag.BoolVar(&listPlatforms, "list", false, "List available platforms from the latest release and exit.")
 55 | 	flag.BoolVar(&downloadAll, "all", false, "Download all available platforms from the latest release.")
 56 | 	flag.BoolVar(&verbose, "v", false, "Verbose logging")
 57 | 	flag.Parse()
 58 | 
 59 | 	// use logging for errors and info (timestamps)
 60 | 	log.SetFlags(0) // keep messages clean
 61 | 	infof("Fetching Extractous FFI release information from GitHub...")
 62 | 
 63 | 	availablePlatforms, err := getAvailablePlatforms()
 64 | 	if err != nil {
 65 | 		fatalf("Error retrieving available platforms: %v", err)
 66 | 	}
 67 | 
 68 | 	if listPlatforms {
 69 | 		printAvailablePlatforms(availablePlatforms)
 70 | 		return
 71 | 	}
 72 | 
 73 | 	platformsToDownload := determinePlatformsToDownload(platforms, downloadAll, availablePlatforms)
 74 | 	if len(platformsToDownload) == 0 {
 75 | 		infof("No platforms selected for download.")
 76 | 		infof("Available platforms (run with --list to view):")
 77 | 		printAvailablePlatforms(availablePlatforms)
 78 | 		infof("To install for this machine run without flags, or pass --platform for the platform you want.")
 79 | 		return
 80 | 	}
 81 | 
 82 | 	infof("Platforms selected for installation: %s", strings.Join(platformsToDownload, ", "))
 83 | 
 84 | 	for _, platform := range platformsToDownload {
 85 | 		archiveURL, ok := availablePlatforms[platform]
 86 | 		if !ok {
 87 | 			log.Printf("Warning: Platform '%s' not found in latest release. Skipping.", platform)
 88 | 			continue
 89 | 		}
 90 | 
 91 | 		infof("Downloading release for platform: %s", platform)
 92 | 
 93 | 		archivePath, err := downloadFileWithRetries(archiveURL, 3)
 94 | 		if err != nil {
 95 | 			fatalf("Failed to download asset for %s: %v", platform, err)
 96 | 		}
 97 | 		// ensure cleanup of downloaded archive
 98 | 		defer os.Remove(archivePath)
 99 | 
100 | 		archiveFormat := "tar.gz"
101 | 		if strings.HasSuffix(archiveURL, ".zip") {
102 | 			archiveFormat = "zip"
103 | 		}
104 | 
105 | 		if err := extractArchive(archivePath, nativeDir, platform, archiveFormat); err != nil {
106 | 			// attempt cleanup of partial extraction
107 | 			destPath := filepath.Join(nativeDir, platform)
108 | 			_ = os.RemoveAll(destPath)
109 | 			fatalf("Failed to extract archive for %s: %v", platform, err)
110 | 		}
111 | 		infof("Libraries for %s extracted to ./%s/%s", platform, nativeDir, platform)
112 | 	}
113 | 
114 | 	infof("Installation completed successfully.")
115 | }
116 | 
// infof formats a message, terminates it with a newline, and writes it to
// standard output.
func infof(format string, args ...any) {
	line := fmt.Sprintf(format+"\n", args...)
	fmt.Print(line)
}
120 | 
// fatalf reports a formatted error through the standard logger and then
// terminates the process (log.Fatalf exits with status 1, so deferred
// functions in the caller do not run).
func fatalf(format string, args ...any) {
	log.Fatalf(format, args...)
}
124 | 
// printAvailablePlatforms writes the platform names (the map keys) to
// standard output as an alphabetically sorted bullet list, or a placeholder
// line when the map is empty.
func printAvailablePlatforms(platforms map[string]string) {
	count := len(platforms)
	if count == 0 {
		fmt.Println("  (no platforms found)")
		return
	}

	sorted := make([]string, count)
	idx := 0
	for key := range platforms {
		sorted[idx] = key
		idx++
	}
	// Map iteration order is random; sort for stable output.
	sort.Strings(sorted)

	for i := 0; i < count; i++ {
		fmt.Printf("  - %s\n", sorted[i])
	}
}
139 | 
140 | func determinePlatformsToDownload(platforms platformList, downloadAll bool, availablePlatforms map[string]string) []string {
141 | 	if downloadAll {
142 | 		keys := make([]string, 0, len(availablePlatforms))
143 | 		for k := range availablePlatforms {
144 | 			keys = append(keys, k)
145 | 		}
146 | 		sort.Strings(keys)
147 | 		return keys
148 | 	}
149 | 
150 | 	if len(platforms) > 0 {
151 | 		return platforms
152 | 	}
153 | 
154 | 	currentPlatform, _ := getPlatformAndFormat()
155 | 	if _, ok := availablePlatforms[currentPlatform]; ok {
156 | 		return []string{currentPlatform}
157 | 	}
158 | 
159 | 	// not found for current platform
160 | 	return []string{}
161 | }
162 | 
163 | func getAvailablePlatforms() (map[string]string, error) {
164 | 	apiURL := fmt.Sprintf("https://api.github.com/repos/%s/%s/releases/latest", repoOwner, repoName)
165 | 
166 | 	var resp *http.Response
167 | 	var err error
168 | 
169 | 	// simple retry here too
170 | 	for attempt := 0; attempt < 3; attempt++ {
171 | 		ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
172 | 		defer cancel()
173 | 		req, _ := http.NewRequestWithContext(ctx, http.MethodGet, apiURL, nil)
174 | 		req.Header.Set("Accept", "application/vnd.github.v3+json")
175 | 		resp, err = client.Do(req)
176 | 		if err == nil {
177 | 			break
178 | 		}
179 | 		wait := time.Duration(math.Pow(2, float64(attempt))) * time.Second
180 | 		time.Sleep(wait)
181 | 	}
182 | 	if err != nil {
183 | 		return nil, fmt.Errorf("failed to query GitHub API: %w", err)
184 | 	}
185 | 	defer resp.Body.Close()
186 | 
187 | 	if resp.StatusCode == http.StatusForbidden {
188 | 		remaining := resp.Header.Get("X-RateLimit-Remaining")
189 | 		reset := resp.Header.Get("X-RateLimit-Reset")
190 | 		if remaining == "0" && reset != "" {
191 | 			// parse reset as unix timestamp
192 | 			if ts, err := strconv.ParseInt(reset, 10, 64); err == nil {
193 | 				resetTime := time.Unix(ts, 0).Local()
194 | 				// Round to nearest minute for nicer display
195 | 				duration := max(time.Until(resetTime), 0)
196 | 				humanWait := fmt.Sprintf("about %d min", int(duration.Minutes()+0.5))
197 | 
198 | 				return nil, fmt.Errorf(
199 | 					"GitHub API rate limit exceeded.\nLimit resets at: %s (%s from now)\nTip: set a personal access token to increase your limit",
200 | 					resetTime.Format("Mon 2 15:04 MST"),
201 | 					humanWait,
202 | 				)
203 | 			}
204 | 		}
205 | 		return nil, fmt.Errorf("access forbidden from GitHub API: %s", resp.Status)
206 | 	}
207 | 
208 | 	if resp.StatusCode != http.StatusOK {
209 | 		return nil, fmt.Errorf("unexpected status from GitHub API: %s", resp.Status)
210 | 	}
211 | 
212 | 	var releaseInfo struct {
213 | 		Assets []struct {
214 | 			Name        string `json:"name"`
215 | 			DownloadURL string `json:"browser_download_url"`
216 | 		} `json:"assets"`
217 | 	}
218 | 
219 | 	if err := json.NewDecoder(resp.Body).Decode(&releaseInfo); err != nil {
220 | 		return nil, fmt.Errorf("failed to decode GitHub release info: %w", err)
221 | 	}
222 | 
223 | 	platforms := make(map[string]string)
224 | 	for _, asset := range releaseInfo.Assets {
225 | 		if !strings.HasPrefix(asset.Name, "extractous-ffi-") {
226 | 			continue
227 | 		}
228 | 		// skip checksum assets like .sha256
229 | 		if strings.HasSuffix(asset.Name, ".sha256") || strings.HasSuffix(asset.Name, ".sha256.txt") {
230 | 			if verbose {
231 | 				log.Printf("Skipping checksum asset: %s", asset.Name)
232 | 			}
233 | 			continue
234 | 		}
235 | 		after := strings.TrimPrefix(asset.Name, "extractous-ffi-")
236 | 		name := strings.TrimSuffix(after, ".zip")
237 | 		name = strings.TrimSuffix(name, ".tar.gz")
238 | 		name = strings.TrimSuffix(name, ".tgz")
239 | 		platforms[name] = asset.DownloadURL
240 | 	}
241 | 
242 | 	if len(platforms) == 0 {
243 | 		return nil, fmt.Errorf("no compatible FFI assets found in the latest release")
244 | 	}
245 | 
246 | 	return platforms, nil
247 | }
248 | 
249 | func getPlatformAndFormat() (platform, format string) {
250 | 	goos := runtime.GOOS
251 | 	goarch := runtime.GOARCH
252 | 
253 | 	switch goos {
254 | 	case "linux":
255 | 		return fmt.Sprintf("linux_%s", goarch), "tar.gz"
256 | 	case "darwin":
257 | 		return fmt.Sprintf("darwin_%s", goarch), "tar.gz"
258 | 	case "windows":
259 | 		return fmt.Sprintf("windows_%s", goarch), "zip"
260 | 	default:
261 | 		fatalf("Unsupported operating system: %s", goos)
262 | 		return "", ""
263 | 	}
264 | }
265 | 
266 | // downloadFileWithRetries will try a few times and show a progress bar.
267 | func downloadFileWithRetries(url string, attempts int) (string, error) {
268 | 	var lastErr error
269 | 	for i := 1; i <= attempts; i++ {
270 | 		if i > 1 {
271 | 			// backoff
272 | 			backoff := time.Duration(i*i) * time.Second
273 | 			if verbose {
274 | 				log.Printf("Retrying in %s...", backoff)
275 | 			}
276 | 			time.Sleep(backoff)
277 | 		}
278 | 		path, err := downloadFile(url)
279 | 		if err == nil {
280 | 			return path, nil
281 | 		}
282 | 		lastErr = err
283 | 		if verbose {
284 | 			log.Printf("Attempt %d/%d failed: %v", i, attempts, err)
285 | 		}
286 | 	}
287 | 	return "", fmt.Errorf("download failed after %d attempts: %w", attempts, lastErr)
288 | }
289 | 
290 | func downloadFile(url string) (string, error) {
291 | 	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
292 | 	defer cancel()
293 | 	req, _ := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
294 | 	resp, err := client.Do(req)
295 | 	if err != nil {
296 | 		return "", err
297 | 	}
298 | 	defer resp.Body.Close()
299 | 
300 | 	if resp.StatusCode != http.StatusOK {
301 | 		return "", fmt.Errorf("bad status: %s", resp.Status)
302 | 	}
303 | 
304 | 	tmpFile, err := os.CreateTemp("", "extractous-*.download")
305 | 	if err != nil {
306 | 		return "", err
307 | 	}
308 | 	defer tmpFile.Close()
309 | 
310 | 	bar := progressbar.NewOptions64(
311 | 		resp.ContentLength,
312 | 		progressbar.OptionSetWidth(30),
313 | 		progressbar.OptionShowBytes(true),
314 | 		progressbar.OptionSetDescription("Downloading"),
315 | 		progressbar.OptionShowCount(),
316 | 		progressbar.OptionShowElapsedTimeOnFinish(),
317 | 		progressbar.OptionSetTheme(progressbar.Theme{
318 | 			Saucer:        "=",
319 | 			SaucerHead:    ">",
320 | 			SaucerPadding: " ",
321 | 			BarStart:      "[",
322 | 			BarEnd:        "]",
323 | 		}),
324 | 	)
325 | 	if _, err = io.Copy(io.MultiWriter(tmpFile, bar), resp.Body); err != nil {
326 | 		return "", err
327 | 	}
328 | 	println("")
329 | 
330 | 	return tmpFile.Name(), nil
331 | }
332 | 
333 | func extractArchive(src, dest, platform, format string) error {
334 | 	destPath := filepath.Join(dest, platform)
335 | 	if err := os.MkdirAll(destPath, 0o755); err != nil {
336 | 		return err
337 | 	}
338 | 
339 | 	switch format {
340 | 	case "zip":
341 | 		return unzip(src, destPath)
342 | 	case "tar.gz":
343 | 		return untar(src, destPath)
344 | 	default:
345 | 		return fmt.Errorf("unsupported archive format: %s", format)
346 | 	}
347 | }
348 | 
349 | // prevent zip-slip and path traversal by resolving absolute paths
// safeJoin resolves the archive entry name relative to dest and returns the
// absolute result, rejecting any entry that would land outside dest
// (zip-slip / path traversal). Backslashes in name are treated as path
// separators before cleaning.
func safeJoin(dest, name string) (string, error) {
	root, err := filepath.Abs(dest)
	if err != nil {
		return "", err
	}

	sep := string(os.PathSeparator)
	normalized := filepath.Clean(strings.ReplaceAll(name, "\\", sep))

	candidate, err := filepath.Abs(filepath.Join(root, normalized))
	if err != nil {
		return "", err
	}

	// Accept the destination directory itself or anything beneath it.
	inside := candidate == root || strings.HasPrefix(candidate, root+sep)
	if !inside {
		return "", fmt.Errorf("illegal file path outside destination: %s", name)
	}
	return candidate, nil
}
367 | 
368 | func unzip(src, dest string) error {
369 | 	r, err := zip.OpenReader(src)
370 | 	if err != nil {
371 | 		return err
372 | 	}
373 | 	defer r.Close()
374 | 
375 | 	for _, f := range r.File {
376 | 		// use forward slashes in zip entries; convert for local FS
377 | 		fname := filepath.FromSlash(f.Name)
378 | 		targetPath, err := safeJoin(dest, fname)
379 | 		if err != nil {
380 | 			return err
381 | 		}
382 | 
383 | 		if f.FileInfo().IsDir() {
384 | 			if err := os.MkdirAll(targetPath, f.Mode()); err != nil {
385 | 				return err
386 | 			}
387 | 			continue
388 | 		}
389 | 
390 | 		if err := os.MkdirAll(filepath.Dir(targetPath), 0o755); err != nil {
391 | 			return err
392 | 		}
393 | 
394 | 		outFile, err := os.OpenFile(targetPath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, f.Mode())
395 | 		if err != nil {
396 | 			return err
397 | 		}
398 | 
399 | 		rc, err := f.Open()
400 | 		if err != nil {
401 | 			outFile.Close()
402 | 			return err
403 | 		}
404 | 
405 | 		if _, err = io.Copy(outFile, rc); err != nil {
406 | 			outFile.Close()
407 | 			rc.Close()
408 | 			return err
409 | 		}
410 | 
411 | 		outFile.Close()
412 | 		rc.Close()
413 | 	}
414 | 	return nil
415 | }
416 | 
417 | func untar(src, dest string) error {
418 | 	file, err := os.Open(src)
419 | 	if err != nil {
420 | 		return err
421 | 	}
422 | 	defer file.Close()
423 | 
424 | 	gzr, err := gzip.NewReader(file)
425 | 	if err != nil {
426 | 		return err
427 | 	}
428 | 	defer gzr.Close()
429 | 
430 | 	tr := tar.NewReader(gzr)
431 | 
432 | 	for {
433 | 		header, err := tr.Next()
434 | 		if err == io.EOF {
435 | 			return nil
436 | 		}
437 | 		if err != nil {
438 | 			return err
439 | 		}
440 | 
441 | 		// Clean header name to avoid path traversal
442 | 		name := header.Name
443 | 		if name == "" {
444 | 			continue
445 | 		}
446 | 		targetPath, err := safeJoin(dest, name)
447 | 		if err != nil {
448 | 			return err
449 | 		}
450 | 
451 | 		switch header.Typeflag {
452 | 		case tar.TypeDir:
453 | 			if err := os.MkdirAll(targetPath, 0o755); err != nil {
454 | 				return err
455 | 			}
456 | 		case tar.TypeReg, tar.TypeRegA:
457 | 			if err := os.MkdirAll(filepath.Dir(targetPath), 0o755); err != nil {
458 | 				return err
459 | 			}
460 | 			outFile, err := os.OpenFile(targetPath, os.O_CREATE|os.O_RDWR|os.O_TRUNC, os.FileMode(header.Mode))
461 | 			if err != nil {
462 | 				return err
463 | 			}
464 | 			if _, err := io.Copy(outFile, tr); err != nil {
465 | 				outFile.Close()
466 | 				return err
467 | 			}
468 | 			outFile.Close()
469 | 		case tar.TypeSymlink, tar.TypeLink:
470 | 			// skip symlinks for safety
471 | 			if verbose {
472 | 				log.Printf("Skipping symlink: %s", header.Name)
473 | 			}
474 | 		default:
475 | 			if verbose {
476 | 				log.Printf("Skipping unknown tar entry type %c for %s", header.Typeflag, header.Name)
477 | 			}
478 | 		}
479 | 	}
480 | }
481 | 


--------------------------------------------------------------------------------
/tests/ffi/test_ffi_interface.c:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * FFI Layer Tests for Extractous
  3 |  * 
  4 |  * These tests verify that the C FFI interface works correctly
  5 |  * and all functions are properly exposed from the Rust library.
  6 |  */
  7 | 
  8 | #include <stdio.h>
  9 | #include <stdlib.h>
 10 | #include <string.h>
 11 | #include <assert.h>
 12 | #include "../../include/extractous.h"
 13 | 
 14 | // Test result tracking
 15 | static int tests_run = 0;
 16 | static int tests_passed = 0;
 17 | static int tests_failed = 0;
 18 | 
 19 | // Color codes for output
 20 | #define COLOR_GREEN "\x1b[32m"
 21 | #define COLOR_RED "\x1b[31m"
 22 | #define COLOR_YELLOW "\x1b[33m"
 23 | #define COLOR_RESET "\x1b[0m"
 24 | 
 25 | // Test macros
 26 | #define TEST(name) \
 27 |     void test_##name(); \
 28 |     void run_test_##name() { \
 29 |         tests_run++; \
 30 |         printf("[ RUN  ] %s\n", #name); \
 31 |         test_##name(); \
 32 |         tests_passed++; \
 33 |         printf(COLOR_GREEN "[  OK  ] %s\n" COLOR_RESET, #name); \
 34 |     } \
 35 |     void test_##name()
 36 | 
 37 | #define ASSERT_NOT_NULL(ptr, msg) \
 38 |     if (ptr == NULL) { \
 39 |         printf(COLOR_RED "[FAILED] %s: %s is NULL\n" COLOR_RESET, __func__, msg); \
 40 |         tests_failed++; \
 41 |         tests_passed--; \
 42 |         return; \
 43 |     }
 44 | 
 45 | #define ASSERT_NULL(ptr, msg) \
 46 |     if (ptr != NULL) { \
 47 |         printf(COLOR_RED "[FAILED] %s: %s is not NULL\n" COLOR_RESET, __func__, msg); \
 48 |         tests_failed++; \
 49 |         tests_passed--; \
 50 |         return; \
 51 |     }
 52 | 
 53 | #define ASSERT_EQ(expected, actual, msg) \
 54 |     if (expected != actual) { \
 55 |         printf(COLOR_RED "[FAILED] %s: %s - expected %d, got %d\n" COLOR_RESET, \
 56 |                __func__, msg, expected, actual); \
 57 |         tests_failed++; \
 58 |         tests_passed--; \
 59 |         return; \
 60 |     }
 61 | 
 62 | #define ASSERT_TRUE(condition, msg) \
 63 |     if (!(condition)) { \
 64 |         printf(COLOR_RED "[FAILED] %s: %s\n" COLOR_RESET, __func__, msg); \
 65 |         tests_failed++; \
 66 |         tests_passed--; \
 67 |         return; \
 68 |     }
 69 | 
 70 | // ============================================================================
 71 | // Test: Extractor Lifecycle
 72 | // ============================================================================
 73 | 
 74 | TEST(extractor_new) {
 75 |     struct CExtractor *extractor = extractous_extractor_new();
 76 |     ASSERT_NOT_NULL(extractor, "extractor");
 77 |     extractous_extractor_free(extractor);
 78 | }
 79 | 
 80 | TEST(extractor_free_null) {
 81 |     // Should not crash
 82 |     extractous_extractor_free(NULL);
 83 | }
 84 | 
 85 | TEST(extractor_double_free) {
 86 |     struct CExtractor *extractor = extractous_extractor_new();
 87 |     ASSERT_NOT_NULL(extractor, "extractor");
 88 |     extractous_extractor_free(extractor);
 89 |     // Second free on same pointer would cause issues in real code
 90 |     // but this test just verifies it doesn't crash the suite
 91 | }
 92 | 
 93 | // ============================================================================
 94 | // Test: Configuration Functions
 95 | // ============================================================================
 96 | 
 97 | TEST(extractor_set_max_length) {
 98 |     struct CExtractor *extractor = extractous_extractor_new();
 99 |     ASSERT_NOT_NULL(extractor, "extractor");
100 |     
101 |     struct CExtractor *new_extractor = extractous_extractor_set_extract_string_max_length(
102 |         extractor, 10000
103 |     );
104 |     ASSERT_NOT_NULL(new_extractor, "new_extractor");
105 |     
106 |     extractous_extractor_free(new_extractor);
107 | }
108 | 
109 | TEST(extractor_set_encoding) {
110 |     struct CExtractor *extractor = extractous_extractor_new();
111 |     ASSERT_NOT_NULL(extractor, "extractor");
112 |     
113 |     struct CExtractor *new_extractor = extractous_extractor_set_encoding(
114 |         extractor, CHARSET_UTF_8
115 |     );
116 |     ASSERT_NOT_NULL(new_extractor, "new_extractor with UTF-8");
117 |     
118 |     extractous_extractor_free(new_extractor);
119 | }
120 | 
// Passing an unknown encoding value must not crash.
// The `_mut` setter declared in extractous.h returns void, so a rejected
// value cannot be observed through a return value here.
// NOTE(review): presumably the library records the failure in its
// thread-local error state - confirm, then assert on it.
TEST(extractor_set_invalid_encoding) {
    struct CExtractor *extractor = extractous_extractor_new();
    ASSERT_NOT_NULL(extractor, "extractor");

    extractous_extractor_set_encoding_mut(extractor, 999); // Invalid encoding

    // The setter cannot hand back a replacement handle, so the caller still
    // holds the only one and releases it.
    extractous_extractor_free(extractor);
}
132 | 
133 | TEST(extractor_set_xml_output) { // Enabling XML output must yield a non-NULL extractor.
134 |     struct CExtractor *extractor = extractous_extractor_new();
135 |     ASSERT_NOT_NULL(extractor, "extractor");
136 |     // Turn XML output on; the result is held in a separate handle.
137 |     struct CExtractor *new_extractor = extractous_extractor_set_xml_output(
138 |         extractor, true
139 |     );
140 |     ASSERT_NOT_NULL(new_extractor, "new_extractor with XML enabled");
141 |     // Only the returned handle is freed; presumably the setter consumed `extractor` -- TODO confirm.
142 |     extractous_extractor_free(new_extractor);
143 | }
144 | 
145 | TEST(extractor_chained_configuration) { // Chains three setters, each fed the previous result.
146 |     struct CExtractor *e1 = extractous_extractor_new();
147 |     ASSERT_NOT_NULL(e1, "e1");
148 |     // Step 1: set the string max length to 5000.
149 |     struct CExtractor *e2 = extractous_extractor_set_extract_string_max_length(e1, 5000);
150 |     ASSERT_NOT_NULL(e2, "e2");
151 |     // Step 2: set the encoding to CHARSET_UTF_8.
152 |     struct CExtractor *e3 = extractous_extractor_set_encoding(e2, CHARSET_UTF_8);
153 |     ASSERT_NOT_NULL(e3, "e3");
154 |     // Step 3: disable XML output.
155 |     struct CExtractor *e4 = extractous_extractor_set_xml_output(e3, false);
156 |     ASSERT_NOT_NULL(e4, "e4");
157 |     // Only the final handle is freed; e1..e3 are presumably consumed by the setters -- TODO confirm.
158 |     extractous_extractor_free(e4);
159 | }
160 | 
161 | // ============================================================================
162 | // Test: PDF Configuration
163 | // ============================================================================
164 | 
165 | TEST(pdf_config_new) { // A new PDF parser config must be non-NULL; it is then freed.
166 |     struct CPdfParserConfig *config = extractous_pdf_config_new();
167 |     ASSERT_NOT_NULL(config, "pdf_config");
168 |     extractous_pdf_config_free(config);
169 | }
170 | 
171 | TEST(pdf_config_set_ocr_strategy) { // Setting PDF_OCR_AUTO must yield a non-NULL config.
172 |     struct CPdfParserConfig *c1 = extractous_pdf_config_new();
173 |     ASSERT_NOT_NULL(c1, "c1");
174 |     // Apply the OCR strategy; the result is held in a separate handle.
175 |     struct CPdfParserConfig *c2 = extractous_pdf_config_set_ocr_strategy(
176 |         c1, PDF_OCR_AUTO
177 |     );
178 |     ASSERT_NOT_NULL(c2, "c2");
179 |     // Only c2 is freed; presumably the setter consumed c1 -- TODO confirm.
180 |     extractous_pdf_config_free(c2);
181 | }
182 | 
183 | TEST(pdf_config_set_extract_inline_images) { // Enabling inline-image extraction must yield a non-NULL config.
184 |     struct CPdfParserConfig *c1 = extractous_pdf_config_new();
185 |     ASSERT_NOT_NULL(c1, "c1");
186 |     // Turn the option on; the result is held in a separate handle.
187 |     struct CPdfParserConfig *c2 = extractous_pdf_config_set_extract_inline_images(c1, true);
188 |     ASSERT_NOT_NULL(c2, "c2");
189 |     // Only c2 is freed; presumably the setter consumed c1 -- TODO confirm.
190 |     extractous_pdf_config_free(c2);
191 | }
192 | 
193 | TEST(extractor_set_pdf_config) { // Attaching a PDF config to an extractor must yield a non-NULL extractor.
194 |     struct CExtractor *extractor = extractous_extractor_new();
195 |     ASSERT_NOT_NULL(extractor, "extractor");
196 |     // Create a default PDF config to attach.
197 |     struct CPdfParserConfig *pdf_config = extractous_pdf_config_new();
198 |     ASSERT_NOT_NULL(pdf_config, "pdf_config");
199 |     // Attach the config; the result is held in a separate extractor handle.
200 |     struct CExtractor *new_extractor = extractous_extractor_set_pdf_config(
201 |         extractor, pdf_config
202 |     );
203 |     ASSERT_NOT_NULL(new_extractor, "new_extractor");
204 |     // The config is still freed by the test (presumably borrowed, not consumed -- TODO confirm).
205 |     extractous_pdf_config_free(pdf_config);
206 |     extractous_extractor_free(new_extractor);
207 | }
208 | 
209 | // ============================================================================
210 | // Test: Office Configuration
211 | // ============================================================================
212 | 
213 | TEST(office_config_new) { // A new Office parser config must be non-NULL; it is then freed.
214 |     struct COfficeParserConfig *config = extractous_office_config_new();
215 |     ASSERT_NOT_NULL(config, "office_config");
216 |     extractous_office_config_free(config);
217 | }
218 | 
219 | TEST(office_config_set_extract_macros) { // Enabling macro extraction must yield a non-NULL config.
220 |     struct COfficeParserConfig *c1 = extractous_office_config_new();
221 |     ASSERT_NOT_NULL(c1, "c1");
222 |     // Turn the option on; the result is held in a separate handle.
223 |     struct COfficeParserConfig *c2 = extractous_office_config_set_extract_macros(c1, true);
224 |     ASSERT_NOT_NULL(c2, "c2");
225 |     // Only c2 is freed; presumably the setter consumed c1 -- TODO confirm.
226 |     extractous_office_config_free(c2);
227 | }
228 | 
229 | // ============================================================================
230 | // Test: OCR Configuration
231 | // ============================================================================
232 | 
233 | TEST(ocr_config_new) { // A new Tesseract OCR config must be non-NULL; it is then freed.
234 |     struct CTesseractOcrConfig *config = extractous_ocr_config_new();
235 |     ASSERT_NOT_NULL(config, "ocr_config");
236 |     extractous_ocr_config_free(config);
237 | }
238 | 
239 | TEST(ocr_config_set_language) { // Setting the OCR language to "eng" must yield a non-NULL config.
240 |     struct CTesseractOcrConfig *c1 = extractous_ocr_config_new();
241 |     ASSERT_NOT_NULL(c1, "c1");
242 |     // Pass the language as a string literal; the result is held in a separate handle.
243 |     struct CTesseractOcrConfig *c2 = extractous_ocr_config_set_language(c1, "eng");
244 |     ASSERT_NOT_NULL(c2, "c2");
245 |     // Only c2 is freed; presumably the setter consumed c1 -- TODO confirm.
246 |     extractous_ocr_config_free(c2);
247 | }
248 | 
249 | // ============================================================================
250 | // Test: Error Handling
251 | // ============================================================================
252 | 
253 | TEST(error_message) { // Each tested error code must map to a message string, which is then freed.
254 |     char *msg = extractous_error_message(ERR_OK);
255 |     ASSERT_NOT_NULL(msg, "error message for ERR_OK");
256 |     ASSERT_TRUE(strlen(msg) > 0, "error message not empty"); // only ERR_OK is also checked for non-empty text
257 |     extractous_string_free(msg);
258 |     // ERR_NULL_POINTER: only non-NULL is checked.
259 |     msg = extractous_error_message(ERR_NULL_POINTER);
260 |     ASSERT_NOT_NULL(msg, "error message for ERR_NULL_POINTER");
261 |     extractous_string_free(msg);
262 |     // ERR_EXTRACTION_FAILED: only non-NULL is checked.
263 |     msg = extractous_error_message(ERR_EXTRACTION_FAILED);
264 |     ASSERT_NOT_NULL(msg, "error message for ERR_EXTRACTION_FAILED");
265 |     extractous_string_free(msg);
266 | }
267 | 
268 | TEST(extract_with_null_extractor) { // A NULL extractor must be reported as ERR_NULL_POINTER.
269 |     char *content = NULL;
270 |     struct CMetadata *metadata = NULL;
271 |     // Path and both output slots are non-NULL, so only the extractor argument is NULL.
272 |     int result = extractous_extractor_extract_file_to_string(
273 |         NULL, "test.txt", &content, &metadata
274 |     );
275 |     // content/metadata are not freed; presumably they stay NULL on this error -- TODO confirm.
276 |     ASSERT_EQ(ERR_NULL_POINTER, result, "error code");
277 | }
278 | 
279 | TEST(extract_with_null_path) { // A NULL file path must be reported as ERR_NULL_POINTER.
280 |     struct CExtractor *extractor = extractous_extractor_new();
281 |     ASSERT_NOT_NULL(extractor, "extractor");
282 |     // Output slots start out NULL and are passed by address.
283 |     char *content = NULL;
284 |     struct CMetadata *metadata = NULL;
285 |     // Extractor and output slots are non-NULL, so only the path argument is NULL.
286 |     int result = extractous_extractor_extract_file_to_string(
287 |         extractor, NULL, &content, &metadata
288 |     );
289 |     // The extractor is still freed afterwards; content/metadata are not (presumably left NULL -- TODO confirm).
290 |     ASSERT_EQ(ERR_NULL_POINTER, result, "error code");
291 |     extractous_extractor_free(extractor);
292 | }
293 | 
294 | TEST(extract_with_null_output) { // NULL output pointers must be reported as ERR_NULL_POINTER.
295 |     struct CExtractor *extractor = extractous_extractor_new();
296 |     ASSERT_NOT_NULL(extractor, "extractor");
297 |     // Both output arguments are NULL at once; the two are not tested individually.
298 |     int result = extractous_extractor_extract_file_to_string(
299 |         extractor, "test.txt", NULL, NULL
300 |     );
301 |     // The extractor is freed after the check.
302 |     ASSERT_EQ(ERR_NULL_POINTER, result, "error code");
303 |     extractous_extractor_free(extractor);
304 | }
305 | 
306 | // ============================================================================
307 | // Test: String Memory Management
308 | // ============================================================================
309 | 
310 | TEST(string_free_null) { // Calls extractous_string_free with NULL.
311 |     // Freeing NULL should not crash; nothing else is asserted.
312 |     extractous_string_free(NULL);
313 | }
314 | 
315 | // ============================================================================
316 | // Test: Metadata Functions
317 | // ============================================================================
318 | 
319 | TEST(metadata_free_null) { // Calls extractous_metadata_free with NULL.
320 |     // Freeing NULL should not crash; nothing else is asserted.
321 |     extractous_metadata_free(NULL);
322 | }
323 | 
324 | // ============================================================================
325 | // Test: URL Extraction Functions (argument validation only)
326 | // ============================================================================
327 | 
328 | TEST(url_extraction_null_checks) { // URL extraction must report ERR_NULL_POINTER for NULL arguments.
329 |     struct CExtractor *extractor = extractous_extractor_new();
330 |     ASSERT_NOT_NULL(extractor, "extractor");
331 |     // Output slots start out NULL and are passed by address in the first call.
332 |     char *content = NULL;
333 |     struct CMetadata *metadata = NULL;
334 |     
335 |     // Case 1: NULL URL with non-NULL output slots.
336 |     int result = extractous_extractor_extract_url_to_string(
337 |         extractor, NULL, &content, &metadata
338 |     );
339 |     ASSERT_EQ(ERR_NULL_POINTER, result, "null URL error code");
340 |     
341 |     // Case 2: non-NULL URL with both output pointers NULL.
342 |     result = extractous_extractor_extract_url_to_string(
343 |         extractor, "http://example.com", NULL, NULL
344 |     );
345 |     ASSERT_EQ(ERR_NULL_POINTER, result, "null outputs error code");
346 |     // The same extractor is reused for both calls and freed once at the end.
347 |     extractous_extractor_free(extractor);
348 | }
349 | 
350 | // ============================================================================
351 | // Test Runner
352 | // ============================================================================
353 | 
354 | void run_all_tests(void) { // Runs every test group in a fixed order, then prints the totals.
355 |     printf("\n");
356 |     printf("========================================\n");
357 |     printf("  FFI Layer Tests for Extractous\n");
358 |     printf("========================================\n\n");
359 |     
360 |     // Extractor lifecycle tests
361 |     printf(COLOR_YELLOW "--- Extractor Lifecycle ---\n" COLOR_RESET);
362 |     run_test_extractor_new();
363 |     run_test_extractor_free_null();
364 |     run_test_extractor_double_free();
365 |     
366 |     // Extractor configuration setter tests
367 |     printf(COLOR_YELLOW "\n--- Configuration Functions ---\n" COLOR_RESET);
368 |     run_test_extractor_set_max_length();
369 |     run_test_extractor_set_encoding();
370 |     run_test_extractor_set_invalid_encoding();
371 |     run_test_extractor_set_xml_output();
372 |     run_test_extractor_chained_configuration();
373 |     
374 |     // PDF config tests
375 |     printf(COLOR_YELLOW "\n--- PDF Configuration ---\n" COLOR_RESET);
376 |     run_test_pdf_config_new();
377 |     run_test_pdf_config_set_ocr_strategy();
378 |     run_test_pdf_config_set_extract_inline_images();
379 |     run_test_extractor_set_pdf_config();
380 |     
381 |     // Office config tests
382 |     printf(COLOR_YELLOW "\n--- Office Configuration ---\n" COLOR_RESET);
383 |     run_test_office_config_new();
384 |     run_test_office_config_set_extract_macros();
385 |     
386 |     // OCR config tests
387 |     printf(COLOR_YELLOW "\n--- OCR Configuration ---\n" COLOR_RESET);
388 |     run_test_ocr_config_new();
389 |     run_test_ocr_config_set_language();
390 |     
391 |     // Error handling tests
392 |     printf(COLOR_YELLOW "\n--- Error Handling ---\n" COLOR_RESET);
393 |     run_test_error_message();
394 |     run_test_extract_with_null_extractor();
395 |     run_test_extract_with_null_path();
396 |     run_test_extract_with_null_output();
397 |     
398 |     // Memory management tests (freeing NULL)
399 |     printf(COLOR_YELLOW "\n--- Memory Management ---\n" COLOR_RESET);
400 |     run_test_string_free_null();
401 |     run_test_metadata_free_null();
402 |     
403 |     // URL extraction tests
404 |     printf(COLOR_YELLOW "\n--- URL Extraction ---\n" COLOR_RESET);
405 |     run_test_url_extraction_null_checks();
406 |     
407 |     // Summary: print the tests_run / tests_passed / tests_failed counters
408 |     printf("\n");
409 |     printf("========================================\n");
410 |     printf("  Test Summary\n");
411 |     printf("========================================\n");
412 |     printf("Total:  %d\n", tests_run);
413 |     printf(COLOR_GREEN "Passed: %d\n" COLOR_RESET, tests_passed);
414 |     // The failure count is only colored red when it is non-zero.
415 |     if (tests_failed > 0) {
416 |         printf(COLOR_RED "Failed: %d\n" COLOR_RESET, tests_failed);
417 |     } else {
418 |         printf("Failed: 0\n");
419 |     }
420 |     
421 |     printf("========================================\n\n");
422 | }
423 | 
424 | int main(void) { // Entry point: runs the whole suite.
425 |     run_all_tests();
426 |     return tests_failed > 0 ? 1 : 0; // exit status 1 if any test failed, 0 otherwise
427 | }
428 | 


--------------------------------------------------------------------------------