├── .envrc ├── .gitignore ├── static └── shrimp.jpg ├── tailwind.config.js ├── flake.lock ├── flake.nix ├── Cargo.toml ├── LICENSE ├── src ├── display.rs ├── state.rs ├── config.rs ├── node.rs ├── extract.rs ├── cli.rs ├── browser.rs ├── graph.rs ├── format.rs └── main.rs ├── README.md └── templates └── index.html /.envrc: -------------------------------------------------------------------------------- 1 | use flake 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | debug/ 2 | target/ 3 | 4 | Cargo.lock 5 | 6 | **/*.rs.bk 7 | 8 | *.pdb 9 | -------------------------------------------------------------------------------- /static/shrimp.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/noahfraiture/coma/HEAD/static/shrimp.jpg -------------------------------------------------------------------------------- /tailwind.config.js: -------------------------------------------------------------------------------- 1 | /** @type {import('tailwindcss').Config} */ 2 | module.exports = { 3 | darkMode: "media", // Enables dark mode based on the user's system preference 4 | theme: { 5 | extend: {}, 6 | }, 7 | variants: { 8 | extend: {}, 9 | }, 10 | plugins: [], 11 | }; 12 | -------------------------------------------------------------------------------- /flake.lock: -------------------------------------------------------------------------------- 1 | { 2 | "nodes": { 3 | "nixpkgs": { 4 | "locked": { 5 | "lastModified": 1738142207, 6 | "narHash": "sha256-NGqpVVxNAHwIicXpgaVqJEJWeyqzoQJ9oc8lnK9+WC4=", 7 | "owner": "nixos", 8 | "repo": "nixpkgs", 9 | "rev": "9d3ae807ebd2981d593cddd0080856873139aa40", 10 | "type": "github" 11 | }, 12 | "original": { 13 | "owner": "nixos", 14 | "ref": "nixos-unstable", 15 | "repo": "nixpkgs", 16 | "type": "github" 17 | } 18 | }, 19 | "root": { 20 | 
"inputs": { 21 | "nixpkgs": "nixpkgs" 22 | } 23 | } 24 | }, 25 | "root": "root", 26 | "version": 7 27 | } 28 | -------------------------------------------------------------------------------- /flake.nix: -------------------------------------------------------------------------------- 1 | { 2 | inputs = { 3 | nixpkgs.url = "github:nixos/nixpkgs?ref=nixos-unstable"; 4 | }; 5 | 6 | outputs = 7 | { self, nixpkgs }: 8 | let 9 | pkgs = import nixpkgs { 10 | system = "x86_64-linux"; 11 | config.allowUnfree = true; 12 | }; 13 | in 14 | { 15 | devShells."x86_64-linux".default = pkgs.mkShell { 16 | 17 | buildInputs = with pkgs; [ 18 | pkg-config 19 | ]; 20 | 21 | packages = with pkgs; [ 22 | cargo 23 | rustc 24 | openssl 25 | ]; 26 | 27 | shellHook = '' 28 | export DIRENV='rust' 29 | ''; 30 | }; 31 | }; 32 | } 33 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "coma" 3 | version = "0.2.3" 4 | edition = "2021" 5 | authors = ["Noah "] 6 | license = "MIT" 7 | description = "Coma is a lightweight command-line tool designed for crawling websites" 8 | repository = "https://github.com/noahfraiture/coma" 9 | readme = "README.md" 10 | 11 | keywords = ["scraping", "crawler", "web-discovery"] 12 | categories = ["web-programming"] 13 | 14 | [dependencies] 15 | anyhow = "1.0.86" 16 | askama = { version = "0.12.1", features = ["serde-json"] } 17 | chrono = "0.4.38" 18 | clap = { version = "4.5.11", features = ["derive"] } 19 | colored = "2.1.0" 20 | futures = "0.3.30" 21 | headless_chrome = { version = "1.0.12", features = ["fetch"] } 22 | markup5ever = "0.12.1" 23 | reqwest = { version = "0.12.5", features = ["blocking"] } 24 | scraper = "0.19.1" 25 | serde = { version = "1.0.204", features = ["derive", "rc"] } 26 | serde_json = "1.0.127" 27 | tempfile = "3.11.0" 28 | tokio = { version = "1.39.2", features = ["full"] } 29 | url = 
"2.5.2" 30 | webbrowser = "1.0.1" 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Noah Fraiture 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /src/display.rs: -------------------------------------------------------------------------------- 1 | use crate::cli::{Display, Format}; 2 | use crate::graph; 3 | use crate::node::Node; 4 | 5 | impl Node { 6 | pub fn display(node: &mut Node, cmd: &Display) -> std::result::Result<(), CommandError> { 7 | match cmd { 8 | Display::Print { format: _ } => { 9 | println!("{}", node.output.as_ref().unwrap()) 10 | } 11 | Display::Save { format, name } => { 12 | let output = node.output.as_ref().unwrap(); 13 | let extension = match format { 14 | Format::Json => "json", 15 | Format::Raw => "txt", 16 | }; 17 | 18 | let file_name = format!("{name}.{extension}"); 19 | let path = Path::new(&file_name); 20 | 21 | let mut file = File::create(path)?; 22 | file.write_all(output.as_bytes())?; 23 | } 24 | Display::Graph => graph::render(node)?, 25 | } 26 | Ok(()) 27 | } 28 | } 29 | 30 | use std::error; 31 | use std::fmt; 32 | use std::fs::File; 33 | use std::io::Write; 34 | use std::path::Path; 35 | 36 | #[derive(Debug)] 37 | pub enum CommandError { 38 | Graph, 39 | IO(std::io::Error), 40 | } 41 | 42 | impl fmt::Display for CommandError { 43 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 44 | write!(f, "error in command data") 45 | } 46 | } 47 | 48 | impl From for CommandError { 49 | fn from(_: graph::GraphError) -> Self { 50 | CommandError::Graph 51 | } 52 | } 53 | 54 | impl From for CommandError { 55 | fn from(value: std::io::Error) -> Self { 56 | CommandError::IO(value) 57 | } 58 | } 59 | 60 | impl error::Error for CommandError {} 61 | -------------------------------------------------------------------------------- /src/state.rs: -------------------------------------------------------------------------------- 1 | use crate::node::Node; 2 | use std::{ 3 | collections::LinkedList, 4 | sync::{Arc, Mutex}, 5 | }; 6 | 7 | pub struct State { 8 | pub current_depth: i32, 9 | pub current_external: 
i32, 10 | visited: Vec>>, 11 | layers: LinkedList>>>, 12 | pub current_layer: Vec>>, 13 | } 14 | 15 | impl State { 16 | pub fn new(root: Arc>) -> Result> { 17 | // Queue of vector of the discovered link at the current depth 18 | // Each node of the linkedlist is a depth 19 | let mut layers: LinkedList>>> = LinkedList::new(); 20 | layers.push_back(vec![root]); 21 | 22 | // This will never be used and could be None 23 | let current_layer = Vec::new(); 24 | 25 | Ok(State { 26 | current_depth: 0, 27 | current_external: 0, 28 | visited: Vec::new(), 29 | layers, 30 | current_layer, 31 | }) 32 | } 33 | 34 | pub fn pop_layer(&mut self) -> Option<()> { 35 | self.current_layer = self.layers.pop_front()?; 36 | Some(()) 37 | } 38 | 39 | pub fn add_to_next_layer(&mut self, links: Vec>>) { 40 | if self.layers.front().is_none() { 41 | self.layers.push_back(Vec::new()); 42 | } 43 | self.layers.front_mut().unwrap().extend(links); 44 | } 45 | 46 | pub fn known(&mut self, node: &Arc>) -> bool { 47 | for visited_node in self.visited.iter() { 48 | if visited_node.lock().unwrap().eq(&node.lock().unwrap()) { 49 | return true; 50 | } 51 | } 52 | self.visited.push(Arc::clone(node)); 53 | false 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /src/config.rs: -------------------------------------------------------------------------------- 1 | use std::fmt; 2 | use std::sync::{Arc, Mutex}; 3 | 4 | use crate::cli; 5 | use crate::node::Node; 6 | use colored::Colorize; 7 | use url::Url; 8 | 9 | pub struct Config { 10 | pub domain: String, 11 | pub root: Arc>, 12 | pub args: cli::Cli, 13 | } 14 | 15 | impl Config { 16 | pub fn new() -> Result> { 17 | let args = cli::args()?; 18 | 19 | // NOTE: browser must still exist or the connection is closed. 
Pretty weird to not have 20 | let origin_url = Url::parse(&args.url).map_err(|e| ConfigError::Message(e.to_string()))?; 21 | origin_url 22 | .domain() 23 | .ok_or("Url doesn't have a domain") 24 | .map_err(|e| ConfigError::Message(e.to_string()))?; 25 | let domain = origin_url.domain().unwrap().to_owned(); 26 | let id = origin_url.clone().to_string(); 27 | 28 | Ok(Config { 29 | domain, 30 | root: Node::new_arc(None, origin_url, id), 31 | args, 32 | }) 33 | } 34 | 35 | pub fn same_domain(&self, url: &Url) -> bool { 36 | url.domain().unwrap_or("") == self.domain 37 | } 38 | 39 | pub fn in_bound(&self, url: &Url) -> bool { 40 | url.as_str().contains(&self.args.bound) 41 | } 42 | } 43 | 44 | pub enum ConfigError { 45 | Message(String), 46 | } 47 | 48 | impl ConfigError { 49 | fn print(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 50 | match self { 51 | ConfigError::Message(s) => { 52 | write!(f, "{}: {}", "Config error".red(), s) 53 | } 54 | } 55 | } 56 | } 57 | 58 | impl fmt::Display for ConfigError { 59 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 60 | self.print(f) 61 | } 62 | } 63 | 64 | impl fmt::Debug for ConfigError { 65 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 66 | self.print(f) 67 | } 68 | } 69 | 70 | impl std::error::Error for ConfigError {} 71 | -------------------------------------------------------------------------------- /src/node.rs: -------------------------------------------------------------------------------- 1 | use std::sync::{Arc, Mutex, Weak}; 2 | use url::Url; 3 | 4 | pub struct Node { 5 | pub id: String, 6 | pub url: Url, 7 | // TODO : should I move while Node behind of mutex instead of most field ? 
8 | // TODO : I could use a color to show difference between explored and unexplored node 9 | pub explored: bool, // flag used to know if it will be rendered 10 | // Every node will own every images on the page 11 | // More logic that every node own a copy of the url to the image 12 | 13 | // Mutex is need to borrow mutability of Arc 14 | pub images: Option>, 15 | // Can't directly use scraper::node::{Comment, Text} since their aren't Send/Sync 16 | // Could try later to impl these trait 17 | pub comments: Option>, 18 | pub texts: Option>, 19 | pub inputs: Option>, 20 | pub links: Option>, 21 | pub children: Vec>>, 22 | pub parents: Vec>>, 23 | 24 | // Output already formatted as wanted 25 | pub output: Option, 26 | } 27 | 28 | impl std::hash::Hash for Node { 29 | fn hash(&self, state: &mut H) { 30 | self.id.hash(state); 31 | self.url.hash(state); 32 | } 33 | } 34 | 35 | impl PartialEq for Node { 36 | fn eq(&self, other: &Self) -> bool { 37 | other.id == self.id && other.url == self.url 38 | } 39 | } 40 | 41 | impl Eq for Node {} 42 | 43 | impl Node { 44 | pub fn new_arc(parent: Option<&Arc>>, url: Url, id: String) -> Arc> { 45 | let node = Arc::new(Mutex::new(Node { 46 | id, 47 | url, 48 | explored: false, 49 | images: None, 50 | comments: None, 51 | texts: None, 52 | inputs: None, 53 | links: None, 54 | children: vec![], 55 | parents: parent.map_or_else(Vec::new, |p| vec![Arc::downgrade(p)]), 56 | output: None, 57 | })); 58 | if let Some(parent) = parent { 59 | parent.lock().unwrap().add_child(&node); 60 | }; 61 | node 62 | } 63 | 64 | pub fn explore( 65 | node: &Arc>, 66 | func: Visitor, 67 | ) -> Result<(), Box> { 68 | func(&mut node.lock().unwrap())?; 69 | for child in &node.lock().unwrap().children { 70 | if !child.lock().unwrap().explored { 71 | continue; 72 | } 73 | Node::explore(child, func)?; 74 | } 75 | Ok(()) 76 | } 77 | 78 | pub fn add_child(&mut self, child: &Arc>) { 79 | self.children.push(Arc::clone(child)) 80 | } 81 | 82 | pub fn 
quantity_elements(&self) -> usize { 83 | self.images.as_ref().unwrap_or(&vec![]).len() 84 | + self.comments.as_ref().unwrap_or(&vec![]).len() 85 | + self.texts.as_ref().unwrap_or(&vec![]).len() 86 | } 87 | } 88 | 89 | type Visitor<'a> = &'a mut dyn FnMut(&mut Node) -> Result<(), Box>; 90 | -------------------------------------------------------------------------------- /src/extract.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | collections::HashSet, 3 | sync::{Arc, Mutex}, 4 | }; 5 | 6 | use markup5ever::local_name; 7 | use scraper::{node::Element, Html}; 8 | use url::Url; 9 | 10 | use crate::node; 11 | 12 | // TODO : add format 13 | pub fn extract_links(url: &Url, page: &Html) -> HashSet { 14 | HashSet::from_iter(page.tree.values().filter_map(|v| match v { 15 | scraper::Node::Element(element) => { 16 | let element = element.to_owned(); 17 | 18 | // Ensure this is a link 19 | if !matches!(element.name.local, local_name!("a")) { 20 | return None; 21 | } 22 | 23 | // We want the attribute "href" 24 | for (key, value) in &element.attrs { 25 | if matches!(key.local, local_name!("href")) { 26 | return Url::join(url, value).ok(); 27 | } 28 | } 29 | None 30 | } 31 | _ => None, 32 | })) 33 | } 34 | 35 | pub fn extract_comments(node: &Arc>, page: &Html) { 36 | node.lock().unwrap().comments = Some( 37 | page.tree 38 | .values() 39 | .filter_map(|v| match v { 40 | scraper::Node::Comment(comment) => { 41 | Some(comment.to_string()).filter(|v| !v.is_empty()) 42 | } 43 | _ => None, 44 | }) 45 | .collect(), 46 | ); 47 | } 48 | 49 | pub fn extract_texts(node: &Arc>, page: &Html) { 50 | node.lock().unwrap().texts = Some( 51 | page.tree 52 | .values() 53 | .filter_map(|v| match v { 54 | scraper::Node::Text(text) => Some(text.to_string()), 55 | _ => None, 56 | }) 57 | .collect(), 58 | ); 59 | } 60 | 61 | pub fn extract_element(page: &Html, filter: F) -> Vec 62 | where 63 | F: Fn(Element) -> Option, 64 | { 65 | page.tree 66 | 
.values() 67 | .filter_map(|v| match v { 68 | scraper::Node::Element(element) => { 69 | let element = element.to_owned(); 70 | filter(element) 71 | } 72 | _ => None, 73 | }) 74 | .collect() 75 | } 76 | 77 | pub fn extract_images(node: &Arc>, page: &Html) { 78 | node.lock().unwrap().images = Some(extract_element(page, |element: Element| { 79 | if matches!(element.name.local, local_name!("img")) { 80 | for (key, value) in &element.attrs { 81 | if matches!(key.local, local_name!("src")) { 82 | // If the url is absolute, the value will replace the base url 83 | return Url::join(&node.lock().unwrap().url, value).ok(); 84 | } 85 | } 86 | } 87 | None 88 | })); 89 | } 90 | 91 | // TODO : better rendering 92 | pub fn extract_input(node: &Arc>, page: &Html) { 93 | node.lock().unwrap().inputs = Some(extract_element(page, |element: Element| { 94 | if matches!(element.name.local, local_name!("input")) { 95 | Some(format!("{:?}", element)) 96 | } else { 97 | None 98 | } 99 | })); 100 | } 101 | -------------------------------------------------------------------------------- /src/cli.rs: -------------------------------------------------------------------------------- 1 | use core::fmt; 2 | 3 | use clap::{Parser, Subcommand}; 4 | use colored::Colorize; 5 | use url::Url; 6 | 7 | /// Website scraper 8 | #[derive(Parser, Debug)] 9 | #[command(name = "Coma")] 10 | #[command(author = "Noah")] 11 | #[command(version)] 12 | #[command(help_template = " 13 | {name} - {about} 14 | 15 | Author: {author} 16 | Version: {version} 17 | 18 | {usage-heading} {usage} 19 | {all-args} {tab}")] 20 | pub struct Cli { 21 | /// Action to perform with the data 22 | #[command(subcommand)] 23 | pub cmd: Display, 24 | 25 | /// Content to scrap 26 | #[arg(short, long, value_delimiter = ',', default_value = "all")] 27 | pub content: Vec, 28 | 29 | /// Url to start the search 30 | #[arg(short, long)] 31 | pub url: String, 32 | 33 | /// Depth to search from the given url, 0 for only the current url, < 0 for 
infinite depth 34 | #[arg(short, long, default_value_t = 0, allow_negative_numbers = true)] 35 | pub depth: i32, 36 | 37 | /// Upper bound in the url, any url that doesn't contains this string will be ignored 38 | // TODO : change default to Option 39 | #[arg(short, long, default_value = "")] 40 | pub bound: String, 41 | 42 | /// Max number of concurrent thread 43 | #[arg(short, long, default_value_t = 5)] 44 | pub thread: u32, 45 | 46 | // Depth to external website with different domain. Depth have priority to stop the search 47 | #[arg(short, long, default_value_t = 0)] 48 | pub external: i32, 49 | } 50 | 51 | #[derive(Subcommand, Debug, Clone, PartialEq, Eq, Hash)] 52 | pub enum Display { 53 | /// Print the extracted content in the terminal 54 | Print { 55 | /// Format of the output 56 | #[arg()] 57 | format: Format, 58 | }, 59 | 60 | /// Save the extracted content in files 61 | Save { 62 | /// Format of the output 63 | #[arg()] 64 | format: Format, 65 | 66 | /// Name of the output file 67 | #[arg(short, long, default_value = "output")] 68 | name: String, 69 | }, 70 | 71 | /// Create a html topolgy 72 | Graph, 73 | } 74 | 75 | #[derive(clap::ValueEnum, Debug, Clone, Copy, PartialEq, serde::Serialize)] 76 | pub enum Content { 77 | /// Extract the text in the html 78 | Texts, 79 | 80 | /// Extract the comments in the html 81 | Comments, 82 | 83 | /// Extract the links found on the page 84 | Links, 85 | 86 | /// Extract the images of the page 87 | Images, 88 | 89 | /// Extract informations about any form 90 | Inputs, 91 | 92 | /// Extract all information and generate a topology 93 | All, 94 | } 95 | 96 | #[derive(clap::ValueEnum, Debug, Clone, PartialEq, Eq, Hash)] 97 | pub enum Format { 98 | /// Create a json file with the data 99 | Json, 100 | 101 | /// Raw data. 
Link are not raw href but joined with domain 102 | Raw, 103 | } 104 | 105 | pub enum ArgsError { 106 | InvalidUrl(String), 107 | } 108 | 109 | impl ArgsError { 110 | fn print(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 111 | match self { 112 | ArgsError::InvalidUrl(url) => write!(f, "{}: {}", "Invalid URL".red(), url), 113 | } 114 | } 115 | } 116 | 117 | impl fmt::Display for ArgsError { 118 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 119 | self.print(f) 120 | } 121 | } 122 | 123 | impl fmt::Debug for ArgsError { 124 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 125 | self.print(f) 126 | } 127 | } 128 | 129 | impl std::error::Error for ArgsError {} 130 | 131 | pub fn args() -> Result { 132 | let args = Cli::parse(); 133 | 134 | match Url::parse(&args.url) { 135 | Ok(v) => { 136 | v.domain().ok_or(ArgsError::InvalidUrl(v.to_string()))?; 137 | Ok(args) 138 | } 139 | Err(e) => Err(ArgsError::InvalidUrl(e.to_string())), 140 | } 141 | } 142 | -------------------------------------------------------------------------------- /src/browser.rs: -------------------------------------------------------------------------------- 1 | use colored::Colorize; 2 | use std::fmt; 3 | use std::sync::Arc; 4 | 5 | use anyhow::Result; 6 | use headless_chrome::LaunchOptions; 7 | use std::{collections::HashSet, sync::Mutex}; 8 | 9 | use scraper::Html; 10 | use url::Url; 11 | 12 | use crate::cli::Content; 13 | use crate::extract; 14 | use crate::node; 15 | 16 | pub struct Browser { 17 | #[allow(dead_code)] // need to keep the browser alive 18 | browser: headless_chrome::Browser, 19 | pub tab: Arc, 20 | } 21 | 22 | impl Browser { 23 | // These functions are used in async context 24 | // The separation of function is needed to send connection in async 25 | // task, but the Html can't be sent accros async task 26 | pub fn new_navigate(url: &Url) -> Result { 27 | let browser = headless_chrome::Browser::new( 28 | LaunchOptions::default_builder() 29 | 
.devtools(false) 30 | .build() 31 | .map_err(|e| BrowseError::Browser(e.to_string()))?, 32 | )?; 33 | let tab = browser.new_tab()?; 34 | tab.navigate_to(url.as_str())?; 35 | tab.wait_until_navigated()?; 36 | Ok(Self { browser, tab }) 37 | } 38 | 39 | // TODO : replace handle_... by the command and format 40 | // Add format 41 | // Extract useful information 42 | pub async fn parse_document( 43 | self, 44 | contents: &Vec, 45 | node: &Arc>, 46 | ) -> HashSet { 47 | let response = self.tab.get_content().unwrap(); 48 | let document = Html::parse_document(&response); 49 | let links = extract::extract_links(&node.lock().unwrap().url, &document); 50 | 51 | for content in contents { 52 | match content { 53 | Content::Texts => { 54 | extract::extract_texts(node, &document); 55 | } 56 | Content::Comments => { 57 | extract::extract_comments(node, &document); 58 | } 59 | Content::Links => { 60 | // NOTE : here we must clone even if normally we shouldn't have 61 | // twice the same content 62 | node.lock().unwrap().links = Some(links.clone().into_iter().collect()); 63 | } 64 | Content::Images => { 65 | extract::extract_images(node, &document); 66 | } 67 | Content::Inputs => { 68 | extract::extract_input(node, &document); 69 | } 70 | Content::All => { 71 | extract::extract_texts(node, &document); 72 | extract::extract_comments(node, &document); 73 | node.lock().unwrap().links = Some(links.clone().into_iter().collect()); 74 | extract::extract_images(node, &document); 75 | extract::extract_input(node, &document); 76 | } 77 | }; 78 | } 79 | links 80 | } 81 | } 82 | 83 | // NOTE: this is ok only because the browser is the only one using anyhow 84 | pub enum BrowseError { 85 | Browser(String), 86 | } 87 | 88 | impl BrowseError { 89 | fn print(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 90 | match self { 91 | BrowseError::Browser(e) => write!(f, "{}: {}", "Browser error".red(), e), 92 | } 93 | } 94 | } 95 | 96 | impl fmt::Display for BrowseError { 97 | fn fmt(&self, f: &mut 
fmt::Formatter<'_>) -> fmt::Result { 98 | self.print(f) 99 | } 100 | } 101 | 102 | impl fmt::Debug for BrowseError { 103 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 104 | self.print(f) 105 | } 106 | } 107 | 108 | impl std::error::Error for BrowseError {} 109 | 110 | impl From for BrowseError { 111 | fn from(value: anyhow::Error) -> Self { 112 | BrowseError::Browser(value.to_string()) 113 | } 114 | } 115 | -------------------------------------------------------------------------------- /src/graph.rs: -------------------------------------------------------------------------------- 1 | use askama::Template; 2 | use colored::Colorize; 3 | use serde::Serialize; 4 | use std::{collections::HashSet, fmt, fs}; 5 | 6 | use crate::node::Node; 7 | 8 | #[derive(Template)] 9 | #[template(path = "index.html")] 10 | struct GraphTemplate { 11 | graph: Graph, 12 | } 13 | 14 | #[derive(Serialize, Debug, Clone)] 15 | struct Graph { 16 | nodes: HashSet, 17 | edges: HashSet, 18 | } 19 | 20 | // TODO: If i add more information, it could be great to implement 21 | // (PartialEq, Eq, Hash) to have accurate hashset 22 | #[derive(Serialize, Debug, Clone, Eq, PartialEq, Hash)] 23 | struct GraphNode { 24 | id: String, 25 | label: String, 26 | images: Vec, 27 | comments: Vec, 28 | inputs: Vec, 29 | } 30 | 31 | impl GraphNode { 32 | fn from_node(node: &Node) -> Self { 33 | Self { 34 | id: node.id.clone(), 35 | label: node.url.to_string(), 36 | images: node 37 | .images 38 | .as_deref() 39 | .unwrap_or_default() 40 | .iter() 41 | .map(|url| url.to_string()) 42 | .collect(), 43 | comments: node.comments.as_deref().unwrap_or_default().to_vec(), 44 | inputs: node.inputs.as_deref().unwrap_or_default().to_vec(), 45 | } 46 | } 47 | } 48 | 49 | #[derive(Serialize, Debug, Clone, Eq, PartialEq, Hash)] 50 | struct GraphEdge { 51 | from: String, 52 | to: String, 53 | } 54 | 55 | impl Graph { 56 | fn from_root(node: &Node) -> Self { 57 | let (mut nodes, mut edges) = (HashSet::::new(), 
HashSet::::new()); 58 | let graph_child = Graph::by_children(node); 59 | nodes.extend(graph_child.nodes); 60 | edges.extend(graph_child.edges); 61 | Graph { nodes, edges } 62 | } 63 | 64 | fn by_children(node: &Node) -> Self { 65 | let (mut nodes, mut edges) = ( 66 | HashSet::from([GraphNode::from_node(node)]), 67 | HashSet::::new(), 68 | ); 69 | for child in node.children.clone() { 70 | if !child.lock().unwrap().explored { 71 | continue; 72 | } 73 | edges.insert(GraphEdge { 74 | from: node.id.clone(), 75 | to: child.lock().unwrap().id.clone(), 76 | }); 77 | let graph = Graph::by_children(&child.lock().unwrap()); 78 | nodes.extend(graph.nodes); 79 | edges.extend(graph.edges); 80 | } 81 | Graph { nodes, edges } 82 | } 83 | } 84 | 85 | // NOTE : we suppose we search from the root and thus we will never need to look at parents 86 | // If we want to support multiple root, we'll have to rethink this 87 | pub fn render(root: &Node) -> Result<(), GraphError> { 88 | let template = GraphTemplate { 89 | graph: Graph::from_root(root), 90 | }; 91 | let html = template.render().map_err(|e| GraphError(e.to_string()))?; 92 | let mut temp_file_path = std::env::temp_dir(); 93 | temp_file_path.push(root.url.domain().unwrap().to_owned() + ".html"); 94 | fs::write(&temp_file_path, html).expect("Failed to write to named file"); 95 | let temp_file_path_str = temp_file_path.to_str().expect("Failed to get file path"); 96 | webbrowser::open(temp_file_path_str).expect("Failed to open in web browser"); 97 | Ok(()) 98 | } 99 | 100 | pub struct GraphError(String); 101 | 102 | impl GraphError { 103 | fn print(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 104 | write!(f, "{}: {}", "Graph error".red(), self.0) 105 | } 106 | } 107 | 108 | impl fmt::Display for GraphError { 109 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 110 | self.print(f) 111 | } 112 | } 113 | 114 | impl fmt::Debug for GraphError { 115 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 116 | 
self.print(f) 117 | } 118 | } 119 | 120 | impl std::error::Error for GraphError {} 121 | -------------------------------------------------------------------------------- /src/format.rs: -------------------------------------------------------------------------------- 1 | use url::Url; 2 | 3 | use crate::cli::{Content, Display, Format}; 4 | 5 | use super::node::Node; 6 | 7 | impl Node { 8 | pub fn format( 9 | node: &mut Node, 10 | contents: &Vec, 11 | cmd: &Display, 12 | ) -> std::result::Result<(), FormatError> { 13 | let format = match cmd { 14 | Display::Print { format } | Display::Save { format, .. } => format, 15 | _ => return Err(FormatError::Graph), 16 | }; 17 | 18 | match format { 19 | Format::Json => Node::aggregate_json(node, contents), 20 | Format::Raw => Node::aggregate_raw(node, contents), 21 | } 22 | } 23 | 24 | fn aggregate_json( 25 | node: &mut Node, 26 | contents: &Vec, 27 | ) -> std::result::Result<(), FormatError> { 28 | let mut datas: Vec = Vec::new(); 29 | for content in contents { 30 | datas.append(&mut Self::format_json(node, content)); 31 | } 32 | node.output = serde_json::to_string(&datas).ok(); 33 | Ok(()) 34 | } 35 | 36 | fn aggregate_raw( 37 | node: &mut Node, 38 | contents: &Vec, 39 | ) -> std::result::Result<(), FormatError> { 40 | let mut datas: Vec = Vec::new(); 41 | for content in contents { 42 | datas.append(&mut Self::format_raw(node, content)) 43 | } 44 | node.output = Some(datas.join("\n")); 45 | Ok(()) 46 | } 47 | 48 | fn format_raw(node: &mut Node, content: &Content) -> Vec { 49 | match content { 50 | Content::Texts => node.texts.take().unwrap(), 51 | Content::Comments => node.comments.take().unwrap(), 52 | Content::Links => urls_string(node.links.take().unwrap()), 53 | Content::Images => urls_string(node.images.take().unwrap()), 54 | Content::Inputs => node.inputs.take().unwrap(), 55 | Content::All => vec![ 56 | node.texts.take().unwrap(), 57 | node.comments.take().unwrap(), 58 | urls_string(node.links.take().unwrap()), 59 | 
urls_string(node.images.take().unwrap()), 60 | node.inputs.take().unwrap(), 61 | ] 62 | .into_iter() 63 | .flatten() 64 | .collect(), 65 | } 66 | } 67 | 68 | fn format_json(node: &mut Node, content: &Content) -> Vec { 69 | match content { 70 | Content::Texts => Data::json(node.texts.take().unwrap(), Content::Texts), 71 | Content::Comments => Data::json(node.comments.take().unwrap(), Content::Comments), 72 | Content::Links => Data::json(urls_string(node.links.take().unwrap()), Content::Links), 73 | Content::Images => { 74 | Data::json(urls_string(node.images.take().unwrap()), Content::Images) 75 | } 76 | Content::Inputs => Data::json(node.inputs.take().unwrap(), Content::Inputs), 77 | Content::All => vec![ 78 | Data::json(node.texts.take().unwrap(), Content::Texts), 79 | Data::json(node.comments.take().unwrap(), Content::Comments), 80 | Data::json(urls_string(node.links.take().unwrap()), Content::Links), 81 | Data::json(urls_string(node.images.take().unwrap()), Content::Images), 82 | Data::json(node.inputs.take().unwrap(), Content::Inputs), 83 | ] 84 | .into_iter() 85 | .flatten() 86 | .collect(), 87 | } 88 | } 89 | } 90 | 91 | fn urls_string(urls: Vec) -> Vec { 92 | urls.into_iter().map(|link| link.to_string()).collect() 93 | } 94 | 95 | #[derive(serde::Serialize)] 96 | struct Data { 97 | r#type: Content, 98 | content: String, 99 | } 100 | 101 | impl Data { 102 | fn json(datas: Vec, content: Content) -> Vec { 103 | datas 104 | .into_iter() 105 | .map(|data| Data { 106 | r#type: content, 107 | content: data, 108 | }) 109 | .collect() 110 | } 111 | } 112 | 113 | use std::error; 114 | use std::fmt; 115 | 116 | #[derive(Debug)] 117 | pub enum FormatError { 118 | Serde, 119 | Graph, 120 | } 121 | 122 | impl From for FormatError { 123 | fn from(_: serde_json::Error) -> Self { 124 | FormatError::Serde 125 | } 126 | } 127 | 128 | impl fmt::Display for FormatError { 129 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 130 | write!(f, "error in format data") 131 | } 
132 | } 133 | 134 | impl error::Error for FormatError {} 135 | -------------------------------------------------------------------------------- /src/main.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | error, process, 3 | sync::{Arc, Mutex}, 4 | }; 5 | 6 | use colored::Colorize; 7 | use tokio::{sync::Semaphore, task::JoinSet}; 8 | 9 | mod browser; 10 | mod cli; 11 | mod config; 12 | mod display; 13 | mod extract; 14 | mod format; 15 | mod graph; 16 | mod node; 17 | mod state; 18 | 19 | use browser::Browser; 20 | use config::Config; 21 | use node::Node; 22 | use state::State; 23 | 24 | static PERMITS: Semaphore = Semaphore::const_new(0); 25 | 26 | async fn run() -> Result<(), Box> { 27 | let conf = Config::new()?; 28 | let mut state = State::new(Arc::clone(&conf.root))?; 29 | println!("Crawling"); 30 | PERMITS.add_permits(conf.args.thread as usize); 31 | while state.pop_layer().is_some() { 32 | println!("=== Depth {} ===", state.current_depth); 33 | 34 | let mut handles = browse_layer(&mut state, &conf).await?; 35 | let childs = parse_layer(&mut state, &conf, &mut handles).await?; 36 | state.add_to_next_layer(childs); 37 | if state.current_depth == conf.args.depth { 38 | break; 39 | } 40 | state.current_depth += 1; 41 | println!(); 42 | } 43 | 44 | println!("Formatting"); 45 | format(&conf)?; 46 | 47 | println!("Displaying"); 48 | display(&conf)?; 49 | Ok(()) 50 | } 51 | 52 | fn format(conf: &Config) -> Result<(), Box> { 53 | let mut format = |node: &mut Node| { 54 | Node::format(node, &conf.args.content, &conf.args.cmd).map_err(Into::into) 55 | }; // Need to convert FormatError to Box< ... 
56 | Node::explore(&conf.root, &mut format) 57 | } 58 | 59 | fn display(conf: &Config) -> Result<(), Box> { 60 | let mut display = |node: &mut Node| Node::display(node, &conf.args.cmd).map_err(Into::into); 61 | Node::explore(&conf.root, &mut display) 62 | } 63 | 64 | type FuturesBrowse = JoinSet<(Result, Arc>)>; 65 | 66 | // Browse the current layer and generate the chromium browser for the page 67 | async fn browse_layer( 68 | state: &mut State, 69 | config: &Config, 70 | ) -> Result> { 71 | let mut handles: FuturesBrowse = JoinSet::new(); 72 | while let Some(node) = state.current_layer.pop() { 73 | if !config.same_domain(&node.lock().unwrap().url) 74 | || state.known(&node) 75 | || !config.in_bound(&node.lock().unwrap().url) 76 | { 77 | continue; 78 | } 79 | 80 | let permit = PERMITS.acquire().await?; 81 | println!("Visiting {}", node.lock().unwrap().url.as_str().green()); 82 | handles.spawn(async move { 83 | let _permit = permit; 84 | let url = node.lock().unwrap().url.clone(); 85 | (Browser::new_navigate(&url), node) 86 | }); 87 | } 88 | Ok(handles) 89 | } 90 | 91 | // Parse every page of the layer and extract useful information 92 | // TODO: refactor that shit 93 | async fn parse_layer( 94 | state: &mut State, 95 | config: &Config, 96 | handles: &mut FuturesBrowse, 97 | ) -> Result>>, Box> { 98 | println!("Collecting data from every url of the layer"); 99 | let mut total_count = 0; 100 | let mut next_layer_childs = Vec::new(); 101 | while let Some(handle) = handles.join_next().await { 102 | let (browser, parent) = handle?; 103 | let mut explore_external = false; 104 | let links = browser?.parse_document(&config.args.content, &parent).await; 105 | 106 | let links = links.into_iter().filter_map(|link| { 107 | if config.same_domain(&link) { 108 | Some(link) 109 | } else if state.current_external < config.args.external { 110 | if !explore_external { 111 | explore_external = true; 112 | state.current_external += 1; 113 | } 114 | Some(link) 115 | } else { 116 | None 
117 | } 118 | }); 119 | 120 | parent.lock().unwrap().explored = true; 121 | 122 | let mut childs: Vec>> = links 123 | .map(|url| Node::new_arc(Some(&parent), url.clone(), url.to_string())) 124 | .collect(); 125 | total_count += parent.lock().unwrap().quantity_elements() + childs.len(); 126 | next_layer_childs.append(&mut childs); 127 | } 128 | println!( 129 | "Found a total of {} {:?}", 130 | total_count.to_string().green(), 131 | config.args.cmd 132 | ); 133 | Ok(next_layer_childs) 134 | } 135 | 136 | fn main() { 137 | if let Ok(rt) = tokio::runtime::Runtime::new() { 138 | if let Err(e) = rt.block_on(run()) { 139 | eprintln!("error {:?}", e); 140 | process::exit(1); 141 | } 142 | return; 143 | } 144 | eprintln!("Error: can't start the tokio runtime"); 145 | process::exit(2); 146 | } 147 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Coma - Website Scraper 2 | 3 | > Disclaimer: This project is currently on pause. I made some significant changes in how it's used recently (in the last merge), and it hasn't been tested much since. I plan to continue developing this tool with many great features, but for now, I am working on another project. 4 | 5 | ## Overview 6 | Coma is a lightweight command-line tool designed for scraping various types of content from web pages, such as text, comments, links, and images. Its simplicity and flexibility make it easy for users to extract the specific data they need from a given URL. 7 | 8 | ![Logo shrimp](static/shrimp.jpg) 9 | 10 | ## Installation 11 | 12 | You can install Coma either by compiling it locally after cloning the repository or by installing it directly from [crates.io](https://crates.io). 13 | 14 | ### Clone and Compile Locally 15 | 16 | 1. **Clone the repository:** 17 | ```bash 18 | git clone https://github.com/yourusername/coma.git 19 | cd coma 20 | ``` 21 | 22 | 2. 
**Build the project using Cargo:** 23 | ```bash 24 | cargo build --release 25 | ``` 26 | 27 | 3. **Run the compiled binary:** 28 | ```bash 29 | ./target/release/coma --help 30 | ``` 31 | 32 | ### Install from crates.io 33 | 34 | To install Coma from crates.io, use the following command: 35 | ```bash 36 | cargo install coma 37 | ``` 38 | 39 | This will download and compile Coma, making it available for easy use from the command line. 40 | 41 | ## Program Behavior 42 | 43 | ### Command Structure 44 | To use Coma, the basic command structure is as follows: 45 | 46 | ``` 47 | coma [OPTIONS] --url <URL> <COMMAND> 48 | ``` 49 | 50 | Where `<URL>` is the website you want to scrape, and `<COMMAND>` specifies what to do with the extracted data. 51 | 52 | ### Commands 53 | The available commands determine what is done with the extracted content: 54 | 55 | - **print**: Print the extracted content in the terminal. 56 | - **save**: Save the extracted content in files. 57 | - **graph**: Create an HTML topology of the website. 58 | - **help**: Displays the help menu, providing information on usage and available options. 59 | 60 | ### Options 61 | 62 | Coma includes several options to customize its behavior: 63 | 64 | - `-c, --content <CONTENT>`: Specifies the type of content to scrape. Available values are: 65 | - **texts**: Extracts the text present in the HTML of the page. 66 | - **comments**: Extracts any comments found in the HTML (such as those in HTML comment tags). 67 | - **links**: Extracts all hyperlinks from the page, allowing you to see the navigation structure or related pages. 68 | - **images**: Extracts the URLs of images present on the page. 69 | - **inputs**: Extracts input fields from forms on the page. 70 | - **all**: Extracts all the available types of content. (Default: all) 71 | 72 | - `-u, --url <URL>`: Mandatory option to specify the URL to start the scraping process. 
73 | - `-d, --depth <DEPTH>`: Determines how deep the scraper should go from the specified URL: 74 | - `0`: Scrapes only the specified URL. 75 | - `<0`: Enables infinite depth, allowing the scraper to traverse through all linked pages. 76 | - Default is `0`. 77 | 78 | - `-b, --bound <BOUND>`: Sets a filter to include only URLs containing a specific substring. This can be useful for limiting the scraping to a specific domain or section of a website. The default value is an empty string, meaning no filtering is applied. 79 | - `-t, --task <TASK>`: Sets the maximum number of concurrent asynchronous tasks run during scraping. The default is 5, which keeps scraping fast without overwhelming the target server. 80 | - `-e, --external <EXTERNAL>`: Limits how many pages may have their external links (links to other domains) included in the results. Default is 0 (exclude external links). 81 | - `-h, --help`: Prints the help menu for Coma, including usage instructions and command options. 82 | - `-V, --version`: Displays the current version of Coma. 83 | 84 | ## Plan for the Future 85 | 86 | ### Topology 87 | 88 | The current graph cannot represent directed links, which would be a great addition. 89 | 90 | I aim to provide the complete topology of the website based on different heuristics: 91 | - Hierarchy of the website. 92 | - Discovery from the provided link using BFS (Breadth-First Search) and DFS (Depth-First Search). 93 | 94 | ### Content 95 | We could add more command options beyond the current selection: 96 | - Full HTML page 97 | - Regex patterns matched inside the texts, with some useful presets 98 | - More HTML tags 99 | 100 | ### Options 101 | It's important to improve the usability of the tool with these options: 102 | - Output in different formats: CSV, JSON, and maybe more would be useful. 103 | - Proxy 104 | - Cookies and headers 105 | - Download the images directly 106 | 107 | ## Conclusion 108 | Coma is a flexible and straightforward tool for anyone needing to scrape data from websites quickly. 
Users can easily customize their scraping experience through various commands and options, making it suitable for a wide range of web data extraction tasks. 109 | -------------------------------------------------------------------------------- /templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Network Graph in JavaScript 6 | 91 | 92 | 93 | 94 | 95 | 96 |
97 |
98 |

Node Info

99 |
100 |
101 | 217 | 218 | 219 | 220 | --------------------------------------------------------------------------------