├── .envrc
├── .gitignore
├── static
    └── shrimp.jpg
├── tailwind.config.js
├── flake.lock
├── flake.nix
├── Cargo.toml
├── LICENSE
├── src
    ├── display.rs
    ├── state.rs
    ├── config.rs
    ├── node.rs
    ├── extract.rs
    ├── cli.rs
    ├── browser.rs
    ├── graph.rs
    ├── format.rs
    └── main.rs
├── README.md
└── templates
    └── index.html


/.envrc:
--------------------------------------------------------------------------------
1 | use flake
2 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | debug/
2 | target/
3 | 
4 | Cargo.lock
5 | 
6 | **/*.rs.bk
7 | 
8 | *.pdb
9 | 


--------------------------------------------------------------------------------
/static/shrimp.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/noahfraiture/coma/HEAD/static/shrimp.jpg


--------------------------------------------------------------------------------
/tailwind.config.js:
--------------------------------------------------------------------------------
 1 | /** @type {import('tailwindcss').Config} */
 2 | module.exports = {
 3 |   darkMode: "media", // Enables dark mode based on the user's system preference
 4 |   theme: {
 5 |     extend: {},
 6 |   },
 7 |   variants: {
 8 |     extend: {},
 9 |   },
10 |   plugins: [],
11 | };
12 | 


--------------------------------------------------------------------------------
/flake.lock:
--------------------------------------------------------------------------------
 1 | {
 2 |   "nodes": {
 3 |     "nixpkgs": {
 4 |       "locked": {
 5 |         "lastModified": 1738142207,
 6 |         "narHash": "sha256-NGqpVVxNAHwIicXpgaVqJEJWeyqzoQJ9oc8lnK9+WC4=",
 7 |         "owner": "nixos",
 8 |         "repo": "nixpkgs",
 9 |         "rev": "9d3ae807ebd2981d593cddd0080856873139aa40",
10 |         "type": "github"
11 |       },
12 |       "original": {
13 |         "owner": "nixos",
14 |         "ref": "nixos-unstable",
15 |         "repo": "nixpkgs",
16 |         "type": "github"
17 |       }
18 |     },
19 |     "root": {
20 |       "inputs": {
21 |         "nixpkgs": "nixpkgs"
22 |       }
23 |     }
24 |   },
25 |   "root": "root",
26 |   "version": 7
27 | }
28 | 


--------------------------------------------------------------------------------
/flake.nix:
--------------------------------------------------------------------------------
 1 | {
 2 |   inputs = {
 3 |     nixpkgs.url = "github:nixos/nixpkgs?ref=nixos-unstable";
 4 |   };
 5 | 
 6 |   outputs =
 7 |     { self, nixpkgs }:
 8 |     let
 9 |       pkgs = import nixpkgs {
10 |         system = "x86_64-linux";
11 |         config.allowUnfree = true;
12 |       };
13 |     in
14 |     {
15 |       devShells."x86_64-linux".default = pkgs.mkShell {
16 | 
17 |         buildInputs = with pkgs; [
18 |           pkg-config
19 |         ];
20 | 
21 |         packages = with pkgs; [
22 |           cargo
23 |           rustc
24 |           openssl
25 |         ];
26 | 
27 |         shellHook = ''
28 |           export DIRENV='rust'
29 |         '';
30 |       };
31 |     };
32 | }
33 | 


--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "coma"
 3 | version = "0.2.3"
 4 | edition = "2021"
 5 | authors = ["Noah <pro@noahcode.dev>"]
 6 | license = "MIT"
 7 | description = "Coma is a lightweight command-line tool designed for crawling websites"
 8 | repository = "https://github.com/noahfraiture/coma"
 9 | readme = "README.md"
10 | 
11 | keywords = ["scraping", "crawler", "web-discovery"]
12 | categories = ["web-programming"]
13 | 
14 | [dependencies]
15 | anyhow = "1.0.86"
16 | askama = { version = "0.12.1", features = ["serde-json"] }
17 | chrono = "0.4.38"
18 | clap = { version = "4.5.11", features = ["derive"] }
19 | colored = "2.1.0"
20 | futures = "0.3.30"
21 | headless_chrome = { version = "1.0.12", features = ["fetch"] }
22 | markup5ever = "0.12.1"
23 | reqwest = { version = "0.12.5", features = ["blocking"] }
24 | scraper = "0.19.1"
25 | serde = { version = "1.0.204", features = ["derive", "rc"] }
26 | serde_json = "1.0.127"
27 | tempfile = "3.11.0"
28 | tokio = { version = "1.39.2", features = ["full"] }
29 | url = "2.5.2"
30 | webbrowser = "1.0.1"
31 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2024 Noah Fraiture
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/src/display.rs:
--------------------------------------------------------------------------------
 1 | use crate::cli::{Display, Format};
 2 | use crate::graph;
 3 | use crate::node::Node;
 4 | 
 5 | impl Node {
 6 |     /// Emits the already formatted `output` of `node` the way `cmd` asks for: printed to
 7 |     /// stdout, saved to `<name>.json` / `<name>.txt`, or rendered as a graph.
 8 |     ///
 9 |     /// Panics if `output` is `None` for `Print` or `Save`.
10 |     pub fn display(node: &mut Node, cmd: &Display) -> std::result::Result<(), CommandError> {
11 |         match cmd {
12 |             Display::Graph => graph::render(node)?,
13 |             Display::Print { .. } => println!("{}", node.output.as_ref().unwrap()),
14 |             Display::Save { format, name } => {
15 |                 let output = node.output.as_ref().unwrap();
16 |                 let extension = match format {
17 |                     Format::Raw => "txt",
18 |                     Format::Json => "json",
19 |                 };
20 |                 // `File::create` truncates an existing file of the same name.
21 |                 let file_name = format!("{name}.{extension}");
22 |                 let mut file = File::create(Path::new(&file_name))?;
23 |                 file.write_all(output.as_bytes())?;
24 |             }
25 |         }
26 |         Ok(())
27 |     }
28 | }
29 | 
30 | use std::error;
31 | use std::fmt;
32 | use std::fs::File;
33 | use std::io::Write;
34 | use std::path::Path;
35 | 
36 | #[derive(Debug)]
37 | pub enum CommandError {
38 |     Graph,
39 |     IO(std::io::Error),
40 | }
41 | 
42 | impl fmt::Display for CommandError {
43 |     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
44 |         write!(f, "error in command data")
45 |     }
46 | }
47 | 
48 | impl From<graph::GraphError> for CommandError {
49 |     fn from(_: graph::GraphError) -> Self {
50 |         CommandError::Graph
51 |     }
52 | }
53 | 
54 | impl From<std::io::Error> for CommandError {
55 |     fn from(value: std::io::Error) -> Self {
56 |         CommandError::IO(value)
57 |     }
58 | }
59 | 
60 | impl error::Error for CommandError {}
61 | 


--------------------------------------------------------------------------------
/src/state.rs:
--------------------------------------------------------------------------------
 1 | use crate::node::Node;
 2 | use std::{
 3 |     collections::LinkedList,
 4 |     sync::{Arc, Mutex},
 5 | };
 6 | 
 7 | pub struct State {
 8 |     pub current_depth: i32,
 9 |     pub current_external: i32,
10 |     visited: Vec<Arc<Mutex<Node>>>,
11 |     layers: LinkedList<Vec<Arc<Mutex<Node>>>>,
12 |     pub current_layer: Vec<Arc<Mutex<Node>>>,
13 | }
14 | 
15 | impl State {
16 |     /// Creates the crawl state with `root` queued as the only node of the first layer.
17 |     pub fn new(root: Arc<Mutex<Node>>) -> Result<Self, Box<dyn std::error::Error>> {
18 |         // Queue of layers: each list entry holds the nodes discovered for one depth.
19 |         let mut layers: LinkedList<Vec<Arc<Mutex<Node>>>> = LinkedList::new();
20 |         layers.push_back(vec![root]);
21 |         Ok(State {
22 |             current_depth: 0,
23 |             current_external: 0,
24 |             visited: Vec::new(),
25 |             layers,
26 |             current_layer: Vec::new(), // placeholder, replaced by the first `pop_layer`
27 |         })
28 |     }
29 | 
30 |     /// Moves the front queued layer into `current_layer`; returns `None` when none is queued.
31 |     pub fn pop_layer(&mut self) -> Option<()> {
32 |         self.current_layer = self.layers.pop_front()?;
33 |         Some(())
34 |     }
35 | 
36 |     /// Appends `links` to the front queued layer, creating it first when the queue is empty.
37 |     pub fn add_to_next_layer(&mut self, links: Vec<Arc<Mutex<Node>>>) {
38 |         if self.layers.is_empty() {
39 |             self.layers.push_back(Vec::new());
40 |         }
41 |         self.layers.front_mut().unwrap().extend(links);
42 |     }
43 | 
44 |     /// Returns `true` when an equal node was already recorded; otherwise records `node`.
45 |     pub fn known(&mut self, node: &Arc<Mutex<Node>>) -> bool {
46 |         // Identity is tested first: locking the same mutex twice on one thread never returns.
47 |         let already_known = self.visited.iter().any(|seen| {
48 |             Arc::ptr_eq(seen, node) || seen.lock().unwrap().eq(&node.lock().unwrap())
49 |         });
50 |         if !already_known {
51 |             self.visited.push(Arc::clone(node));
52 |         }
53 |         already_known
54 |     }
55 | }
56 | 


--------------------------------------------------------------------------------
/src/config.rs:
--------------------------------------------------------------------------------
 1 | use std::fmt;
 2 | use std::sync::{Arc, Mutex};
 3 | 
 4 | use crate::cli;
 5 | use crate::node::Node;
 6 | use colored::Colorize;
 7 | use url::Url;
 8 | 
 9 | pub struct Config {
10 |     pub domain: String,
11 |     pub root: Arc<Mutex<Node>>,
12 |     pub args: cli::Cli,
13 | }
14 | 
15 | impl Config {
16 |     /// Parses the command line, then builds the root node from the start URL.
17 |     /// Fails when the arguments are rejected, the URL doesn't parse, or it has no domain.
18 |     pub fn new() -> Result<Self, Box<dyn std::error::Error>> {
19 |         let args = cli::args()?;
20 |         let origin_url = Url::parse(&args.url).map_err(|e| ConfigError::Message(e.to_string()))?;
21 | 
22 |         // Look the domain up once; a missing domain becomes a `ConfigError`, never a panic.
23 |         let domain = origin_url
24 |             .domain()
25 |             .ok_or_else(|| ConfigError::Message("Url doesn't have a domain".to_string()))?
26 |             .to_owned();
27 |         let id = origin_url.to_string();
28 |         let root = Node::new_arc(None, origin_url, id);
29 | 
30 |         Ok(Config { domain, root, args })
31 |     }
32 | 
33 |     /// Whether `url` has the same domain as the start URL (a missing domain counts as "").
34 |     pub fn same_domain(&self, url: &Url) -> bool {
35 |         url.domain().unwrap_or("") == self.domain
36 |     }
37 | 
38 |     /// Whether `url` contains the `bound` argument; the default empty bound matches every URL.
39 |     pub fn in_bound(&self, url: &Url) -> bool {
40 |         url.as_str().contains(&self.args.bound)
41 |     }
42 | }
43 | 
44 | pub enum ConfigError {
45 |     Message(String),
46 | }
47 | 
48 | impl ConfigError {
49 |     fn print(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
50 |         match self {
51 |             ConfigError::Message(s) => {
52 |                 write!(f, "{}: {}", "Config error".red(), s)
53 |             }
54 |         }
55 |     }
56 | }
57 | 
58 | impl fmt::Display for ConfigError {
59 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
60 |         self.print(f)
61 |     }
62 | }
63 | 
64 | impl fmt::Debug for ConfigError {
65 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
66 |         self.print(f)
67 |     }
68 | }
69 | 
70 | impl std::error::Error for ConfigError {}
71 | 


--------------------------------------------------------------------------------
/src/node.rs:
--------------------------------------------------------------------------------
 1 | use std::sync::{Arc, Mutex, Weak};
 2 | use url::Url;
 3 | 
 4 | pub struct Node {
 5 |     pub id: String,
 6 |     pub url: Url,
 7 |     // TODO : should I move the whole Node behind a mutex instead of most fields ?
 8 |     // TODO : I could use a color to show the difference between explored and unexplored nodes
 9 |     pub explored: bool, // flag used to know if it will be rendered
10 |     // Every node owns every image found on its page:
11 |     // it makes more sense that each node owns its own copy of the image urls.
12 | 
13 |     // A Mutex is needed to mutate a node that is shared through an Arc
14 |     pub images: Option<Vec<Url>>,
15 |     // Can't directly use scraper::node::{Comment, Text} since they aren't Send/Sync
16 |     // Could try later to impl these traits
17 |     pub comments: Option<Vec<String>>,
18 |     pub texts: Option<Vec<String>>,
19 |     pub inputs: Option<Vec<String>>,
20 |     pub links: Option<Vec<Url>>,
21 |     pub children: Vec<Arc<Mutex<Node>>>,
22 |     pub parents: Vec<Weak<Mutex<Node>>>,
23 | 
24 |     // Output already formatted as wanted
25 |     pub output: Option<String>,
26 | }
27 | 
28 | impl std::hash::Hash for Node {
29 |     fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
30 |         self.id.hash(state);
31 |         self.url.hash(state);
32 |     }
33 | }
34 | 
35 | impl PartialEq for Node {
36 |     fn eq(&self, other: &Self) -> bool {
37 |         other.id == self.id && other.url == self.url
38 |     }
39 | }
40 | 
41 | impl Eq for Node {}
42 | 
43 | impl Node {
44 |     /// Creates a shared, unexplored node; when `parent` is given the two are linked both ways.
45 |     pub fn new_arc(parent: Option<&Arc<Mutex<Node>>>, url: Url, id: String) -> Arc<Mutex<Node>> {
46 |         let node = Arc::new(Mutex::new(Node {
47 |             id,
48 |             url,
49 |             explored: false,
50 |             images: None,
51 |             comments: None,
52 |             texts: None,
53 |             inputs: None,
54 |             links: None,
55 |             children: Vec::new(),
56 |             parents: parent.into_iter().map(Arc::downgrade).collect(),
57 |             output: None,
58 |         }));
59 |         if let Some(owner) = parent {
60 |             owner.lock().unwrap().add_child(&node);
61 |         }
62 |         node
63 |     }
64 |     /// Calls `func` on `node`, then depth-first on each child whose `explored` flag is set.
65 |     pub fn explore(
66 |         node: &Arc<Mutex<Node>>,
67 |         func: Visitor,
68 |     ) -> Result<(), Box<dyn std::error::Error>> {
69 |         func(&mut node.lock().unwrap())?;
70 |         for child in &node.lock().unwrap().children {
71 |             if child.lock().unwrap().explored {
72 |                 Node::explore(child, func)?;
73 |             }
74 |         }
75 |         Ok(())
76 |     }
77 |     /// Registers `child` by pushing another strong handle to it onto `children`.
78 |     pub fn add_child(&mut self, child: &Arc<Mutex<Node>>) {
79 |         self.children.push(Arc::clone(child))
80 |     }
81 |     /// Counts the stored images, comments and texts; inputs and links are not included.
82 |     pub fn quantity_elements(&self) -> usize {
83 |         self.images.as_ref().map_or(0, Vec::len)
84 |             + self.comments.as_ref().map_or(0, Vec::len)
85 |             + self.texts.as_ref().map_or(0, Vec::len)
86 |     }
87 | }
88 | 
89 | type Visitor<'a> = &'a mut dyn FnMut(&mut Node) -> Result<(), Box<dyn std::error::Error>>;
90 | 


--------------------------------------------------------------------------------
/src/extract.rs:
--------------------------------------------------------------------------------
  1 | use std::{
  2 |     collections::HashSet,
  3 |     sync::{Arc, Mutex},
  4 | };
  5 | 
  6 | use markup5ever::local_name;
  7 | use scraper::{node::Element, Html};
  8 | use url::Url;
  9 | 
 10 | use crate::node;
 11 | 
 12 | // TODO : add format
 13 | /// Collects the target of every `<a href>` in `page`, resolved against `url`.
 14 | /// Targets that `Url::join` can't resolve are skipped; duplicates collapse in the set.
 15 | pub fn extract_links(url: &Url, page: &Html) -> HashSet<Url> {
 16 |     HashSet::from_iter(page.tree.values().filter_map(|v| match v {
 17 |         scraper::Node::Element(element) => {
 18 |             // Borrowed, not cloned: only the tag name and the attributes are read.
 19 |             if !matches!(element.name.local, local_name!("a")) {
 20 |                 return None;
 21 |             }
 22 | 
 23 |             // We want the attribute "href"
 24 |             for (key, value) in &element.attrs {
 25 |                 if matches!(key.local, local_name!("href")) {
 26 |                     return Url::join(url, value).ok();
 27 |                 }
 28 |             }
 29 |             None
 30 |         }
 31 |         _ => None,
 32 |     }))
 33 | }
 34 | 
 35 | /// Stores every non-empty HTML comment of `page` in `node.comments`.
 36 | pub fn extract_comments(node: &Arc<Mutex<node::Node>>, page: &Html) {
 37 |     let comments: Vec<String> = page
 38 |         .tree
 39 |         .values()
 40 |         .filter_map(|v| match v {
 41 |             scraper::Node::Comment(comment) => Some(comment.to_string()),
 42 |             _ => None,
 43 |         })
 44 |         .filter(|text| !text.is_empty())
 45 |         .collect();
 46 |     node.lock().unwrap().comments = Some(comments);
 47 | }
 48 | 
 49 | /// Stores every text node of `page` in `node.texts`.
 50 | pub fn extract_texts(node: &Arc<Mutex<node::Node>>, page: &Html) {
 51 |     let mut texts = Vec::new();
 52 |     for value in page.tree.values() {
 53 |         if let scraper::Node::Text(text) = value {
 54 |             texts.push(text.to_string());
 55 |         }
 56 |     }
 57 |     // The node is locked only for the assignment, after the whole page was scanned.
 58 |     node.lock().unwrap().texts = Some(texts);
 59 | }
 60 | 
 61 | /// Runs `filter` over every element of `page` and collects the values it returns.
 62 | /// Elements for which `filter` returns `None` are left out.
 63 | pub fn extract_element<T, F>(page: &Html, filter: F) -> Vec<T>
 64 | where
 65 |     F: Fn(Element) -> Option<T>,
 66 | {
 67 |     let mut found = Vec::new();
 68 |     for value in page.tree.values() {
 69 |         if let scraper::Node::Element(element) = value {
 70 |             // `filter` takes the element by value, hence the clone.
 71 |             found.extend(filter(element.to_owned()));
 72 |         }
 73 |     }
 74 |     found
 75 | }
 76 | 
 77 | /// Stores in `node.images` the `src` of every `<img>` of `page`, resolved against the node url.
 78 | pub fn extract_images(node: &Arc<Mutex<node::Node>>, page: &Html) {
 79 |     // Clone the base url once instead of locking the node again for every `<img>`.
 80 |     let base = node.lock().unwrap().url.clone();
 81 |     node.lock().unwrap().images = Some(extract_element(page, |element: Element| {
 82 |         if !matches!(element.name.local, local_name!("img")) {
 83 |             return None;
 84 |         }
 85 |         // If the url is absolute, the value will replace the base url
 86 |         let src = element.attrs.iter().find(|(key, _)| matches!(key.local, local_name!("src")));
 87 |         src.and_then(|(_, value)| base.join(value).ok())
 88 |     }));
 89 | }
 90 | 
 91 | // TODO : better rendering
 92 | /// Stores the `Debug` rendering of every `<input>` element of `page` in `node.inputs`.
 93 | pub fn extract_input(node: &Arc<Mutex<node::Node>>, page: &Html) {
 94 |     let inputs = extract_element(page, |element: Element| {
 95 |         // Every other tag is filtered out.
 96 |         let is_input = matches!(element.name.local, local_name!("input"));
 97 |         is_input.then(|| format!("{:?}", element))
 98 |     });
 99 |     node.lock().unwrap().inputs = Some(inputs);
100 | }
101 | 


--------------------------------------------------------------------------------
/src/cli.rs:
--------------------------------------------------------------------------------
  1 | use core::fmt;
  2 | 
  3 | use clap::{Parser, Subcommand};
  4 | use colored::Colorize;
  5 | use url::Url;
  6 | 
  7 | /// Website scraper
  8 | #[derive(Parser, Debug)]
  9 | #[command(name = "Coma")]
 10 | #[command(author = "Noah")]
 11 | #[command(version)]
 12 | #[command(help_template = "
 13 | {name} - {about}
 14 | 
 15 | Author: {author}
 16 | Version: {version}
 17 | 
 18 | {usage-heading} {usage}
 19 | {all-args} {tab}")]
 20 | pub struct Cli {
 21 |     /// Action to perform with the data
 22 |     #[command(subcommand)]
 23 |     pub cmd: Display,
 24 | 
 25 |     /// Content to scrap
 26 |     #[arg(short, long, value_delimiter = ',', default_value = "all")]
 27 |     pub content: Vec<Content>,
 28 | 
 29 |     /// Url to start the search
 30 |     #[arg(short, long)]
 31 |     pub url: String,
 32 | 
 33 |     /// Depth to search from the given url, 0 for only the current url, < 0 for infinite depth
 34 |     #[arg(short, long, default_value_t = 0, allow_negative_numbers = true)]
 35 |     pub depth: i32,
 36 | 
 37 |     /// Upper bound in the url, any url that doesn't contains this string will be ignored
 38 |     // TODO : change default to Option<String>
 39 |     #[arg(short, long, default_value = "")]
 40 |     pub bound: String,
 41 | 
 42 |     /// Max number of concurrent thread
 43 |     #[arg(short, long, default_value_t = 5)]
 44 |     pub thread: u32,
 45 | 
 46 |     /// Depth to external websites with a different domain. Depth has priority to stop the search
 47 |     #[arg(short, long, default_value_t = 0)]
 48 |     pub external: i32,
 49 | }
 50 | 
 51 | #[derive(Subcommand, Debug, Clone, PartialEq, Eq, Hash)]
 52 | pub enum Display {
 53 |     /// Print the extracted content in the terminal
 54 |     Print {
 55 |         /// Format of the output
 56 |         #[arg()]
 57 |         format: Format,
 58 |     },
 59 | 
 60 |     /// Save the extracted content in files
 61 |     Save {
 62 |         /// Format of the output
 63 |         #[arg()]
 64 |         format: Format,
 65 | 
 66 |         /// Name of the output file
 67 |         #[arg(short, long, default_value = "output")]
 68 |         name: String,
 69 |     },
 70 | 
 71 |     /// Create a html topolgy
 72 |     Graph,
 73 | }
 74 | 
 75 | #[derive(clap::ValueEnum, Debug, Clone, Copy, PartialEq, serde::Serialize)]
 76 | pub enum Content {
 77 |     /// Extract the text in the html
 78 |     Texts,
 79 | 
 80 |     /// Extract the comments in the html
 81 |     Comments,
 82 | 
 83 |     /// Extract the links found on the page
 84 |     Links,
 85 | 
 86 |     /// Extract the images of the page
 87 |     Images,
 88 | 
 89 |     /// Extract informations about any form
 90 |     Inputs,
 91 | 
 92 |     /// Extract all information and generate a topology
 93 |     All,
 94 | }
 95 | 
 96 | #[derive(clap::ValueEnum, Debug, Clone, PartialEq, Eq, Hash)]
 97 | pub enum Format {
 98 |     /// Create a json file with the data
 99 |     Json,
100 | 
101 |     /// Raw data. Link are not raw href but joined with domain
102 |     Raw,
103 | }
104 | 
105 | pub enum ArgsError {
106 |     InvalidUrl(String),
107 | }
108 | 
109 | impl ArgsError {
110 |     fn print(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
111 |         match self {
112 |             ArgsError::InvalidUrl(url) => write!(f, "{}: {}", "Invalid URL".red(), url),
113 |         }
114 |     }
115 | }
116 | 
117 | impl fmt::Display for ArgsError {
118 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
119 |         self.print(f)
120 |     }
121 | }
122 | 
123 | impl fmt::Debug for ArgsError {
124 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
125 |         self.print(f)
126 |     }
127 | }
128 | 
129 | impl std::error::Error for ArgsError {}
130 | 
131 | /// Parses the command line and rejects a start url that is malformed or has no domain.
132 | pub fn args() -> Result<Cli, ArgsError> {
133 |     let args = Cli::parse();
134 | 
135 |     let url = Url::parse(&args.url).map_err(|e| ArgsError::InvalidUrl(e.to_string()))?;
136 |     if url.domain().is_none() {
137 |         // The error carries the parsed url, not the raw argument.
138 |         return Err(ArgsError::InvalidUrl(url.to_string()));
139 |     }
140 |     Ok(args)
141 | }
142 | 


--------------------------------------------------------------------------------
/src/browser.rs:
--------------------------------------------------------------------------------
  1 | use colored::Colorize;
  2 | use std::fmt;
  3 | use std::sync::Arc;
  4 | 
  5 | use anyhow::Result;
  6 | use headless_chrome::LaunchOptions;
  7 | use std::{collections::HashSet, sync::Mutex};
  8 | 
  9 | use scraper::Html;
 10 | use url::Url;
 11 | 
 12 | use crate::cli::Content;
 13 | use crate::extract;
 14 | use crate::node;
 15 | 
 16 | pub struct Browser {
 17 |     #[allow(dead_code)] // need to keep the browser alive
 18 |     browser: headless_chrome::Browser,
 19 |     pub tab: Arc<headless_chrome::Tab>,
 20 | }
 21 | 
 22 | impl Browser {
 23 |     /// Launches a headless browser (devtools disabled), opens a tab and navigates it to `url`.
 24 |     /// Kept separate from `parse_document` so that the connection can be sent to an async
 25 |     /// task, whereas the parsed `Html` can't be sent across async tasks.
 26 |     pub fn new_navigate(url: &Url) -> Result<Self, BrowseError> {
 27 |         let browser = headless_chrome::Browser::new(
 28 |             LaunchOptions::default_builder()
 29 |                 .devtools(false)
 30 |                 .build()
 31 |                 .map_err(|e| BrowseError::Browser(e.to_string()))?,
 32 |         )?;
 33 |         let tab = browser.new_tab()?;
 34 |         tab.navigate_to(url.as_str())?;
 35 |         tab.wait_until_navigated()?;
 36 |         Ok(Self { browser, tab })
 37 |     }
 38 | 
 39 |     // TODO : replace handle_... by the command and format, and add format
 40 |     /// Parses the tab's current content, stores each kind listed in `contents` on `node`,
 41 |     /// and returns every link found on the page. Panics if the content can't be retrieved.
 42 |     pub async fn parse_document(
 43 |         self,
 44 |         contents: &Vec<Content>,
 45 |         node: &Arc<Mutex<node::Node>>,
 46 |     ) -> HashSet<Url> {
 47 |         let response = self.tab.get_content().unwrap();
 48 |         let document = Html::parse_document(&response);
 49 |         let links = extract::extract_links(&node.lock().unwrap().url, &document);
 50 | 
 51 |         for content in contents {
 52 |             match content {
 53 |                 Content::Texts => {
 54 |                     extract::extract_texts(node, &document);
 55 |                 }
 56 |                 Content::Comments => {
 57 |                     extract::extract_comments(node, &document);
 58 |                 }
 59 |                 Content::Links => {
 60 |                     // NOTE : `links` is cloned because the set itself is returned at the end of
 61 |                     // the function (normally `contents` shouldn't list the same kind twice)
 62 |                     node.lock().unwrap().links = Some(links.clone().into_iter().collect());
 63 |                 }
 64 |                 Content::Images => {
 65 |                     extract::extract_images(node, &document);
 66 |                 }
 67 |                 Content::Inputs => {
 68 |                     extract::extract_input(node, &document);
 69 |                 }
 70 |                 Content::All => {
 71 |                     extract::extract_texts(node, &document);
 72 |                     extract::extract_comments(node, &document);
 73 |                     node.lock().unwrap().links = Some(links.clone().into_iter().collect());
 74 |                     extract::extract_images(node, &document);
 75 |                     extract::extract_input(node, &document);
 76 |                 }
 77 |             };
 78 |         }
 79 |         links
 80 |     }
 81 | }
 82 | 
 83 | // NOTE: this is ok only because the browser is the only one using anyhow
 84 | pub enum BrowseError {
 85 |     Browser(String),
 86 | }
 87 | 
 88 | impl BrowseError {
 89 |     fn print(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
 90 |         match self {
 91 |             BrowseError::Browser(e) => write!(f, "{}: {}", "Browser error".red(), e),
 92 |         }
 93 |     }
 94 | }
 95 | 
 96 | impl fmt::Display for BrowseError {
 97 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
 98 |         self.print(f)
 99 |     }
100 | }
101 | 
102 | impl fmt::Debug for BrowseError {
103 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
104 |         self.print(f)
105 |     }
106 | }
107 | 
108 | impl std::error::Error for BrowseError {}
109 | 
110 | impl From<anyhow::Error> for BrowseError {
111 |     fn from(value: anyhow::Error) -> Self {
112 |         BrowseError::Browser(value.to_string())
113 |     }
114 | }
115 | 


--------------------------------------------------------------------------------
/src/graph.rs:
--------------------------------------------------------------------------------
  1 | use askama::Template;
  2 | use colored::Colorize;
  3 | use serde::Serialize;
  4 | use std::{collections::HashSet, fmt, fs};
  5 | 
  6 | use crate::node::Node;
  7 | 
  8 | #[derive(Template)]
  9 | #[template(path = "index.html")]
 10 | struct GraphTemplate {
 11 |     graph: Graph,
 12 | }
 13 | 
 14 | #[derive(Serialize, Debug, Clone)]
 15 | struct Graph {
 16 |     nodes: HashSet<GraphNode>,
 17 |     edges: HashSet<GraphEdge>,
 18 | }
 19 | 
 20 | // TODO: If i add more information, it could be great to implement
 21 | // (PartialEq, Eq, Hash) to have accurate hashset
 22 | #[derive(Serialize, Debug, Clone, Eq, PartialEq, Hash)]
 23 | struct GraphNode {
 24 |     id: String,
 25 |     label: String,
 26 |     images: Vec<String>,
 27 |     comments: Vec<String>,
 28 |     inputs: Vec<String>,
 29 | }
 30 | 
 31 | impl GraphNode {
 32 |     fn from_node(node: &Node) -> Self {
 33 |         Self {
 34 |             id: node.id.clone(),
 35 |             label: node.url.to_string(),
 36 |             images: node
 37 |                 .images
 38 |                 .as_deref()
 39 |                 .unwrap_or_default()
 40 |                 .iter()
 41 |                 .map(|url| url.to_string())
 42 |                 .collect(),
 43 |             comments: node.comments.as_deref().unwrap_or_default().to_vec(),
 44 |             inputs: node.inputs.as_deref().unwrap_or_default().to_vec(),
 45 |         }
 46 |     }
 47 | }
 48 | 
 49 | #[derive(Serialize, Debug, Clone, Eq, PartialEq, Hash)]
 50 | struct GraphEdge {
 51 |     from: String,
 52 |     to: String,
 53 | }
 54 | 
 55 | impl Graph {
 56 |     /// Builds the graph made of `node` and all of its explored descendants.
 57 |     fn from_root(node: &Node) -> Self {
 58 |         // `by_children` already starts from `node`, so its result is the whole graph.
 59 |         Graph::by_children(node)
 60 |     }
 61 | 
 62 |     /// Collects `node` itself, then recursively every child flagged `explored`,
 63 |     /// adding one edge from `node` to each of those children.
 64 |     fn by_children(node: &Node) -> Self {
 65 |         let mut nodes = HashSet::from([GraphNode::from_node(node)]);
 66 |         let mut edges = HashSet::new();
 67 |         for child in &node.children {
 68 |             // Lock each child once and keep the guard for the flag, the id and the recursion.
 69 |             let child = child.lock().unwrap();
 70 |             if !child.explored {
 71 |                 continue;
 72 |             }
 73 |             edges.insert(GraphEdge {
 74 |                 from: node.id.clone(),
 75 |                 to: child.id.clone(),
 76 |             });
 77 |             let graph = Graph::by_children(&child);
 78 |             nodes.extend(graph.nodes);
 79 |             edges.extend(graph.edges);
 80 |         }
 81 |         Graph { nodes, edges }
 82 |     }
 83 | }
 84 | 
 85 | // NOTE : we assume the search starts from the root, so we never need to look at parents.
 86 | // If we want to support multiple roots, we'll have to rethink this
 87 | /// Renders the graph to `<temp dir>/<domain>.html` and opens it; failures become `GraphError`.
 88 | pub fn render(root: &Node) -> Result<(), GraphError> {
 89 |     let err = |what: &str, cause: String| GraphError(format!("{what}: {cause}"));
 90 |     let graph = Graph::from_root(root);
 91 |     let html = GraphTemplate { graph }.render().map_err(|e| GraphError(e.to_string()))?;
 92 |     let domain = root.url.domain().ok_or_else(|| GraphError("Url doesn't have a domain".into()))?;
 93 |     let path = std::env::temp_dir().join(format!("{domain}.html"));
 94 |     fs::write(&path, html).map_err(|e| err("Failed to write to named file", e.to_string()))?;
 95 |     let path_str = path.to_str().ok_or_else(|| GraphError("Failed to get file path".into()))?;
 96 |     webbrowser::open(path_str).map_err(|e| err("Failed to open in web browser", e.to_string()))?;
 97 |     Ok(())
 98 | }
 99 | 
100 | pub struct GraphError(String);
101 | 
102 | impl GraphError {
103 |     fn print(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
104 |         write!(f, "{}: {}", "Graph error".red(), self.0)
105 |     }
106 | }
107 | 
108 | impl fmt::Display for GraphError {
109 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
110 |         self.print(f)
111 |     }
112 | }
113 | 
114 | impl fmt::Debug for GraphError {
115 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
116 |         self.print(f)
117 |     }
118 | }
119 | 
120 | impl std::error::Error for GraphError {}
121 | 


--------------------------------------------------------------------------------
/src/format.rs:
--------------------------------------------------------------------------------
  1 | use url::Url;
  2 | 
  3 | use crate::cli::{Content, Display, Format};
  4 | 
  5 | use super::node::Node;
  6 | 
  7 | impl Node {
  8 |     pub fn format(
  9 |         node: &mut Node,
 10 |         contents: &Vec<Content>,
 11 |         cmd: &Display,
 12 |     ) -> std::result::Result<(), FormatError> {
 13 |         let format = match cmd {
 14 |             Display::Print { format } | Display::Save { format, .. } => format,
 15 |             _ => return Err(FormatError::Graph),
 16 |         };
 17 | 
 18 |         match format {
 19 |             Format::Json => Node::aggregate_json(node, contents),
 20 |             Format::Raw => Node::aggregate_raw(node, contents),
 21 |         }
 22 |     }
 23 | 
 24 |     fn aggregate_json(
 25 |         node: &mut Node,
 26 |         contents: &Vec<Content>,
 27 |     ) -> std::result::Result<(), FormatError> {
 28 |         let mut datas: Vec<Data> = Vec::new();
 29 |         for content in contents {
 30 |             datas.append(&mut Self::format_json(node, content));
 31 |         }
 32 |         node.output = serde_json::to_string(&datas).ok();
 33 |         Ok(())
 34 |     }
 35 | 
 36 |     fn aggregate_raw(
 37 |         node: &mut Node,
 38 |         contents: &Vec<Content>,
 39 |     ) -> std::result::Result<(), FormatError> {
 40 |         let mut datas: Vec<String> = Vec::new();
 41 |         for content in contents {
 42 |             datas.append(&mut Self::format_raw(node, content))
 43 |         }
 44 |         node.output = Some(datas.join("\n"));
 45 |         Ok(())
 46 |     }
 47 | 
 48 |     fn format_raw(node: &mut Node, content: &Content) -> Vec<String> {
 49 |         match content {
 50 |             Content::Texts => node.texts.take().unwrap(),
 51 |             Content::Comments => node.comments.take().unwrap(),
 52 |             Content::Links => urls_string(node.links.take().unwrap()),
 53 |             Content::Images => urls_string(node.images.take().unwrap()),
 54 |             Content::Inputs => node.inputs.take().unwrap(),
 55 |             Content::All => vec![
 56 |                 node.texts.take().unwrap(),
 57 |                 node.comments.take().unwrap(),
 58 |                 urls_string(node.links.take().unwrap()),
 59 |                 urls_string(node.images.take().unwrap()),
 60 |                 node.inputs.take().unwrap(),
 61 |             ]
 62 |             .into_iter()
 63 |             .flatten()
 64 |             .collect(),
 65 |         }
 66 |     }
 67 | 
 68 |     fn format_json(node: &mut Node, content: &Content) -> Vec<Data> {
 69 |         match content {
 70 |             Content::Texts => Data::json(node.texts.take().unwrap(), Content::Texts),
 71 |             Content::Comments => Data::json(node.comments.take().unwrap(), Content::Comments),
 72 |             Content::Links => Data::json(urls_string(node.links.take().unwrap()), Content::Links),
 73 |             Content::Images => {
 74 |                 Data::json(urls_string(node.images.take().unwrap()), Content::Images)
 75 |             }
 76 |             Content::Inputs => Data::json(node.inputs.take().unwrap(), Content::Inputs),
 77 |             Content::All => vec![
 78 |                 Data::json(node.texts.take().unwrap(), Content::Texts),
 79 |                 Data::json(node.comments.take().unwrap(), Content::Comments),
 80 |                 Data::json(urls_string(node.links.take().unwrap()), Content::Links),
 81 |                 Data::json(urls_string(node.images.take().unwrap()), Content::Images),
 82 |                 Data::json(node.inputs.take().unwrap(), Content::Inputs),
 83 |             ]
 84 |             .into_iter()
 85 |             .flatten()
 86 |             .collect(),
 87 |         }
 88 |     }
 89 | }
 90 | 
 91 | fn urls_string(urls: Vec<Url>) -> Vec<String> {
 92 |     urls.into_iter().map(|link| link.to_string()).collect()
 93 | }
 94 | 
 95 | #[derive(serde::Serialize)]
 96 | struct Data {
 97 |     r#type: Content,
 98 |     content: String,
 99 | }
100 | 
101 | impl Data {
102 |     fn json(datas: Vec<String>, content: Content) -> Vec<Data> {
103 |         datas
104 |             .into_iter()
105 |             .map(|data| Data {
106 |                 r#type: content,
107 |                 content: data,
108 |             })
109 |             .collect()
110 |     }
111 | }
112 | 
113 | use std::error;
114 | use std::fmt;
115 | 
116 | #[derive(Debug)]
117 | pub enum FormatError {
118 |     Serde,
119 |     Graph,
120 | }
121 | 
122 | impl From<serde_json::Error> for FormatError {
123 |     fn from(_: serde_json::Error) -> Self {
124 |         FormatError::Serde
125 |     }
126 | }
127 | 
128 | impl fmt::Display for FormatError {
129 |     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
130 |         write!(f, "error in format data")
131 |     }
132 | }
133 | 
134 | impl error::Error for FormatError {}
135 | 


--------------------------------------------------------------------------------
/src/main.rs:
--------------------------------------------------------------------------------
  1 | use std::{
  2 |     error, process,
  3 |     sync::{Arc, Mutex},
  4 | };
  5 | 
  6 | use colored::Colorize;
  7 | use tokio::{sync::Semaphore, task::JoinSet};
  8 | 
  9 | mod browser;
 10 | mod cli;
 11 | mod config;
 12 | mod display;
 13 | mod extract;
 14 | mod format;
 15 | mod graph;
 16 | mod node;
 17 | mod state;
 18 | 
 19 | use browser::Browser;
 20 | use config::Config;
 21 | use node::Node;
 22 | use state::State;
 23 | 
 24 | static PERMITS: Semaphore = Semaphore::const_new(0);
 25 | 
 26 | async fn run() -> Result<(), Box<dyn std::error::Error>> {
 27 |     let conf = Config::new()?;
 28 |     let mut state = State::new(Arc::clone(&conf.root))?;
 29 |     println!("Crawling");
 30 |     PERMITS.add_permits(conf.args.thread as usize);
 31 |     while state.pop_layer().is_some() {
 32 |         println!("=== Depth {} ===", state.current_depth);
 33 | 
 34 |         let mut handles = browse_layer(&mut state, &conf).await?;
 35 |         let childs = parse_layer(&mut state, &conf, &mut handles).await?;
 36 |         state.add_to_next_layer(childs);
 37 |         if state.current_depth == conf.args.depth {
 38 |             break;
 39 |         }
 40 |         state.current_depth += 1;
 41 |         println!();
 42 |     }
 43 | 
 44 |     println!("Formatting");
 45 |     format(&conf)?;
 46 | 
 47 |     println!("Displaying");
 48 |     display(&conf)?;
 49 |     Ok(())
 50 | }
 51 | 
 52 | fn format(conf: &Config) -> Result<(), Box<dyn std::error::Error>> {
 53 |     let mut format = |node: &mut Node| {
 54 |         Node::format(node, &conf.args.content, &conf.args.cmd).map_err(Into::into)
 55 |     }; // Need to convert FormatError to Box< ...
 56 |     Node::explore(&conf.root, &mut format)
 57 | }
 58 | 
 59 | fn display(conf: &Config) -> Result<(), Box<dyn std::error::Error>> {
 60 |     let mut display = |node: &mut Node| Node::display(node, &conf.args.cmd).map_err(Into::into);
 61 |     Node::explore(&conf.root, &mut display)
 62 | }
 63 | 
 64 | type FuturesBrowse = JoinSet<(Result<Browser, browser::BrowseError>, Arc<Mutex<Node>>)>;
 65 | 
 66 | // Browse the current layer and generate the chromium browser for the page
 67 | async fn browse_layer(
 68 |     state: &mut State,
 69 |     config: &Config,
 70 | ) -> Result<FuturesBrowse, Box<dyn error::Error>> {
 71 |     let mut handles: FuturesBrowse = JoinSet::new();
 72 |     while let Some(node) = state.current_layer.pop() {
 73 |         if !config.same_domain(&node.lock().unwrap().url)
 74 |             || state.known(&node)
 75 |             || !config.in_bound(&node.lock().unwrap().url)
 76 |         {
 77 |             continue;
 78 |         }
 79 | 
 80 |         let permit = PERMITS.acquire().await?;
 81 |         println!("Visiting {}", node.lock().unwrap().url.as_str().green());
 82 |         handles.spawn(async move {
 83 |             let _permit = permit;
 84 |             let url = node.lock().unwrap().url.clone();
 85 |             (Browser::new_navigate(&url), node)
 86 |         });
 87 |     }
 88 |     Ok(handles)
 89 | }
 90 | 
 91 | // Parse every page of the layer and extract useful information
 92 | // TODO: refactor that shit
 93 | async fn parse_layer(
 94 |     state: &mut State,
 95 |     config: &Config,
 96 |     handles: &mut FuturesBrowse,
 97 | ) -> Result<Vec<Arc<Mutex<Node>>>, Box<dyn std::error::Error>> {
 98 |     println!("Collecting data from every url of the layer");
 99 |     let mut total_count = 0;
100 |     let mut next_layer_childs = Vec::new();
101 |     while let Some(handle) = handles.join_next().await {
102 |         let (browser, parent) = handle?;
103 |         let mut explore_external = false;
104 |         let links = browser?.parse_document(&config.args.content, &parent).await;
105 | 
106 |         let links = links.into_iter().filter_map(|link| {
107 |             if config.same_domain(&link) {
108 |                 Some(link)
109 |             } else if state.current_external < config.args.external {
110 |                 if !explore_external {
111 |                     explore_external = true;
112 |                     state.current_external += 1;
113 |                 }
114 |                 Some(link)
115 |             } else {
116 |                 None
117 |             }
118 |         });
119 | 
120 |         parent.lock().unwrap().explored = true;
121 | 
122 |         let mut childs: Vec<Arc<Mutex<Node>>> = links
123 |             .map(|url| Node::new_arc(Some(&parent), url.clone(), url.to_string()))
124 |             .collect();
125 |         total_count += parent.lock().unwrap().quantity_elements() + childs.len();
126 |         next_layer_childs.append(&mut childs);
127 |     }
128 |     println!(
129 |         "Found a total of {} {:?}",
130 |         total_count.to_string().green(),
131 |         config.args.cmd
132 |     );
133 |     Ok(next_layer_childs)
134 | }
135 | 
136 | fn main() {
137 |     if let Ok(rt) = tokio::runtime::Runtime::new() {
138 |         if let Err(e) = rt.block_on(run()) {
139 |             eprintln!("error {:?}", e);
140 |             process::exit(1);
141 |         }
142 |         return;
143 |     }
144 |     eprintln!("Error: can't start the tokio runtime");
145 |     process::exit(2);
146 | }
147 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Coma - Website Scraper
  2 | 
  3 | > Disclaimer: This project is currently on pause. I made some significant changes in how it's used recently (in the last merge), and it hasn't been tested much since. I plan to continue developing this tool with many great features, but for now, I am working on another project.
  4 | 
  5 | ## Overview
  6 | Coma is a lightweight command-line tool designed for scraping various types of content from web pages, such as text, comments, links, and images. Its simplicity and flexibility make it easy for users to extract the specific data they need from a given URL.
  7 | 
  8 | ![Logo shrimp](static/shrimp.jpg)
  9 | 
 10 | ## Installation
 11 | 
 12 | You can install Coma either by compiling it locally after cloning the repository or by installing it directly from [crates.io](https://crates.io).
 13 | 
 14 | ### Clone and Compile Locally
 15 | 
 16 | 1. **Clone the repository:**
 17 |    ```bash
   git clone https://github.com/noahfraiture/coma.git
 19 |    cd coma
 20 |    ```
 21 | 
 22 | 2. **Build the project using Cargo:**
 23 |    ```bash
 24 |    cargo build --release
 25 |    ```
 26 | 
 27 | 3. **Run the compiled binary:**
 28 |    ```bash
 29 |    ./target/release/coma --help
 30 |    ```
 31 | 
 32 | ### Install from crates.io
 33 | 
 34 | To install Coma from crates.io, use the following command:
 35 | ```bash
 36 | cargo install coma
 37 | ```
 38 | 
 39 | This will download and compile Coma, making it available for easy use from the command line.
 40 | 
 41 | ## Program Behavior
 42 | 
 43 | ### Command Structure
 44 | To use Coma, the basic command structure is as follows:
 45 | 
 46 | ```
 47 | coma [OPTIONS] --url <URL> <COMMAND>
 48 | ```
 49 | 
 50 | Where `<URL>` is the website you want to scrape, and `<COMMAND>` specifies what type of data you wish to extract.
 51 | 
 52 | ### Commands
 53 | The available commands enable you to target specific content on the web page:
 54 | 
 55 | - **print**: Print the extracted content in the terminal.
 56 | - **save**: Save the extracted content in files.
 57 | - **graph**: Create an HTML topology of the website.
- **help**: Displays the help menu, providing information on usage and available options.
 59 | 
 60 | ### Options
 61 | 
 62 | Coma includes several options to customize its behavior:
 63 | 
 64 | - `-c, --content <CONTENT>`: Specifies the type of content to scrape. Available values are:
 65 |    - **texts**: Extracts the text present in the HTML of the page.
 66 |    - **comments**: Extracts any comments found in the HTML (such as those in HTML comment tags).
 67 |    - **links**: Extracts all hyperlinks from the page, allowing you to see the navigation structure or related pages.
 68 |    - **images**: Extracts the URLs of images present on the page.
 69 |    - **inputs**: Extracts input fields from forms on the page.
 70 |    - **all**: Extracts all the available types of content. (Default: all)
 71 | 
 72 | - `-u, --url <URL>`: Mandatory option to specify the URL to start the scraping process.
 73 | - `-d, --depth <DEPTH>`: Determines how deep the scraper should go from the specified URL:
 74 |    - `0`: Scrapes only the specified URL.
 75 |    - `<0`: Enables infinite depth, allowing the scraper to traverse through all linked pages.
 76 |    - Default is `0`.  
 77 | 
 78 | - `-b, --bound <BOUND>`: Sets a filter to include only URLs containing a specific substring. This can be useful for limiting the scraping to a specific domain or section of a website. The default value is an empty string, meaning no filtering is applied.
 79 | - `-t, --task <TASK>`: Sets the maximum number of concurrent asynchronous tasks to be made during scraping. The default is set to 5, which balances speed and performance without overwhelming the target server.
- `-e, --external <EXTERNAL>`: Sets how many crawled pages may contribute links to external domains; once this budget is used up, external links are ignored. Default is 0 (exclude external links).
 81 | - `-h, --help`: Prints the help menu for Coma, including usage instructions and command options.
 82 | - `-V, --version`: Displays the current version of Coma.
 83 | 
 84 | ## Plan for the Future
 85 | 
 86 | ### Topology
 87 | 
The current graph does not support directed links, which would be a great addition.
 89 | 
 90 | I aim to provide the complete topology of the website based on different heuristics:
 91 | - Hierarchy of the website.
 92 | - Discovery from the provided link using BFS (Breadth-First Search) and DFS (Depth-First Search).
 93 | 
 94 | ### Content
 95 | We could add more command options beyond the current selection:
 96 | - Full HTML page
 97 | - Regex patterns inside the texts with some useful preset
- More HTML tags
 99 | 
100 | ### Options
101 | It's important to improve the usability of the tool with these options:
- Output in different formats: it would be useful to have CSV, JSON, and maybe more.
103 | - Proxy
- Cookies and headers
105 | - Download the images directly
106 | 
107 | ## Conclusion
108 | Coma is a flexible and straightforward tool for anyone needing to scrape data from websites quickly. Users can easily customize their scraping experience through various commands and options, making it suitable for a wide range of web data extraction tasks.
109 | 


--------------------------------------------------------------------------------
/templates/index.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | <html>
  3 | 
  4 | <head>
  5 |   <title>Network Graph in JavaScript</title>
  6 |   <style type="text/css">
  7 |     html,
  8 |     body {
  9 |       width: 100%;
 10 |       height: 100%;
 11 |       margin: 0;
 12 |       padding: 0;
 13 |       overflow: hidden;
 14 |     }
 15 | 
 16 |     #container {
 17 |       width: 100%;
 18 |       height: 100%;
 19 |       position: relative;
 20 |     }
 21 | 
 22 |     #infoPanel {
 23 |       position: absolute;
 24 |       top: 0;
 25 |       right: 0;
 26 |       width: 300px;
 27 |       height: 100%;
 28 |       background-color: white;
 29 |       border-left: 1px solid #ddd;
 30 |       box-shadow: -2px 0 5px rgba(0, 0, 0, 0.1);
 31 |       padding: 16px;
 32 |       box-sizing: border-box;
 33 |       display: none;
 34 |       /* Hidden by default */
 35 |       overflow-y: auto;
 36 |     }
 37 | 
 38 |     #infoPanel h3 {
 39 |       margin: 0 0 10px;
 40 |       font-size: 18px;
 41 |     }
 42 | 
 43 |     #infoPanel p {
 44 |       margin: 0;
 45 |       color: #333;
 46 |     }
 47 | 
 48 |     .image-grid-container {
 49 |       max-height: 200px;
 50 |       /* Set a max height for the container */
 51 |       overflow-y: auto;
 52 |       /* Enable vertical scrolling if needed */
 53 |     }
 54 | 
 55 |     .image-grid {
 56 |       display: grid;
 57 |       grid-template-columns: repeat(auto-fill, minmax(50px, 1fr));
 58 |       gap: 5px;
 59 |     }
 60 | 
 61 |     .image-grid img {
 62 |       width: 100%;
 63 |       height: auto;
 64 |       display: block;
 65 |     }
 66 | 
 67 |     .comments-list {
 68 |       margin: 10px 0;
 69 |       padding: 0;
 70 |       list-style-type: none;
 71 |     }
 72 | 
 73 |     .comments-list li {
 74 |       margin-bottom: 5px;
 75 |       padding: 5px;
 76 |       border-bottom: 1px solid #ddd;
 77 |     }
 78 | 
 79 |     .inputs-list {
 80 |       margin: 10px 0;
 81 |       padding: 0;
 82 |       list-style-type: none;
 83 |     }
 84 | 
 85 |     .inputs-list li {
 86 |       margin-bottom: 5px;
 87 |       padding: 5px;
 88 |       border-bottom: 1px solid #ddd;
 89 |     }
 90 |   </style>
 91 |   <script src="https://cdn.anychart.com/releases/8.12.1/js/anychart-core.min.js"></script>
 92 |   <script src="https://cdn.anychart.com/releases/8.12.1/js/anychart-graph.min.js"></script>
 93 | </head>
 94 | 
 95 | <body>
 96 |   <div id="container"></div>
 97 |   <div id="infoPanel">
 98 |     <h3>Node Info</h3>
 99 |     <div id="infoContent"></div>
100 |   </div>
101 |   <script>
102 |     anychart.onDocumentReady(function () {
103 |       // Function to escape HTML characters
104 |       function escapeHtml(text) {
105 |         return text
106 |           .replace(/&/g, "&amp;")
107 |           .replace(/</g, "&lt;")
108 |           .replace(/>/g, "&gt;")
109 |           .replace(/"/g, "&quot;")
110 |           .replace(/'/g, "&#039;");
111 |       }
112 | 
113 |       // Create data
114 |       const data = {{graph| json | safe}};
115 | 
116 |       var chart = anychart.graph(data);
117 | 
118 |       var nodes = chart.nodes();
119 |       nodes.labels().enabled(true);
120 |       nodes.labels().fontSize(15);
121 |       nodes.labels().fontColor("black");
122 |       nodes.tooltip().useHtml(true);
123 |       nodes.tooltip().format(function (e) {
124 |         const name = e.getData("id");
125 |         const value = e.getData("value");
126 |         return `<b>${name}<br>${value}</b> km in diameter`;
127 |       });
128 | 
129 |       var edges = chart.edges();
130 |       edges.tooltip().useHtml(true);
131 |       edges.tooltip().format(function (e) {
132 |         const from = e.getData("from");
133 |         const to = e.getData("to");
134 |         const distance = e.getData("distance") * 1000000;
135 |         return `From <b>${from}</b> to <b>${to}<br>${distance}</b> km`;
136 |       });
137 |       edges.stroke("lightblue", 2, "10 5");
138 | 
139 |       chart.title("Coma Network Graph");
140 |       chart.container("container");
141 |       chart.draw();
142 | 
143 |       var infoPanel = document.getElementById("infoPanel");
144 |       var infoContent = document.getElementById("infoContent");
145 |       var lastClickedNode = null;
146 | 
147 |       // Click event for nodes
148 |       chart.listen('click', function (e) {
149 | 
150 |         // Find id
151 |         var id = e.domTarget.tag?.id;
152 |         var node;
153 |         for (var n of data.nodes) {
154 |           if (n.id == id) {
155 |             node = n;
156 |             break;
157 |           }
158 |         }
159 |         if (!node) {
160 |           // Click on the blank area
161 |           infoPanel.style.display = "none";
162 |           return
163 |         }
164 | 
165 |         // Generate HTML for comments
166 |         let commentsHtml = '<ul class="comments-list">';
167 |         if (node.comments && node.comments.length > 0) {
168 |           node.comments.forEach(comment => {
169 |             commentsHtml += `<li>${escapeHtml(comment)}</li>`;
170 |           });
171 |         } else {
172 |           commentsHtml += '<li>No comments available</li>';
173 |         }
174 |         commentsHtml += '</ul>';
175 | 
176 |         let inputsHtml = '<ul class="inputs-list">';
177 |         if (node.inputs && node.inputs.length > 0) {
178 |           node.inputs.forEach(input => {
179 |             inputsHtml += `<li>${escapeHtml(input)}</li>`;
180 |           });
181 |         } else {
182 |           inputsHtml += '<li>No inputs available</li>';
183 |         }
184 |         inputsHtml += '</ul>';
185 | 
186 |         // Generate HTML for images
187 |         console.log(node.images)
188 |         let imagesHtml = '';
189 |         if (node.images && node.images.length > 0) {
190 |           imagesHtml = '<div class="image-grid-container"><div class="image-grid">';
191 |           node.images.forEach(url => {
192 |             imagesHtml += `<a href="${url}" target="_blank"><img src="${url}" alt="Image"></a>`;
193 |           });
194 |           imagesHtml += '</div></div>';
195 |         }
196 | 
197 |         // Update the content of the info panel
198 |         infoContent.innerHTML = `<strong>ID:</strong> ${node.id}<br>` +
199 |           `<strong>Label:</strong> ${node.label}<br>` +
200 |           (node.images.length > 0 ? `<strong>Images:</strong>${imagesHtml}` : "") +
201 |           (node.comments.length > 0 ? `<strong>Comments:</strong>${commentsHtml}` : "") +
202 |           (node.inputs.length > 0 ? `<strong>Inputs:</strong>${inputsHtml}` : "");
203 | 
204 |         // Show the info panel
205 |         infoPanel.style.display = "block";
206 |         lastClickedNode = node;
207 |       });
208 | 
209 |       // Hide the info panel when clicking outside
210 |       document.addEventListener('click', function (e) {
211 |         if (infoPanel.style.display === "block" && !infoPanel.contains(e.target) && !e.target.closest('#container')) {
212 |           infoPanel.style.display = "none";
213 |         }
214 |       });
215 |     });
216 |   </script>
217 | </body>
218 | 
219 | </html>
220 | 


--------------------------------------------------------------------------------