├── .gitattributes ├── imgs ├── history-of-apple.png ├── polish-wikipedia.png └── gemipedia-with-tables.png ├── Gemipedia ├── Models │ ├── SectionItem.cs │ ├── IArticleLinks.cs │ ├── TextContent.cs │ ├── GeoItem.cs │ ├── NavSuggestionsItem.cs │ ├── VideoItem.cs │ ├── MediaItem.cs │ ├── ArticleLink.cs │ ├── ContentItem.cs │ ├── InfoboxItem.cs │ ├── Section.cs │ ├── ParsedPage.cs │ └── ArticleLinkCollection.cs ├── Media │ ├── MediaContent.cs │ └── MediaProcessor.cs ├── Converter │ ├── Special │ │ ├── Tables │ │ │ ├── UnicodeString.cs │ │ │ ├── Table.cs │ │ │ ├── TableParser.cs │ │ │ └── TableRenderer.cs │ │ ├── RedirectParser.cs │ │ ├── NavigationParser.cs │ │ ├── WikiTableConverter.cs │ │ ├── MathConverter.cs │ │ ├── GeoParser.cs │ │ ├── SubscriptConverter.cs │ │ ├── SuperscriptConverter.cs │ │ ├── TextExtractor.cs │ │ ├── GeohackParser.cs │ │ ├── MediaParser.cs │ │ └── InfoboxParser.cs │ ├── Filter │ │ ├── FilterRule.cs │ │ └── DomFilter.cs │ ├── Buffer.cs │ ├── WikiHtmlConverter.cs │ ├── Preparer.cs │ ├── Sectionizer.cs │ └── HtmlParser.cs ├── API │ ├── Models │ │ ├── Article.cs │ │ ├── FeaturedContent.cs │ │ └── ArticleSummary.cs │ ├── WikipediaApiClient.cs │ └── ResponseParser.cs ├── Renderer │ ├── SimpleBuffer.cs │ ├── GalleryRenderer.cs │ ├── ContentRenderer.cs │ ├── GeoRenderer.cs │ ├── ReferencesRenderer.cs │ └── ArticleRenderer.cs ├── UserOptions.cs ├── LanguageUtils.cs ├── Gemipedia.csproj ├── CommonUtils.cs └── RouteOptions.cs ├── Gemipedia.Console ├── Gemipedia.Console.csproj ├── ThreadSafeCounter.cs └── Program.cs ├── Gemipedia.Cgi ├── Gemipedia.Cgi.csproj ├── Program.cs ├── CountingTextWriter.cs └── RouteHandler.cs ├── Test Cases.txt ├── LICENSE ├── TODOs.txt ├── README.md ├── Gemipedia.sln ├── Changelog.txt └── .gitignore /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 
-------------------------------------------------------------------------------- /imgs/history-of-apple.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/acidus99/Gemipedia/HEAD/imgs/history-of-apple.png -------------------------------------------------------------------------------- /imgs/polish-wikipedia.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/acidus99/Gemipedia/HEAD/imgs/polish-wikipedia.png -------------------------------------------------------------------------------- /Gemipedia/Models/SectionItem.cs: -------------------------------------------------------------------------------- 1 | namespace Gemipedia.Models; 2 | 3 | public abstract class SectionItem 4 | { 5 | } -------------------------------------------------------------------------------- /imgs/gemipedia-with-tables.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/acidus99/Gemipedia/HEAD/imgs/gemipedia-with-tables.png -------------------------------------------------------------------------------- /Gemipedia/Models/IArticleLinks.cs: -------------------------------------------------------------------------------- 1 | namespace Gemipedia.Models; 2 | 3 | public interface IArticleLinks 4 | { 5 | ArticleLinkCollection Links { get; } 6 | } -------------------------------------------------------------------------------- /Gemipedia/Models/TextContent.cs: -------------------------------------------------------------------------------- 1 | namespace Gemipedia.Models; 2 | 3 | public interface ITextContent : IArticleLinks 4 | { 5 | string Content { get; } 6 | } -------------------------------------------------------------------------------- /Gemipedia/Models/GeoItem.cs: -------------------------------------------------------------------------------- 1 | namespace Gemipedia.Models; 2 | 3 | public 
class GeoItem : SectionItem 4 | { 5 | public string Title; 6 | public string Url; 7 | } -------------------------------------------------------------------------------- /Gemipedia/Media/MediaContent.cs: -------------------------------------------------------------------------------- 1 | namespace Gemipedia.Media; 2 | 3 | public class MediaContent 4 | { 5 | public byte[] Data { get; set; } 6 | public string MimeType { get; set; } 7 | } -------------------------------------------------------------------------------- /Gemipedia/Models/NavSuggestionsItem.cs: -------------------------------------------------------------------------------- 1 | namespace Gemipedia.Models; 2 | 3 | public class NavSuggestionsItem : ContentItem 4 | { 5 | public NavSuggestionsItem(ITextContent textContent) 6 | : base(textContent) { } 7 | } -------------------------------------------------------------------------------- /Gemipedia/Models/VideoItem.cs: -------------------------------------------------------------------------------- 1 | namespace Gemipedia.Models; 2 | 3 | public class VideoItem : MediaItem, IArticleLinks 4 | { 5 | public string VideoUrl { get; set; } 6 | public string VideoDescription { get; set; } 7 | } -------------------------------------------------------------------------------- /Gemipedia/Models/MediaItem.cs: -------------------------------------------------------------------------------- 1 | namespace Gemipedia.Models; 2 | 3 | public class MediaItem : SectionItem, IArticleLinks 4 | { 5 | public string Url { get; set; } 6 | public string Caption { get; set; } 7 | public ArticleLinkCollection Links { get; set; } = new ArticleLinkCollection(); 8 | } 9 | 10 | -------------------------------------------------------------------------------- /Gemipedia.Console/Gemipedia.Console.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Exe 5 | net8.0 6 | 7 | 8 | 9 | 10 | 11 | 12 | 
-------------------------------------------------------------------------------- /Gemipedia/Models/ArticleLink.cs: --------------------------------------------------------------------------------
namespace Gemipedia.Models;

/// <summary>
/// A link to an article, identified by title, with an occurrence counter
/// </summary>
public class ArticleLink
{
    /// <summary>
    /// Occurrence counter. Starts at 1 and can only be changed from inside this assembly.
    /// (Spelling kept as-is: the name is part of the public surface.)
    /// </summary>
    public int Occurences { get; internal set; }

    /// <summary>
    /// Title of the linked article
    /// </summary>
    public string Title { get; private set; }

    internal ArticleLink(string title)
    {
        Title = title;
        Occurences = 1;
    }
}

-------------------------------------------------------------------------------- /Gemipedia/Converter/Special/Tables/UnicodeString.cs: --------------------------------------------------------------------------------
using Wcwidth;

namespace Gemipedia.Converter.Special.Tables;

public static class UnicodeString
{
    /// <summary>
    /// Gets the fixed-width (column) width of a unicode string by summing
    /// UnicodeCalculator.GetWidth over each UTF-16 char of the string
    /// </summary>
    public static int GetWidth(string s)
    {
        int ret = 0;
        foreach (char c in s)
        {
            ret += UnicodeCalculator.GetWidth(c);
        }
        return ret;
    }
}

-------------------------------------------------------------------------------- /Gemipedia/Models/ContentItem.cs: --------------------------------------------------------------------------------
namespace Gemipedia.Models;

/// <summary>
/// A section item holding text content and the article links found in it
/// </summary>
public class ContentItem : SectionItem, ITextContent
{
    public string Content { get; set; }

    /// <summary>
    /// True when Content holds something other than whitespace.
    /// Safe to call when Content was never set (parameterless constructor).
    /// </summary>
    public bool HasContent
        => !string.IsNullOrWhiteSpace(Content);

    /// <summary>
    /// Links found in the content. Starts as an empty collection so it is never
    /// null for items built with the parameterless constructor (same as MediaItem).
    /// </summary>
    public ArticleLinkCollection Links { get; set; } = new ArticleLinkCollection();

    public ContentItem() { }

    /// <summary>
    /// Copies the content and links from another text source
    /// </summary>
    public ContentItem(ITextContent textContent)
    {
        Content = textContent.Content;
        Links = textContent.Links;
    }
}

-------------------------------------------------------------------------------- /Gemipedia/API/Models/Article.cs: --------------------------------------------------------------------------------
namespace Gemipedia.API.Models;

/// <summary>
/// Represents a Wikipedia Article
/// </summary>
public class Article
{
    /// <summary>
    /// The HTML of the article body
    /// </summary>
    public string HtmlText { get; set; }

    /// <summary>
    /// The page id
    /// </summary>
    public long PageId { get; set; }

    /// <summary>
    /// Title of the article
    /// </summary>
    public string Title { get; set; }
}

-------------------------------------------------------------------------------- /Gemipedia.Cgi/Gemipedia.Cgi.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Exe 5 | net8.0 6 | 7 | 8 | False 9 | None 10 | 11 | 12 | 13 | 14 | 15 | 16 |
-------------------------------------------------------------------------------- /Gemipedia/API/Models/FeaturedContent.cs: --------------------------------------------------------------------------------
using System.Collections.Generic;

namespace Gemipedia.API.Models;

/// <summary>
/// Represents the featured content for the day
/// </summary>
public class FeaturedContent
{
    /// <summary>
    /// The featured article of the day
    /// </summary>
    public ArticleSummary FeaturedArticle { get; set; }

    /// <summary>
    /// The most popular articles of the previous day
    /// </summary>
    // NOTE(review): the generic type argument of List appears to have been lost
    // in this listing (presumably ArticleSummary) — confirm against the repository.
    public List PopularArticles { get; set; }
}

-------------------------------------------------------------------------------- /Gemipedia/Converter/Filter/FilterRule.cs: --------------------------------------------------------------------------------
using System;
namespace Gemipedia.Converter.Filter;

/// <summary>
/// represents a rule of DOM nodes we want to filter.
/// Each criterion is optional; the Has* properties report which ones are set.
/// </summary>
internal class FilterRule
{
    public string ClassName { get; set; } = null;

    public string ID { get; set; } = null;

    public string TagName { get; set; } = null;

    public bool HasClass
        => !string.IsNullOrEmpty(ClassName);

    public bool HasID
        => !string.IsNullOrEmpty(ID);

    public bool HasTag
        => !string.IsNullOrEmpty(TagName);
}
-------------------------------------------------------------------------------- /Gemipedia.Console/ThreadSafeCounter.cs: -------------------------------------------------------------------------------- 1 | using System.Threading; 2 | 3 | namespace Gemipedia.Console; 4 | 5 | /// 6 | /// Simple, thread safe counter 7 | /// 8 | public class ThreadSafeCounter 9 | { 10 | private int counter; 11 | 12 | public ThreadSafeCounter(int initialValue = 0) 13 | { 14 | counter = initialValue; 15 | } 16 | 17 | public int Increment() 18 | { 19 | int tmp = Interlocked.Increment(ref counter); 20 | return tmp; 21 | } 22 | 23 | public int Decrement() 24 | { 25 | int tmp = Interlocked.Decrement(ref counter); 26 | return tmp; 27 | } 28 | 29 | public int Count 30 | => counter; 31 | } -------------------------------------------------------------------------------- /Gemipedia/Converter/Special/RedirectParser.cs: -------------------------------------------------------------------------------- 1 | using System.Text.RegularExpressions; 2 | 3 | namespace Gemipedia.Converter.Special; 4 | 5 | /// 6 | /// Handles redirects via Wikitext 7 | /// 8 | public static class RedirectParser 9 | { 10 | 11 | static Regex redirectTitle = new Regex("title=\"([^\\\"]+)", RegexOptions.Compiled); 12 | 13 | public static bool IsArticleRedirect(string html) 14 | => html.Contains("
"); 15 | 16 | public static string GetRedirectTitle(string html) 17 | { 18 | Match match = redirectTitle.Match(html); 19 | if (match.Success) 20 | { 21 | return match.Groups[1].Value; 22 | } 23 | return ""; 24 | } 25 | } -------------------------------------------------------------------------------- /Gemipedia/Converter/Special/NavigationParser.cs: -------------------------------------------------------------------------------- 1 | using AngleSharp.Html.Dom; 2 | using Gemipedia.Models; 3 | 4 | namespace Gemipedia.Converter.Special; 5 | 6 | /// 7 | /// parses navigation notes 8 | /// 9 | public class NavigationParser 10 | { 11 | /// 12 | /// Convert a navigation note in a section 13 | /// 14 | /// 15 | /// 16 | public static NavSuggestionsItem ConvertNavigationNote(HtmlElement element) 17 | { 18 | var textExtractor = new TextExtractor 19 | { 20 | ShouldCollapseNewlines = true 21 | }; 22 | textExtractor.Extract(element); 23 | return new NavSuggestionsItem(textExtractor); 24 | } 25 | } -------------------------------------------------------------------------------- /Gemipedia/API/Models/ArticleSummary.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | 3 | namespace Gemipedia.API.Models; 4 | 5 | public class ArticleSummary 6 | { 7 | 8 | public string Title { get; set; } 9 | 10 | public long PageId { get; set; } 11 | 12 | public string Description { get; set; } 13 | 14 | public string ThumbnailUrl { get; set; } 15 | 16 | //distance in meters from where you were searching 17 | public int Distance { get; set; } = -1; 18 | 19 | //only used when looking for same article on other Wikipedias 20 | public string LanguageCode { get; set; } 21 | 22 | /// 23 | /// Snippet of text where search term was found. 
Usually less helpful than description 24 | /// 25 | public string Excerpt { get; set; } 26 | 27 | public bool HasSummary 28 | => !string.IsNullOrEmpty(SummaryText); 29 | 30 | public string SummaryText 31 | => !String.IsNullOrEmpty(Description) ? Description : Excerpt; 32 | } 33 | -------------------------------------------------------------------------------- /Test Cases.txt: -------------------------------------------------------------------------------- 1 | Area of a circle 2 | Math in MathML 3 | Math in texhtml 4 | SUP 5 | SUB 6 | 7 | Broadway theatre 8 | image map 9 | 10 | Chip 'n Dale: Rescue Rangers (film) 11 | Montage images (also WW II) 12 | 13 | Cohomology 14 | Math (in DIVs not spans) 15 | SUP 16 | 17 | "Hello, World!" program 18 | PRE tags 19 | 20 | iPad 21 | Multiple navigation suggestions 22 | Timeline 23 | 24 | 25 | Minor League Baseball 26 | Image with overlays 27 | tables 28 | tables for layout 29 | nested lists 30 | 31 | Pablo Picasso 32 | Gallerys 33 | 34 | Physical constant 35 | SUB that doesn't directly translate 36 | next to to be avoided 37 | 38 | Schitt's Creek 39 | table with colspans and rowspans 40 | 41 | Signal (software) 42 | nested tables in infobox 43 | montage with single caption 44 | links to wikidata 45 | 46 | Unicode subscripts and superscripts 47 | nested tables 48 | 49 | List of Wikipedias 50 | Has table with wide unicode characters 51 | -------------------------------------------------------------------------------- /Gemipedia/Renderer/SimpleBuffer.cs: -------------------------------------------------------------------------------- 1 | using System.Text; 2 | 3 | namespace Gemipedia.Renderer; 4 | 5 | public class SimpleBuffer 6 | { 7 | public string Content => sb.ToString(); 8 | 9 | public bool HasContent => (sb.Length > 0); 10 | 11 | public bool AtLineStart 12 | => !HasContent || Content.EndsWith('\n'); 13 | 14 | private StringBuilder sb; 15 | 16 | public SimpleBuffer() 17 | { 18 | sb = new StringBuilder(); 19 | } 20 | 21 | 
public void Reset() 22 | => sb.Clear(); 23 | 24 | public void Append(string s) 25 | => sb.Append(s); 26 | 27 | public void AppendLine(string s = "") 28 | => sb.AppendLine(s); 29 | 30 | public void PrependLine(string s = "") 31 | { 32 | var existing = sb.ToString(); 33 | sb.Clear(); 34 | sb.AppendLine(s); 35 | sb.Append(existing); 36 | } 37 | 38 | public void EnsureAtLineStart() 39 | { 40 | if (!AtLineStart) 41 | { 42 | sb.AppendLine(); 43 | } 44 | } 45 | } -------------------------------------------------------------------------------- /Gemipedia/Renderer/GalleryRenderer.cs: -------------------------------------------------------------------------------- 1 | using System.IO; 2 | using Gemipedia.Models; 3 | 4 | namespace Gemipedia.Renderer; 5 | 6 | public class GalleryRenderer 7 | { 8 | TextWriter Writer; 9 | ParsedPage Page; 10 | 11 | public void RenderGallery(ParsedPage parsedPage, TextWriter writer) 12 | { 13 | Writer = writer; 14 | Page = parsedPage; 15 | Writer.WriteLine($"# Image Gallery: {Page.Title}"); 16 | Writer.WriteLine($"=> {RouteOptions.ArticleUrl(Page.Title)} Back to article"); 17 | Writer.WriteLine(); 18 | foreach(var media in Page.GetAllImages()) 19 | { 20 | if (media is VideoItem) 21 | { 22 | var video = (VideoItem)media; 23 | Writer.WriteLine($"=> {video.Url} Video Still: {video.Caption}"); 24 | Writer.WriteLine($"=> {video.VideoUrl} Source Video: {video.VideoDescription}"); ; 25 | } 26 | else 27 | { 28 | Writer.WriteLine($"=> {media.Url} {media.Caption}"); 29 | } 30 | } 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Acidus 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the 
rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Gemipedia/Converter/Special/WikiTableConverter.cs: -------------------------------------------------------------------------------- 1 | using AngleSharp.Html.Dom; 2 | using Gemipedia.Converter.Special.Tables; 3 | using Gemipedia.Models; 4 | 5 | namespace Gemipedia.Converter.Special; 6 | 7 | public static class WikiTableConverter 8 | { 9 | /// 10 | /// Convert a data table 11 | /// 12 | /// 13 | /// 14 | public static SectionItem ConvertWikiTable(HtmlElement element) 15 | { 16 | 17 | //do we have a timeline? 
18 | var media = MediaParser.ConvertTimelineInTable(element); 19 | if (media != null) 20 | { 21 | return media; 22 | } 23 | 24 | TableParser tableParser = new TableParser(); 25 | var table = tableParser.ParseTable(element); 26 | 27 | var contents = TableRenderer.RenderTable(table); 28 | if (contents.Length > 0) 29 | { 30 | return new ContentItem 31 | { 32 | Content = contents, 33 | Links = tableParser.Links 34 | }; 35 | } 36 | return null; 37 | } 38 | } 39 |
-------------------------------------------------------------------------------- /Gemipedia/UserOptions.cs: --------------------------------------------------------------------------------
namespace Gemipedia;

/// <summary>
/// Process-wide, user-selectable options
/// </summary>
public static class UserOptions
{
    /// <summary>
    /// Set which version of Wikipedia we should use. Defaults to EN
    /// </summary>
    public static string WikipediaVersion { get; set; } = "en";

    /// <summary>
    /// Display name for the current WikipediaVersion, as produced by LanguageUtils.GetName.
    /// (Spelling kept as-is: the name is part of the public surface.)
    /// </summary>
    public static string LangaugeName => LanguageUtils.GetName(WikipediaVersion);

    //these will depend on the language
    public static string[] ExcludedSections
        => GetExcludedSections(WikipediaVersion);

    public static string[] ArticleLinkSections
        => GetArticleLinkSections(WikipediaVersion);

    //Every language currently maps to the same list; add per-language arms above the discard.
    //A fresh array is returned on every call, as before.
    static string[] GetExcludedSections(string language)
        => language switch
        {
            _ => new string[] { "bibliography", "citations", "external_links", "notes", "references", "further_reading" },
        };

    //Every language currently maps to the same list; add per-language arms above the discard.
    static string[] GetArticleLinkSections(string language)
        => language switch
        {
            _ => new string[] { "see also" },
        };
}

-------------------------------------------------------------------------------- /Gemipedia/Converter/Special/MathConverter.cs: --------------------------------------------------------------------------------
using AngleSharp.Html.Dom;

namespace Gemipedia.Converter.Special;

public static class MathConverter
{
    /// <summary>
    /// Attempts to convert an inline Math element into a linkable image
    /// Math formulas are in SVG, so link to our converter
    /// </summary>
    /// <param name="element">element that is expected to contain the formula's img</param>
    /// <returns>
    /// A gemtext link line, or an empty string when the element has no img,
    /// or the img lacks a src or a non-empty alt
    /// </returns>
    public static string ConvertMath(HtmlElement element)
    {
        var img = element.QuerySelector("img");
        var url = img?.GetAttribute("src") ?? "";
        //GetAttribute returns null when the attribute is missing, so the call
        //chain needs guarding too, not just the element lookup
        var caption = img?.GetAttribute("alt")?.Trim().Replace("\n", "") ?? "";

        if (url.Length > 0 && caption.Length > 0)
        {
            //not a media item, since it shouldn't be moved
            return $"=> {RouteOptions.MediaProxyUrl(MathSvgUrlAsPng(url))} Math Formula: {CleanLatex(caption)}";
        }
        return "";
    }

    //wikipedia has direct PNG versions of the SVG math images
    private static string MathSvgUrlAsPng(string url)
        => url.Replace("/svg/", "/png/");

    //drops the "\displaystyle " prefix noise from the caption
    private static string CleanLatex(string latex)
        => latex.Replace(@"\displaystyle ", "");

}

-------------------------------------------------------------------------------- /TODOs.txt: -------------------------------------------------------------------------------- 1 | Features: 2 | 3 | - requestor only works on happy path. no HTTP/connection/DNS error handling 4 | - stream media bytes directly to client instead of current "store and forward" 5 | - Definitions via Wiktionary: https://en.wiktionary.org/api/rest_v1/#/Page%20content/get_page_definition__term_ 6 | - Landing page: 7 | - Random articles in topics? 8 | 9 | Better Rendering: 10 | - don't add links to list unless it's a "See Also" 11 | - add "related articles" into content (separate page?) 12 | - Better selection of default video type (lots of devices can't do ogg, etc) 13 | - table horizontal lines don't have "+" in right place for colspans 14 | - Article name is wrong (IPad instead of iPad). 
This is because of the API I am using 15 | 16 | Rendering bugs: 17 | - Empty list entries (Karl Marx infobox) 18 | - tables nested inside tables (Vietnam administrative districts) 19 | 20 | "Sources" section not removed 21 | - "apollo" article 22 | 23 | Blockquote bug: 24 | "Hercule Poirot" in "Appearance and proclivities" 25 | 26 | Infobox: 27 | - handle places that use single row tables as multi columns (band album chronologies use this to show prev, curren and next album 28 | 29 | Eve Jobs 30 | -> photo seems small 31 | 32 | 33 | 34 | 35 | https://www.mediawiki.org/wiki/Specs/HTML/2.7.0#Wiki_links -------------------------------------------------------------------------------- /Gemipedia/Models/InfoboxItem.cs: -------------------------------------------------------------------------------- 1 | using System.Collections.Generic; 2 | using System.Linq; 3 | 4 | namespace Gemipedia.Models; 5 | 6 | public class InfoboxItem : SectionItem, IArticleLinks 7 | { 8 | public string CustomTitle { get; set; } = ""; 9 | 10 | public ArticleLinkCollection Links { get; private set; } = new ArticleLinkCollection(); 11 | 12 | public IEnumerable ContentItems 13 | => Items.Where(x => x is ContentItem).Select(x => x as ContentItem); 14 | 15 | public IEnumerable GeoItems 16 | => Items.Where(x => x is GeoItem).Select(x => x as GeoItem); 17 | 18 | public IEnumerable MediaItems 19 | => Items.Where(x => x is MediaItem).Select(x => x as MediaItem); 20 | 21 | public IEnumerable NavSuggestions 22 | => Items.Where(x => x is NavSuggestionsItem).Select(x => x as NavSuggestionsItem); 23 | 24 | private List Items = new List(); 25 | 26 | //force processing 27 | public void AddItems(IEnumerable items) 28 | => items.ToList().ForEach(x => AddItem(x)); 29 | 30 | public void AddItem(SectionItem item) 31 | { 32 | if(item == null) 33 | { 34 | return; 35 | } 36 | 37 | if (item is IArticleLinks && ((IArticleLinks)item).Links != null && !(item is NavSuggestionsItem)) 38 | { 39 | 
Links.Add(((IArticleLinks)item).Links); 40 | } 41 | Items.Add(item); 42 | } 43 | }
-------------------------------------------------------------------------------- /Gemipedia/Renderer/ContentRenderer.cs: --------------------------------------------------------------------------------
using Gemipedia.Models;

namespace Gemipedia.Renderer;

/// <summary>
/// Writes individual section items into a SimpleBuffer as gemtext lines
/// </summary>
public static class ContentRenderer
{
    /// <summary>
    /// Writes a single link line for a geographic item
    /// </summary>
    public static void RenderGeo(SimpleBuffer buffer, GeoItem geo)
    {
        buffer.EnsureAtLineStart();
        buffer.AppendLine($"=> {geo.Url} 🌍 {geo.Title}");
    }

    /// <summary>
    /// Writes the link line(s) for a media item. Videos get two lines:
    /// the still image and the source video
    /// </summary>
    public static void RenderMedia(SimpleBuffer buffer, MediaItem media)
    {
        buffer.EnsureAtLineStart();

        if (media is VideoItem video)
        {
            buffer.AppendLine($"=> {video.Url} Video Still: {video.Caption}");
            buffer.AppendLine($"=> {video.VideoUrl} Source Video: {video.VideoDescription}");
        }
        else
        {
            buffer.AppendLine($"=> {media.Url} {media.Caption}");
        }
    }

    /// <summary>
    /// Writes a navigation suggestion. With exactly one link the whole note
    /// becomes the link text; otherwise the note is written in parentheses
    /// followed by one link line per title
    /// </summary>
    public static void RenderNavSuggestion(SimpleBuffer buffer, NavSuggestionsItem nav)
    {
        var links = nav.Links.GetLinks();
        buffer.EnsureAtLineStart();

        if (links.Count == 1)
        {
            buffer.AppendLine($"=> {RouteOptions.ArticleUrl(links[0])} {nav.Content}");
            return;
        }

        buffer.AppendLine($"({nav.Content})");
        foreach (var linkTitle in links)
        {
            buffer.AppendLine($"=> {RouteOptions.ArticleUrl(linkTitle)} {linkTitle}");
        }
    }
}

-------------------------------------------------------------------------------- /Gemipedia/LanguageUtils.cs: --------------------------------------------------------------------------------
using System;
using System.Globalization;

namespace Gemipedia;

public static class LanguageUtils
{
    public static readonly string[] CommonLanguages = new string[] { "simple", "ar", "bg", "ca", "ce", "cs", "da", "nl", "en", "eo", "fi", "fr", "de", "he", "hu", "id", "it", "ja", "ko", "ms", "zh", "no", "ga", "pl", "pt", "ro", "ru", "sr", "sh", "es", "sv", "tr", "uk", "vi" };

    /// <summary>
    /// Gets a display name for a language code: the native name, followed by
    /// the display name in parentheses when the two differ
    /// </summary>
    /// <returns>
    /// "Simple English" for the "simple" code; the quoted code itself when no
    /// culture can be created for it
    /// </returns>
    public static string GetName(string langCode)
    {
        // Special logic for Simple English Wikipedia.
        if (IsSimpleEnglish(langCode))
        {
            return "Simple English";
        }

        try
        {
            var ci = new CultureInfo(langCode);
            return ci.NativeName == ci.DisplayName ?
                ci.NativeName :
                $"{ci.NativeName} ({ci.DisplayName})";
        }
        catch (ArgumentException)
        {
            //null or unknown culture name (CultureNotFoundException and
            //ArgumentNullException both derive from ArgumentException):
            //fall through to the quoted code
        }
        return $"'{langCode}'";
    }

    /// <summary>
    /// Is this a language code we can resolve to a culture with a real display name?
    /// </summary>
    public static bool IsValidCode(string langCode)
    {
        // Special logic for Simple English Wikipedia.
        if (IsSimpleEnglish(langCode))
        {
            return true;
        }

        try
        {
            var ci = new CultureInfo(langCode);
            return ci.DisplayName != langCode;
        }
        catch (ArgumentException)
        {
            //null or unknown culture name: not a valid code
        }
        return false;
    }

    //Ordinal, case-insensitive: a culture-sensitive ToLower() maps "SIMPLE" to
    //"sımple" under Turkish casing rules, and throws on null
    private static bool IsSimpleEnglish(string langCode)
        => string.Equals(langCode, "simple", StringComparison.OrdinalIgnoreCase);
}

-------------------------------------------------------------------------------- /Gemipedia/Gemipedia.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | net8.0 5 | 6 | 7 | 8 | False 9 | None 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 |
-------------------------------------------------------------------------------- /Gemipedia/CommonUtils.cs: --------------------------------------------------------------------------------
using System;
using System.Linq;
using System.Text.RegularExpressions;
using AngleSharp.Dom;

namespace Gemipedia;

public static class CommonUtils
{
    //splits a srcset into candidates: group 1 is the URL, group 2 the
    //optional descriptor (e.g. " 2x"). Built once instead of per call.
    private static readonly Regex SrcsetParser = new Regex(@"(\S*[^,\s])(\s+([\d.]+)(x|w))?", RegexOptions.Compiled);

    //srcset descriptors to try, best first
    private static readonly string[] PreferredDensities = { "2x", "1.5x" };

    /// <summary>
    /// Trims the text and removes newline characters
    /// </summary>
    public static string PrepareTextContent(string s)
        => s.Trim().Replace("\n", "");


    /// <summary>
    /// Gets a properly formatted image URL from an IMG object
    /// </summary>
    /// <param name="img">the IMG element; may be null</param>
    /// <returns>
    /// The 2x srcset candidate, else the 1.5x candidate, else the src attribute,
    /// always passed through EnsureHttps. Null when img is null or has none of these.
    /// </returns>
    public static string GetImageUrl(IElement img)
    {
        if (img == null)
        {
            return null;
        }

        var srcset = img.GetAttribute("srcset") ?? "";
        foreach (var density in PreferredDensities)
        {
            var url = GetImageFromSrcset(srcset, density);
            if (!string.IsNullOrEmpty(url))
            {
                return EnsureHttps(url);
            }
        }
        return EnsureHttps(img.GetAttribute("src"));
    }

    /// <summary>
    /// Makes a URL use the https scheme. Null is passed through unchanged.
    /// </summary>
    public static string EnsureHttps(string url)
    {
        if (url == null || url.StartsWith("https:", StringComparison.OrdinalIgnoreCase))
        {
            return url;
        }

        if (url.StartsWith("http:", StringComparison.OrdinalIgnoreCase))
        {
            //swap the scheme; plain prefixing would yield "https:http://..."
            return "https:" + url.Substring("http:".Length);
        }

        //protocol-relative ("//host/path") and anything else: prefix the scheme, as before
        return "https:" + url;
    }

    //Returns the URL of the first srcset candidate whose descriptor equals size, or null
    private static string GetImageFromSrcset(string srcset, string size)
    {
        if (string.IsNullOrEmpty(srcset))
        {
            return null;
        }

        return SrcsetParser.Matches(srcset)
            .Where(x => x.Success && x.Groups[2].Value.Trim() == size)
            .Select(x => x.Groups[1].Value)
            .FirstOrDefault();
    }
}

-------------------------------------------------------------------------------- /Gemipedia/Media/MediaProcessor.cs: -------------------------------------------------------------------------------- 1 | using ImageMagick; 2 | 3 | namespace Gemipedia.Media; 4 | 5 | /// 6 | /// Reformats media from Wikipedia to better suit Gemini clients 7 | /// 8 | public static class MediaProcessor 9 | { 10 | public static MediaContent ProcessImage(byte[] data) 11 | { 12 | using (var image = new MagickImage(data)) 13 | { 14 | 15 | if (image.Format == MagickFormat.Svg) 16 | { 17 | //convert it to PNG 18 | image.Format = MagickFormat.Png; 19 | return ToContent(image); 20 | } 21 | else if (!image.IsOpaque) 22 | { 23 | //add a white background to transparent images to 24 | //make them visible on clients with a dark theme 25 | image.BackgroundColor = new MagickColor("white"); 26 | image.Alpha(AlphaOption.Remove); 27 | return ToContent(image); 28 | } 29 | else 30 | { 31 | //nothing needed (e.g. JPG, etc) so pass it through 32 | return new MediaContent 33 | { 34 | Data = data, 35 | MimeType = GetMime(image) 36 | }; 37 | } 38 | } 39 | } 40 | 41 | private static string GetMime(MagickImage image) 42 | { 43 | string? mimeType = MagickFormatInfo.Create(image.Format)?.MimeType; 44 | return mimeType ?? 
"image/png"; 45 | } 46 | 47 | private static MediaContent ToContent(MagickImage image) 48 | => new MediaContent 49 | { 50 | Data = image.ToByteArray(), 51 | MimeType = GetMime(image) 52 | }; 53 | } 54 | -------------------------------------------------------------------------------- /Gemipedia/Converter/Special/GeoParser.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using AngleSharp.Dom; 3 | using Gemipedia.Models; 4 | 5 | namespace Gemipedia.Converter.Special; 6 | 7 | public static class GeoParser 8 | { 9 | public const string GeohackHostname = "geohack.toolforge.org"; 10 | 11 | public static bool IsGeoLink(IElement anchor) 12 | { 13 | //only external links can be a link to geohack. 14 | //This fast-fails so we don't parse a bunch of relative, local, URLs 15 | if (!(anchor.GetAttribute("class")?.Contains("external") ?? false)) 16 | { 17 | return false; 18 | } 19 | return IsGeohackUrl(anchor.GetAttribute("href")); 20 | } 21 | 22 | /// 23 | /// Is this url a valid link to the Wikipedia Geohack server? 24 | /// 25 | /// 26 | /// 27 | public static bool IsGeohackUrl(string? 
url) 28 | { 29 | if (url == null) 30 | { 31 | return false; 32 | } 33 | 34 | try 35 | { 36 | Uri parsedUrl = new Uri(url); 37 | return (parsedUrl.IsAbsoluteUri && parsedUrl.Host == GeohackHostname); 38 | } 39 | catch (Exception) 40 | { 41 | } 42 | return false; 43 | } 44 | 45 | public static GeoItem ParseGeo(IElement anchor) 46 | { 47 | string url = anchor.GetAttribute("href"); 48 | url = CommonUtils.EnsureHttps(url); 49 | 50 | GeohackParser geohack = new GeohackParser(url); 51 | if (geohack.IsValid) 52 | { 53 | return new GeoItem 54 | { 55 | Title = $"View Geographic Info: {geohack.GetPrettyName()} ({geohack.Coordinates})", 56 | Url = RouteOptions.GeoUrl(geohack.GeohackUrl) 57 | }; 58 | } 59 | return null; 60 | } 61 | } 62 | 63 | -------------------------------------------------------------------------------- /Gemipedia/Models/Section.cs: -------------------------------------------------------------------------------- 1 | using System.Collections.Generic; 2 | using System.Diagnostics; 3 | using System.Linq; 4 | using AngleSharp.Dom; 5 | 6 | namespace Gemipedia.Models; 7 | 8 | [DebuggerDisplay("Section '{Title}'")] 9 | public class Section : IArticleLinks 10 | { 11 | public List Nodes = new List(); 12 | 13 | public ArticleLinkCollection Links { get; private set; } = new ArticleLinkCollection(); 14 | 15 | public bool HasSubSections => (SubSections.Count > 0); 16 | 17 | //infoboxes 18 | public List Infoboxes = new List(); 19 | 20 | //content and images 21 | public List GeoItems = new List(); 22 | 23 | //content and images 24 | public List GeneralContent = new List(); 25 | 26 | public bool HasNavSuggestions 27 | => NavSuggestions.Count > 0; 28 | 29 | public List NavSuggestions = new List(); 30 | 31 | public List
SubSections { get; set; }= new List
/// <summary>
/// Accumulates converted text together with the article links discovered
/// while producing it. A pending "line start" prefix can be registered and
/// is emitted in front of the next appended text, and an optional
/// blockquote mode prefixes new lines with <c>&gt;</c>.
/// </summary>
public class Buffer : ITextContent
{
    /// <summary>Links collected from everything appended so far.</summary>
    public ArticleLinkCollection Links { get; private set; }

    /// <summary>The accumulated text. Builds a new string on every call.</summary>
    public string Content => sb.ToString();

    /// <summary>True once at least one character has been appended.</summary>
    public bool HasContent => (sb.Length > 0);

    /// <summary>
    /// True when the buffer is empty or its last character is a newline.
    /// </summary>
    /// <remarks>
    /// Looks only at the last character of the builder instead of
    /// materialising the whole buffer through <see cref="Content"/>; this
    /// property is evaluated on every append while a blockquote is open.
    /// </remarks>
    public bool AtLineStart
        => sb.Length == 0 || sb[sb.Length - 1] == '\n';

    /// <summary>When true, non-blank text that starts a line is prefixed with "&gt;".</summary>
    public bool InBlockquote { get; set; } = false;

    private readonly StringBuilder sb;

    // Prefix to emit before the next appended text; null when nothing is pending.
    private string lineStart = null;

    public Buffer()
    {
        sb = new StringBuilder();
        Links = new ArticleLinkCollection();
    }

    /// <summary>
    /// Clears the text, the collected links and any pending line prefix.
    /// <see cref="InBlockquote"/> is left as it is.
    /// </summary>
    public void Reset()
    {
        sb.Clear();
        Links = new ArticleLinkCollection();
        lineStart = null;
    }

    /// <summary>Registers a prefix to be written before the next appended text.</summary>
    public void SetLineStart(string s)
    {
        lineStart = s;
    }

    /// <summary>
    /// Appends already-converted content and merges its links. The pending
    /// line prefix and blockquote handling are not applied on this path.
    /// </summary>
    public void Append(ITextContent textContent)
    {
        sb.Append(textContent.Content);
        Links.Add(textContent.Links);
    }

    /// <summary>Appends text, emitting the pending prefix and blockquote marker first.</summary>
    public void Append(string s)
    {
        HandleLineStart(s);
        HandleBlockQuote(s);
        sb.Append(s);
    }

    /// <summary>Same as <see cref="Append(string)"/>, followed by a line terminator.</summary>
    public void AppendLine(string s = "")
    {
        HandleLineStart(s);
        HandleBlockQuote(s);
        sb.AppendLine(s);
    }

    /// <summary>
    /// Makes sure the next write begins on a fresh line. When the buffer is
    /// already at a line start any pending prefix is dropped; otherwise a
    /// line terminator is appended and the pending prefix is kept.
    /// </summary>
    public void EnsureAtLineStart()
    {
        if (AtLineStart)
        {
            lineStart = null;
        }
        else
        {
            sb.AppendLine();
        }
    }

    /// <summary>
    /// Writes the pending line prefix, if any, and clears it.
    /// </summary>
    /// <param name="s">
    /// The text about to be appended. It is not inspected: the prefix is
    /// emitted even when <paramref name="s"/> is blank.
    /// </param>
    public void HandleLineStart(string s)
    {
        if (lineStart != null)
        {
            sb.Append(lineStart);
            lineStart = null;
        }
    }

    // Emits the blockquote marker when a non-blank string begins a new line.
    // A null string is treated as blank rather than throwing.
    private void HandleBlockQuote(string s)
    {
        if (InBlockquote && AtLineStart && !string.IsNullOrWhiteSpace(s))
        {
            sb.Append(">");
        }
    }
}
/// <summary>
/// Takes HTML generated by MediaWiki and converts it into a
/// <see cref="ParsedPage"/> whose sections hold the converted items.
/// </summary>
public class WikiHtmlConverter
{
    /// <summary>
    /// Milliseconds spent inside <see cref="Convert"/>. The stopwatch is
    /// started, never restarted, so the value accumulates across calls on
    /// the same instance.
    /// </summary>
    public long ConvertTimeMs
        => convertTimer.ElapsedMilliseconds;

    Stopwatch convertTimer;

    public WikiHtmlConverter()
    {
        convertTimer = new Stopwatch();
        LoadDomFilters();
    }

    // Replaces the shared DomFilter.Global instance with a fresh rule set.
    private void LoadDomFilters()
    {
        DomFilter.Global = new DomFilter();
        //locmaps have overlays we can't display
        DomFilter.Global.AddRule("div.locmap");
        //if it's not for mobile, don't display it
        DomFilter.Global.AddRule(".nomobile");
        //side category and meta index tables
        DomFilter.Global.AddRule("table.sidebar");
        DomFilter.Global.AddRule("table.navbox-vertical");
        //dialogs at top that say something is wrong with the article
        DomFilter.Global.AddRule(".metadata");
        DomFilter.Global.AddRule("div.navbox");
        //geo
        DomFilter.Global.AddRule("span#coordinates");
        //hidden content
        DomFilter.Global.AddRule("div.mw-collapsed");
    }

    /// <summary>
    /// Prepares the HTML, splits it into sections and converts every
    /// section (and sub section) into items.
    /// </summary>
    /// <param name="title">Article title, passed through to the sectionizer.</param>
    /// <param name="wikiHtml">Raw article HTML.</param>
    /// <returns>The parsed page with converted sections.</returns>
    public ParsedPage Convert(string title, string wikiHtml)
    {
        convertTimer.Start();
        try
        {
            var contentRoot = Preparer.PrepareHtml(wikiHtml);

            Sectionizer sectionizer = new Sectionizer();
            var parsedPage = sectionizer.ParseContent(title, contentRoot);

            ConvertSections(parsedPage.Sections);
            return parsedPage;
        }
        finally
        {
            //stop the clock even when conversion throws, so a reused
            //converter doesn't keep counting time between calls
            convertTimer.Stop();
        }
    }

    private void ConvertSections(List<Section> sections)
        => sections.ForEach(x => ConvertSection(x));

    // Feeds every node of the section through one parser, stores the
    // resulting items on the section, then recurses into its sub sections.
    private void ConvertSection(Section section)
    {
        HtmlParser htmlParser = new HtmlParser();

        //parse in order, then release the nodes in one go; removing from the
        //front of the list one at a time shifts the remainder on every pass
        foreach (var node in section.Nodes)
        {
            htmlParser.Parse(node);
        }
        section.Nodes.Clear();

        section.AddItems(htmlParser.GetItems());
        ConvertSections(section.SubSections);
    }
}
/// <summary>
/// Reads in the raw HTML, converts it to a DOM, and strips out
/// tags that we don't want before proper parsing.
/// </summary>
public static class Preparer
{
    /// <summary>
    /// Parses the article HTML and returns its content root with unwanted
    /// elements removed.
    /// </summary>
    /// <param name="wikiHtml">Raw article HTML.</param>
    /// <returns>The cleaned <c>div.mw-parser-output</c> element.</returns>
    /// <exception cref="System.InvalidOperationException">
    /// The HTML contains no <c>div.mw-parser-output</c> element.
    /// </exception>
    public static IElement PrepareHtml(string wikiHtml)
    {
        //step 1: scope HTML just to article content
        var contentRoot = GetContentRoot(wikiHtml);
        if (contentRoot == null)
        {
            //without this check the failure shows up as a NullReferenceException
            //from the first selector query, which says nothing about the cause
            throw new System.InvalidOperationException(
                "Could not find the article content root (div.mw-parser-output) in the HTML.");
        }

        //step 2: remove known bad/unneeded tags
        RemoveTags(contentRoot);

        return contentRoot;
    }

    // Returns the article content element, or null when the document has none.
    private static IElement GetContentRoot(string wikiHtml)
    {
        var context = BrowsingContext.New(Configuration.Default);
        var parser = context.GetService<IHtmlParser>();
        var document = parser.ParseDocument(wikiHtml);
        return document.QuerySelector("div.mw-parser-output");
    }

    //Removes tags we know we won't need, and which make rendering harder.
    //Often we want to completely remove tags here instead of skipping them
    //later with the Filter, since the InfoBox parser won't visit every element.
    private static void RemoveTags(IElement contentRoot)
    {
        //remove the table of contents
        RemoveMatchingTags(contentRoot, "#toc");

        //<sup> tags that are used to link to references
        RemoveMatchingTags(contentRoot, "sup.reference");
        //all span holders for flag icons
        RemoveMatchingTags(contentRoot, "span.flagicon");
        //all <link> tags
        RemoveMatchingTags(contentRoot, "link");
        //all style tags
        RemoveMatchingTags(contentRoot, "style");
        //geo meta data
        RemoveMatchingTags(contentRoot, "span.geo-nondefault");
        RemoveMatchingTags(contentRoot, "span.geo-multi-punct");
        //"citation needed" and other notice tags
        RemoveMatchingTags(contentRoot, ".noprint");
        RemoveMatchingTags(contentRoot, ".mbox");
        RemoveMatchingTags(contentRoot, ".mbox-small");
        //remove the "V T E" meta navbars on certain items
        RemoveMatchingTags(contentRoot, ".navbar");

        //remove interactive elements
        RemoveMatchingTags(contentRoot, "div.switcher-container");
    }

    //materialise the matches first so removal doesn't disturb the enumeration
    private static void RemoveMatchingTags(IElement element, string selector)
        => element.QuerySelectorAll(selector).ToList().ForEach(x => x.Remove());

}
this table. Many tables have rows with 21 | /// a mismatched number of columns, or too many colspans. Assume that 22 | /// the number of column units in the first row is accurate, and trim 23 | /// other rows accordingly 24 | /// 25 | public int MaxColumns 26 | => (Rows.Count > 0) ? 27 | Rows[0].Cells.Sum(x => x.ColSpan) : 28 | 0; 29 | } 30 | 31 | public class Row 32 | { 33 | public List Cells = new List(); 34 | 35 | public bool IsEmpty => (Cells.Count == 0); 36 | 37 | public int LineHeight => IsEmpty 38 | ? 0 : Cells.Max(x => x.LineHeight); 39 | } 40 | 41 | public class Cell 42 | { 43 | public bool IsHeader = false; 44 | 45 | private string contents = ""; 46 | public string Contents 47 | { 48 | get => contents; 49 | set 50 | { 51 | contents = StripZeroWidth(value); 52 | } 53 | } 54 | 55 | public int ColSpan = 1; 56 | 57 | public int RowSpan = 1; 58 | //is this a dummy cell, only present to hold open a row spanning cell from a row above? 59 | public bool IsRowSpanHolder = false; 60 | 61 | public List FormattedLines; 62 | 63 | public int LineHeight 64 | => FormattedLines?.Count ?? 0; 65 | 66 | public int FormattedWidth 67 | => (FormattedLines?.Count > 0) ? 
UnicodeString.GetWidth(FormattedLines[0]) : 0; 68 | 69 | /// 70 | /// removes any zero-width unicode characters from the string 71 | /// these will mess with our column layout since .Lenth with return a number 72 | /// longer than the number of characters that are rendered 73 | /// 74 | /// 75 | /// 76 | private string StripZeroWidth(string s) 77 | { 78 | //Replace("\u200b", "") does not appear to work for these unicode characters 79 | //do it char by char 80 | var sb = new StringBuilder(s.Length); 81 | foreach(char c in s) 82 | { 83 | if(c == '\u200b' || c == '\ufeff') 84 | { 85 | continue; 86 | } 87 | sb.Append(c); 88 | } 89 | return sb.ToString(); 90 | } 91 | } -------------------------------------------------------------------------------- /Gemipedia/Models/ParsedPage.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | 5 | namespace Gemipedia.Models; 6 | 7 | public class ParsedPage 8 | { 9 | private int currSection = 0; 10 | 11 | public String Title { get; set; } 12 | 13 | /// 14 | /// returns an underline escaped version of the title, used by various APIs 15 | /// 16 | public string EscapedTitle 17 | => Title.Replace(" ", "_"); 18 | 19 | public List
Sections { get; set; } = new List
/// <summary>
/// A <see cref="TextWriter"/> decorator that forwards everything to an inner
/// writer while keeping a running total of the characters written and of
/// the bytes those characters occupy in the inner writer's encoding.
/// </summary>
/// <remarks>
/// Byte totals are computed per call. A surrogate pair split across two
/// separate <see cref="Write(char)"/> calls is therefore measured as two
/// lone surrogates.
/// </remarks>
public class CountingTextWriter : TextWriter
{
    private readonly TextWriter _innerWriter;
    private int _characterCount;
    private long _byteCount;

    /// <param name="innerWriter">Writer that receives all output.</param>
    /// <exception cref="ArgumentNullException"><paramref name="innerWriter"/> is null.</exception>
    public CountingTextWriter(TextWriter innerWriter)
    {
        _innerWriter = innerWriter ?? throw new ArgumentNullException(nameof(innerWriter));
    }

    /// <summary>Encoding of the wrapped writer; used for byte accounting.</summary>
    public override Encoding Encoding => _innerWriter.Encoding;

    /// <summary>Number of UTF-16 code units written so far, line terminators included.</summary>
    public int CharacterCount => _characterCount;

    /// <summary>Number of bytes the written text occupies in <see cref="Encoding"/>.</summary>
    public long ByteCount => _byteCount;

    public override void Write(char value)
    {
        _innerWriter.Write(value);
        _characterCount++;
        _byteCount += Encoding.GetByteCount(new[] { value });
    }

    public override void Write(char[] buffer, int index, int count)
    {
        _innerWriter.Write(buffer, index, count);
        _characterCount += count;
        _byteCount += Encoding.GetByteCount(buffer, index, count);
    }

    /// <summary>Writes the string; a null value writes and counts nothing.</summary>
    public override void Write(string value)
    {
        if (value != null)
        {
            _innerWriter.Write(value);
            _characterCount += value.Length;
            _byteCount += Encoding.GetByteCount(value);
        }
    }

    public override void WriteLine()
    {
        _innerWriter.WriteLine();
        CountLineTerminator();
    }

    /// <summary>Writes the string and a line terminator; null writes only the terminator.</summary>
    public override void WriteLine(string value)
    {
        if (value == null)
        {
            WriteLine();
            return;
        }

        _innerWriter.WriteLine(value);
        _characterCount += value.Length;
        _byteCount += Encoding.GetByteCount(value);
        CountLineTerminator();
    }

    public override void WriteLine(char[] buffer, int index, int count)
    {
        _innerWriter.WriteLine(buffer, index, count);
        _characterCount += count;
        _byteCount += Encoding.GetByteCount(buffer, index, count);
        CountLineTerminator();
    }

    public override void WriteLine(char value)
    {
        Write(value);
        WriteLine();
    }

    /// <summary>Flushes the wrapped writer so buffered output reaches its destination.</summary>
    public override void Flush()
        => _innerWriter.Flush();

    protected override void Dispose(bool disposing)
    {
        if (disposing)
        {
            _innerWriter.Dispose();
        }
        base.Dispose(disposing);
    }

    // The inner writer emits its own NewLine, which may differ from
    // Environment.NewLine, so the terminator is measured from the inner
    // writer to keep the totals equal to what was actually written.
    private void CountLineTerminator()
    {
        string terminator = _innerWriter.NewLine;
        _characterCount += terminator.Length;
        _byteCount += Encoding.GetByteCount(terminator);
    }
}
 Finds other articles that are geographically close to the current article 25 | * Supports disambiguation and "see other" navigation links 26 | 27 | ### Usability 28 | 29 | * Groups all the links to additional articles by section, and provides separate "References" pages for each section 30 | * Gallery View, which pulls all media like images and video out into a separate view 31 | * Images and media are displayed with intelligently created captions 32 | * Geographic view! Extracts geographic coordinates and provides links to OpenStreetMap and native map apps 33 | * Removes content that is superfluous for Gemini, like references, citations, bibliographies, and links to external websites 34 | * Links to source article on Wikipedia over HTTPS 35 | * Caches calls to Wikipedia to speed up viewing subsections or refreshing pages 36 | 37 | ### Rendering 38 | * Supports tables, including cells that span multiple rows or columns, by converting them to ASCII art tables inside of preformatted sections 39 | * Selects high resolution images while still using an appropriate size for the Gemini/smolweb ethos 40 | * Supports math formulas by displaying them as link lines to PNG images 41 | * Supports chemical and physics formulas by converting subscript and superscript tags into Unicode Subscript/Superscript characters! 
42 | * Supports image maps 43 | * Supports Infoxbox rendering 44 | * Supports timeline images and tables 45 | * Supports image galleries 46 | * Add White background to transparent images for better reading on clients with dark mode 47 | 48 | ### Offline Support 49 | * Images are proxied from Wikipedia and rendered with appropriate file extension and MIME type for better offline rendering 50 | * PDF export for offline reading 51 | -------------------------------------------------------------------------------- /Gemipedia/Models/ArticleLinkCollection.cs: -------------------------------------------------------------------------------- 1 | using System.Collections.Generic; 2 | using System.Linq; 3 | using AngleSharp.Dom; 4 | 5 | namespace Gemipedia.Models; 6 | 7 | public class ArticleLinkCollection 8 | { 9 | Dictionary articles; 10 | 11 | public ArticleLinkCollection() 12 | { 13 | articles = new Dictionary(); 14 | } 15 | 16 | public void Clear() 17 | => articles.Clear(); 18 | 19 | public bool HasLinks 20 | => (Count > 0); 21 | 22 | public int Count 23 | => articles.Count; 24 | 25 | public void Add(ArticleLinkCollection collection) 26 | { 27 | foreach (string key in collection.articles.Keys) 28 | { 29 | if (!articles.ContainsKey(key)) 30 | { 31 | articles[key] = collection.articles[key]; 32 | } 33 | else 34 | { 35 | articles[key].Occurences++; 36 | } 37 | } 38 | } 39 | 40 | public void Add(IArticleLinks itemWithLinks) 41 | => Add(itemWithLinks.Links); 42 | 43 | public void Add(string title) 44 | { 45 | if (string.IsNullOrEmpty(title)) 46 | { 47 | return; 48 | } 49 | 50 | var key = title.ToLower(); 51 | 52 | if (!articles.ContainsKey(key)) 53 | { 54 | articles[key] = new ArticleLink(title); 55 | } 56 | else 57 | { 58 | articles[key].Occurences++; 59 | } 60 | } 61 | 62 | public void Add(IElement element) 63 | { 64 | if (ShouldUseLink(element)) 65 | { 66 | Add(RemoveFragment(element.GetAttribute("title"))); 67 | } 68 | } 69 | 70 | private string RemoveFragment(string 
/// <summary>
/// Rewrites text using Unicode subscript characters where an equivalent
/// exists. Characters with no subscript form are passed through unchanged
/// and cause <see cref="IsFullyConverted"/> to be false.
/// </summary>
public class SubscriptConverter
{

    /// <summary>The string most recently passed to <see cref="Convert"/>.</summary>
    public string Original { get; private set; }

    /// <summary>The result of the most recent <see cref="Convert"/> call.</summary>
    public string Converted { get; private set; }

    StringBuilder buffer = new StringBuilder();

    /// <summary>False once any character could not be mapped to a subscript.</summary>
    public bool IsFullyConverted { get; private set; } = true;

    /// <summary>
    /// Converts every character of <paramref name="s"/> and stores the
    /// outcome in <see cref="Converted"/>.
    /// </summary>
    /// <returns>True when every character had a subscript equivalent.</returns>
    public bool Convert(string s)
    {
        Original = s;
        Converted = "";

        buffer.Clear();
        IsFullyConverted = true;
        for (int i = 0; i < s.Length; i++)
        {
            buffer.Append(ConvertChar(s[i]));
        }
        Converted = buffer.ToString();
        return IsFullyConverted;
    }

    /// <summary>
    /// Maps a single character to its subscript form. When there is none,
    /// the character is returned as-is and <see cref="IsFullyConverted"/>
    /// is cleared.
    /// </summary>
    public char ConvertChar(char c)
    {
        // '\0' doubles as the "no mapping" marker; it has no subscript form
        // itself, so it falls through to the pass-through path as well.
        char mapped = c switch
        {
            //digits map onto the contiguous range U+2080..U+2089
            >= '0' and <= '9' => (char)('\u2080' + (c - '0')),

            //ASCII, small and full-width plus signs
            '+' or '\uFE62' or '\uFF0B' => '\u208A',
            //ASCII hyphen-minus, small hyphen-minus, full-width hyphen-minus, minus sign
            '-' or '\uFE63' or '\uFF0D' or '\u2212' => '\u208B',
            //ASCII, small and full-width equals signs
            '=' or '\uFE66' or '\uFF1D' => '\u208C',

            '(' => '\u208D',
            ')' => '\u208E',

            //latin letters that have a subscript form; case is ignored
            'a' or 'A' => '\u2090',
            'e' or 'E' => '\u2091',
            'h' or 'H' => '\u2095',
            'i' or 'I' => '\u1D62',
            'j' or 'J' => '\u2C7C',
            'k' or 'K' => '\u2096',
            'l' or 'L' => '\u2097',
            'm' or 'M' => '\u2098',
            'n' or 'N' => '\u2099',
            'o' or 'O' => '\u2092',
            'p' or 'P' => '\u209A',
            'r' or 'R' => '\u1D63',
            's' or 'S' => '\u209B',
            't' or 'T' => '\u209C',
            'u' or 'U' => '\u1D64',
            'v' or 'V' => '\u1D65',
            'x' or 'X' => '\u2093',

            //greek
            'β' => '\u1D66',
            'γ' => '\u1D67',
            'ρ' => '\u1D68',
            'φ' => '\u1D69',
            'χ' => '\u1D6A',

            _ => '\0',
        };

        if (mapped == '\0')
        {
            IsFullyConverted = false;
            return c;
        }
        return mapped;
    }
}
CPU 30 | {39080D45-3B8E-421D-8558-24AD7F0E448F}.Debug|Any CPU.Build.0 = Debug|Any CPU 31 | {39080D45-3B8E-421D-8558-24AD7F0E448F}.Release|Any CPU.ActiveCfg = Release|Any CPU 32 | {39080D45-3B8E-421D-8558-24AD7F0E448F}.Release|Any CPU.Build.0 = Release|Any CPU 33 | {3BA62C3F-DA9C-48EA-AC3A-DA9C17DB593C}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 34 | {3BA62C3F-DA9C-48EA-AC3A-DA9C17DB593C}.Debug|Any CPU.Build.0 = Debug|Any CPU 35 | {3BA62C3F-DA9C-48EA-AC3A-DA9C17DB593C}.Release|Any CPU.ActiveCfg = Release|Any CPU 36 | {3BA62C3F-DA9C-48EA-AC3A-DA9C17DB593C}.Release|Any CPU.Build.0 = Release|Any CPU 37 | {678D5030-1D4C-4FF7-ACE3-D5E67DA5F9BA}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 38 | {678D5030-1D4C-4FF7-ACE3-D5E67DA5F9BA}.Debug|Any CPU.Build.0 = Debug|Any CPU 39 | {678D5030-1D4C-4FF7-ACE3-D5E67DA5F9BA}.Release|Any CPU.ActiveCfg = Release|Any CPU 40 | {678D5030-1D4C-4FF7-ACE3-D5E67DA5F9BA}.Release|Any CPU.Build.0 = Release|Any CPU 41 | {FF353EC3-647B-40A2-9055-2E96AE127AB2}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 42 | {FF353EC3-647B-40A2-9055-2E96AE127AB2}.Debug|Any CPU.Build.0 = Debug|Any CPU 43 | {FF353EC3-647B-40A2-9055-2E96AE127AB2}.Release|Any CPU.ActiveCfg = Release|Any CPU 44 | {FF353EC3-647B-40A2-9055-2E96AE127AB2}.Release|Any CPU.Build.0 = Release|Any CPU 45 | {88AB92F5-A6BD-49DD-BCD3-6766FA2A2EC2}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 46 | {88AB92F5-A6BD-49DD-BCD3-6766FA2A2EC2}.Debug|Any CPU.Build.0 = Debug|Any CPU 47 | {88AB92F5-A6BD-49DD-BCD3-6766FA2A2EC2}.Release|Any CPU.ActiveCfg = Release|Any CPU 48 | {88AB92F5-A6BD-49DD-BCD3-6766FA2A2EC2}.Release|Any CPU.Build.0 = Release|Any CPU 49 | EndGlobalSection 50 | GlobalSection(SolutionProperties) = preSolution 51 | HideSolutionNode = FALSE 52 | EndGlobalSection 53 | GlobalSection(ExtensibilityGlobals) = postSolution 54 | SolutionGuid = {41FF9B16-4FB0-4875-A3D2-7D214963C613} 55 | EndGlobalSection 56 | EndGlobal 57 | -------------------------------------------------------------------------------- 
using System;
using System.IO;
using System.Threading;
using Gemipedia.API;
using Gemipedia.API.Models;
using Gemipedia.Converter;
using Gemipedia.Converter.Special;
using Gemipedia.Models;
using Gemipedia.Renderer;

namespace Gemipedia.Console;

/// <summary>
/// Console front end: prompts for an article title, fetches it through the
/// Wikipedia API client, converts it and renders it to standard output.
/// Also hosts an optional multi-threaded stress test over random articles.
/// </summary>
class Program
{
    // Upper bound on redirect hops followed for one request, so a redirect
    // cycle (or a redirect title that fails to parse) cannot loop forever.
    const int MaxRedirects = 10;

    // File the stress test appends conversion/rendering failures to.
    const string StressErrorLog = "/Users/billy/tmp/ERRORS.txt";

    static ThreadSafeCounter counter = new ThreadSafeCounter();

    /// <summary>
    /// Prompt/render loop. Ends on an empty line or at end of input.
    /// </summary>
    static void Main(string[] args)
    {
        //StressTest();

        while (true)
        {
            System.Console.WriteLine("Article?");
            string name = System.Console.ReadLine();
            // ReadLine() returns null once input is exhausted (e.g. piped stdin);
            // treat that like an empty line instead of looping forever.
            if (string.IsNullOrEmpty(name))
            {
                return;
            }

            var article = GetArticle(name);
            if (article == null)
            {
                System.Console.WriteLine("error fetching article");
                continue;
            }

            var converter = new WikiHtmlConverter();
            ParsedPage page = converter.Convert(article.Title, article.HtmlText);

            var renderer = new ArticleRenderer();
            renderer.RenderArticle(page, System.Console.Out);
        }
    }

    /// <summary>
    /// Starts 20 worker threads that convert random articles, then parks the
    /// calling thread forever.
    /// </summary>
    static void StressTest()
    {
        int workers = 20;
        for (int i = 0; i < workers; i++)
        {
            var thread = new Thread(new ThreadStart(DoStressWork));
            thread.Start();
        }

        while (true)
        {
            Thread.Sleep(30000);
        }
    }

    /// <summary>
    /// Worker loop: fetch a random article, convert and render it into a
    /// throw-away writer, and append any failure to <see cref="StressErrorLog"/>.
    /// </summary>
    static void DoStressWork()
    {
        var converter = new WikiHtmlConverter();
        var client = new WikipediaApiClient(UserOptions.WikipediaVersion);

        while (true)
        {
            var count = counter.Increment();
            string title = null;

            try
            {
                // inside the try so one failed API call does not kill the worker
                title = client.GetRandomArticleTitle();

                var article = GetArticle(title);
                System.Console.WriteLine($"{count}\t{title}");
                if (article == null)
                {
                    throw new InvalidOperationException("article could not be fetched");
                }

                ParsedPage page = converter.Convert(article.Title, article.HtmlText);

                StringWriter fout = new StringWriter();
                var renderer = new ArticleRenderer();
                renderer.RenderArticle(page, fout);
            }
            catch (Exception ex)
            {
                // trailing newline keeps consecutive entries from running together
                File.AppendAllText(StressErrorLog, $"\"{title}\"\t{ex.Message} - {ex.Source}\n==={ex.StackTrace}===\n");
            }
            Thread.Sleep(100);
        }
    }

    /// <summary>
    /// Fetches an article, following redirect pages to their target.
    /// </summary>
    /// <param name="title">Title of the article to fetch.</param>
    /// <returns>
    /// The resolved article, or null when the API reports no article or more
    /// than <see cref="MaxRedirects"/> redirects were encountered.
    /// </returns>
    static Article GetArticle(string title)
    {
        var client = new WikipediaApiClient(UserOptions.WikipediaVersion);

        for (int hop = 0; hop <= MaxRedirects; hop++)
        {
            Article article = client.GetArticle(title);
            if (article == null || !RedirectParser.IsArticleRedirect(article.HtmlText))
            {
                return article;
            }

            title = RedirectParser.GetRedirectTitle(article.HtmlText);
        }

        return null;
    }
}
using System;
using System.IO;
using System.Linq;
using System.Net;
using Gemipedia.Models;

namespace Gemipedia.Renderer;

/// <summary>
/// Renders the "references" page for an article: the links to other articles,
/// either for one section or for the whole article grouped by section.
/// </summary>
public class ReferencesRenderer
{
    TextWriter Writer;
    ParsedPage Page;

    /// <summary>
    /// Writes the references page to <paramref name="writer"/>.
    /// </summary>
    /// <param name="parsedPage">The converted article.</param>
    /// <param name="writer">Destination for the rendered text.</param>
    /// <param name="section">
    /// Section number to render; any value of 0 or less renders all sections.
    /// </param>
    public void RenderReferences(ParsedPage parsedPage, TextWriter writer, int section)
    {
        Writer = writer;
        Page = parsedPage;

        if (section > 0)
        {
            RenderSectionReferences(section);
        }
        else
        {
            RenderAllReferences();
        }
    }

    // Renders the links of a single section. When the section number does not
    // resolve, only the source link is written.
    private void RenderSectionReferences(int sectionNum)
    {
        var section = Page.GetSection(sectionNum);
        if (section != null)
        {
            var title = SectionName(section);

            Writer.WriteLine($"# References for {Page.Title}: {title}");
            Writer.WriteLine($"=> {RouteOptions.ArticleUrl(Page.Title)} Back to article");
            Writer.WriteLine($"=> {RouteOptions.ReferencesUrl(Page.Title)} See all references for article");
            Writer.WriteLine();
            Writer.WriteLine($"References to other articles in the '{title}' section");
            RenderLinks(section);
        }
        RenderSourceLink();
    }

    private string SectionName(Section section)
        => section.IsSpecial ? "Summary Section" : section.Title;

    // Renders every section's links, skipping the sections configured in
    // UserOptions.ArticleLinkSections.
    private void RenderAllReferences()
    {
        Writer.WriteLine($"# References for {Page.Title}");
        Writer.WriteLine($"=> {RouteOptions.ArticleUrl(Page.Title)} Back to article");
        Writer.WriteLine();
        Writer.WriteLine("References to other articles, organized by section");
        foreach (var subSection in Page.Sections.Where(x => !ShouldExcludeSectionIndex(x)))
        {
            RenderIndexForSection(subSection);
        }
        RenderSourceLink();
    }

    // Recursively renders a section heading (when it has links) and its links,
    // then its sub sections.
    private void RenderIndexForSection(Section section)
    {
        //only display the section title if this section has links
        if (HasLinks(section))
        {
            if (!section.IsSpecial)
            {
                //depth 2 gets a level 2 heading, all other sections are at a level 3
                var heading = (section.SectionDepth == 2) ? "##" : "###";
                Writer.WriteLine($"{heading} {section.Title}");
            }
            RenderLinks(section);
        }

        if (section.HasSubSections)
        {
            foreach (var subSection in section.SubSections.Where(x => !ShouldExcludeSectionIndex(x)))
            {
                RenderIndexForSection(subSection);
            }
        }
    }

    // One gemtext link line per article referenced by the section.
    private void RenderLinks(Section section)
    {
        foreach (var linkTitle in section.Links.GetLinks())
        {
            Writer.WriteLine($"=> {RouteOptions.ArticleUrl(linkTitle)} {linkTitle}");
        }
    }

    // Link back to the article on Wikipedia. Goes through RouteOptions so the
    // configured Wikipedia language is used instead of a hard-coded "en", and
    // spaces become underscores (wiki page paths do not treat '+' as a space).
    private void RenderSourceLink()
    {
        var escapedTitle = (Page.Title ?? "").Replace(' ', '_');
        Writer.WriteLine();
        Writer.WriteLine($"=> {RouteOptions.WikipediaSourceUrl(escapedTitle)} Source on Wikipedia");
    }

    //does this section have any links to render?
    private bool HasLinks(Section section)
        => section.Links.HasLinks;

    // Invariant lower-casing: the comparison must not depend on the server's culture.
    private bool ShouldExcludeSectionIndex(Section section)
        => UserOptions.ArticleLinkSections.Contains(section.Title?.ToLowerInvariant());
}
using System;
using System.Collections.Generic;
using AngleSharp.Html.Dom;

namespace Gemipedia.Converter.Filter;

/// <summary>
/// Removes DOM objects that match certain rules. Rules are simple selectors of
/// the form "tag", "tag.class", "tag#id", ".class" or "#id".
/// </summary>
public class DomFilter
{
    public static DomFilter Global = new DomFilter();

    // rules that name a tag, keyed by lower-case tag name
    Dictionary<string, List<FilterRule>> TagFilters;

    // rules that only name a class
    List<FilterRule> JustClassRules;

    // rules that only name an id
    List<FilterRule> JustIDs;

    public DomFilter()
    {
        TagFilters = new Dictionary<string, List<FilterRule>>();
        JustClassRules = new List<FilterRule>();
        JustIDs = new List<FilterRule>();
    }

    /// <summary>
    /// Checks an element against the registered rules.
    /// </summary>
    /// <param name="element">Element to test.</param>
    /// <param name="normalizedTagName">The element's tag name, lower-cased by the caller.</param>
    /// <returns>False when a rule matches the element, otherwise true.</returns>
    public bool IsElementAllowed(HtmlElement element, string normalizedTagName)
    {
        //check for tag-specific rules
        if (TagFilters.TryGetValue(normalizedTagName, out var tagRules))
        {
            foreach (var rule in tagRules)
            {
                // NOTE(review): a rule that names only a tag (no class or id)
                // matches nothing here - confirm that is intended.
                if (rule.HasClass)
                {
                    if (element.ClassList.Contains(rule.ClassName))
                    {
                        return false;
                    }
                }
                else if (rule.HasID)
                {
                    if ((element.Id ?? "") == rule.ID)
                    {
                        return false;
                    }
                }
            }
        }

        if (element.ClassList.Length > 0)
        {
            foreach (var rule in JustClassRules)
            {
                if (element.ClassList.Contains(rule.ClassName))
                {
                    return false;
                }
            }
        }

        if (!string.IsNullOrEmpty(element.Id))
        {
            foreach (var rule in JustIDs)
            {
                if (element.Id == rule.ID)
                {
                    return false;
                }
            }
        }
        return true;
    }

    /// <summary>
    /// Registers a rule. A selector containing '.' is read as tag.class; otherwise
    /// one containing '#' is read as tag#id; otherwise it is a bare tag name.
    /// The tag part may be empty (".class" / "#id").
    /// </summary>
    /// <param name="selector">The selector to register.</param>
    /// <exception cref="ArgumentNullException"><paramref name="selector"/> is null.</exception>
    public void AddRule(string selector)
    {
        if (selector == null)
        {
            throw new ArgumentNullException(nameof(selector));
        }

        string tag = "";
        string cls = "";
        string id = "";

        // Invariant lower-casing: tag names are identifiers, and culture-aware
        // casing (e.g. Turkish dotless i) would stop them matching.
        if (selector.Contains('.'))
        {
            tag = ClipBefore(selector, '.').ToLowerInvariant();
            cls = ClipAfter(selector, '.');
        }
        else if (selector.Contains('#'))
        {
            tag = ClipBefore(selector, '#').ToLowerInvariant();
            id = ClipAfter(selector, '#');
        }
        else
        {
            tag = selector.ToLowerInvariant();
        }

        var rule = new FilterRule
        {
            TagName = tag,
            ClassName = cls,
            ID = id
        };

        if (rule.HasTag)
        {
            if (!TagFilters.TryGetValue(rule.TagName, out var rules))
            {
                rules = new List<FilterRule>();
                TagFilters[rule.TagName] = rules;
            }
            rules.Add(rule);
        }
        else if (rule.HasClass)
        {
            JustClassRules.Add(rule);
        }
        else if (rule.HasID)
        {
            JustIDs.Add(rule);
        }
    }

    // Text after the first occurrence of c, or empty when c is absent or last.
    private string ClipAfter(string s, char c)
    {
        int x = s.IndexOf(c);
        if (x >= 0 && x + 1 != s.Length)
        {
            return s.Substring(x + 1);
        }
        return string.Empty;
    }

    // Text before the first occurrence of c, or empty when c is absent or first.
    private string ClipBefore(string s, char c)
    {
        int x = s.IndexOf(c);
        return x > 0 ? s.Substring(0, x) : string.Empty;
    }
}
s.Substring(0, x) : String.Empty; 134 | } 135 | } -------------------------------------------------------------------------------- /Changelog.txt: -------------------------------------------------------------------------------- 1 | Changelog: 2 | - Feature: Organize referenced links by section (+ don't show duplicates of links that appear in more than 1 section) 3 | - Feature: separate image gallery for article showing all the media 4 | - Feature: Don't show a section if it doesn't have any content 5 | - Feature: Render tables as ASCII tables 6 | - Feature: Support math elements (SVG images transcoded on the fly as PNG) 7 | - Feature: Use the 2x image for better resolution 8 | - Feature: Include video links in Image gallery 9 | - Feature: Show an image for video media using poster attribute 10 | - Fix: Better support for nested tables 11 | - Fix: Skip audio pronunciation links 12 | - Fix: Skip pronunciation helper links 13 | - Create separate page for reference links, include specific link for each section to show links for that section 14 | - Feature: show count for reference links per section, and for all references 15 | - Feature: support SUP tag 16 | - Fix: remove meta data text (e.g. "Citation needed" and "original research" text/links) 17 | - Fix: Only display 1 format for geo coordinates 18 | - Feature: PDF for article 19 | - Fix: Link to original article on Wikipedia (needed underscore escaping) 20 | - Fix Table padding bug (Zero Width space characters!) 
- German submarine U-48 (1939) 21 | - Fix: Tables don't support Row spans 22 | - Fix: Collect reference links from table captions 23 | - Fix: Navigation suggestions not rendered if more than 1 hyperlink in a sentence 24 | - Fix: attempt to support tables with malformed colspan/rowspan values (iPod article) 25 | - Fix: properly render tables with incorrect/excessive colspan values (iPod article) 26 | - Feature: Support timelines (extract the image, properly extract article links from MAP tag) 27 | - Fix: exclude navigation sections from section's references 28 | - Fix: exclude fragment from article title when collecting references 29 | - Feature: Speed up views with Disk cache for Wikipedia content 30 | (especially important for sub-pages like gallery and references, since those would otherwise need to refetch the article JSON) 31 | - Feature: Sharper looking math formulas by referencing Wikipedia's PNGs directly 32 | - Feature: Add White background to transparent images for better reading on clients with dark mode 33 | - Feature: Serve media with proper extension, mimetype, for better downloading (easier to tell if something is an animated GIF, etc) 34 | - Fix: Newlines in captions (Project Gutenberg article) 35 | - Feature: support image maps on all images, not just timelines (Broadway theatre article) 36 | - Fix: complete parser rewrite that supports more content without special case handling, and fixes rendering errors 37 | - Feature: Includes search snippet in search results 38 | - Feature: Use the 1.5x image if 2x image is not available 39 | - Feature: Support for Image Galleries (images would show up using generic media finder, but weren't getting the appropriate caption) 40 | - Feature: Add links to search for other pages about an article 41 | - Feature: Added Featured Content view from front page of Wikipedia. Contains Featured Article and most popular pages 42 | - Feature: Geographic view! 
Extracts coordinates and provides links to OpenStreetMap and native Map apps 43 | - Feature: Find articles near another article 44 | - Feature: Support chemical and physics formulas by converting subscript and superscript tags into Unicode Subscript/Superscript characters! 45 | - Feature: Support side-by-side comparisons in Infoboxes (basically any article about a conflict, e.g. World War II) 46 | - Fix: Better handling of nested tables in Infoboxes 47 | - Fix: Ignore links to Wikidata 48 | - Feature: Support multiple Wikipedia languages 49 | - Fix: Handle empty infoboxes 50 | - Fix: Handle empty rows/empty tables 51 | - Fix: handle malformed colspans 52 | - Fix: handle incorrect nested headers (H1 in output) 53 | - Fix: handle geo coordinates with missing levels of precision 54 | - Feature: Added support for Simple English Wikipedia 55 | - Fix: Added support for new media HTML structure. See: https://www.mediawiki.org/wiki/Parsoid/Parser_Unification/Media_structure/FAQ 56 | - Fix: Better detection of geohack URLs allowed for more links to be displayed 57 | - Feature: Better captions on galleries and montages 58 | - Fix: Crashing on wide unicode in tables 59 | - Feature: Added download and convert times to footer 60 | - Feature: Added original and converted size to footer -------------------------------------------------------------------------------- /Gemipedia/Converter/Special/SuperscriptConverter.cs: -------------------------------------------------------------------------------- 1 | using System.Text; 2 | 3 | namespace Gemipedia.Converter.Special; 4 | 5 | public class SuperscriptConverter 6 | { 7 | 8 | public string Original { get; private set; } 9 | public string Converted { get; private set; } 10 | 11 | StringBuilder buffer = new StringBuilder(); 12 | 13 | public bool IsFullyConverted { get; private set; } = true; 14 | 15 | public bool Convert(string s) 16 | { 17 | Original = s; 18 | Converted = ""; 19 | 20 | buffer.Clear(); 21 | IsFullyConverted = true; 
/// <summary>
/// Converts text into Unicode superscript characters. A string only counts as
/// converted when every one of its characters has a superscript form.
/// </summary>
public class SuperscriptConverter
{
    // returned by Lookup for characters without a superscript form
    private const char NoMapping = '\0';

    private readonly StringBuilder buffer = new StringBuilder();

    /// <summary>The string passed to the most recent <see cref="Convert"/> call.</summary>
    public string Original { get; private set; }

    /// <summary>The superscript text, or empty when the last conversion was not complete.</summary>
    public string Converted { get; private set; }

    /// <summary>False once a character without a superscript form has been seen.</summary>
    public bool IsFullyConverted { get; private set; } = true;

    /// <summary>
    /// Converts <paramref name="s"/>, stopping at the first character that has
    /// no superscript form.
    /// </summary>
    /// <param name="s">Text to convert.</param>
    /// <returns>True when every character was converted.</returns>
    public bool Convert(string s)
    {
        Original = s;
        Converted = "";

        buffer.Clear();
        IsFullyConverted = true;

        for (int i = 0; i < s.Length; i++)
        {
            char mapped = ConvertChar(s[i]);
            buffer.Append(mapped);
            if (!IsFullyConverted)
            {
                //give up, leaving Converted empty
                return false;
            }
        }

        Converted = buffer.ToString();
        return IsFullyConverted;
    }

    /// <summary>
    /// Maps one character to its superscript form. When there is none, clears
    /// <see cref="IsFullyConverted"/> and returns the character unchanged.
    /// </summary>
    public char ConvertChar(char c)
    {
        char mapped = Lookup(c);
        if (mapped == NoMapping)
        {
            IsFullyConverted = false;
            return c;
        }
        return mapped;
    }

    private static char Lookup(char c) => c switch
    {
        //digits
        '0' => '\u2070',
        '1' => '\u00B9',
        '2' => '\u00B2',
        '3' => '\u00B3',
        '4' => '\u2074',
        '5' => '\u2075',
        '6' => '\u2076',
        '7' => '\u2077',
        '8' => '\u2078',
        '9' => '\u2079',

        //ASCII plus, small plus sign, full width plus sign
        '+' or '\uFE62' or '\uFF0B' => '\u207A',
        //ASCII minus, small hyphen-minus, full width hyphen-minus, minus sign
        '-' or '\uFE63' or '\uFF0D' or '\u2212' => '\u207B',
        //ASCII equals, small equals sign, full width equals sign
        '=' or '\uFE66' or '\uFF1D' => '\u207C',

        '(' => '\u207D',
        ')' => '\u207E',

        //lowercase ('q' is left out: there is no widely supported superscript q)
        'a' => '\u1D43',
        'b' => '\u1D47',
        'c' => '\u1D9C',
        'd' => '\u1D48',
        'e' => '\u1D49',
        'f' => '\u1DA0',
        'g' => '\u1D4D',
        'h' => '\u02B0',
        'i' => '\u2071',
        'j' => '\u02B2',
        'k' => '\u1D4F',
        'l' => '\u02E1',
        'm' => '\u1D50',
        'n' => '\u207F',
        'o' => '\u1D52',
        'p' => '\u1D56',
        'r' => '\u02B3',
        's' => '\u02E2',
        't' => '\u1D57',
        'u' => '\u1D58',
        'v' => '\u1D5B',
        'w' => '\u02B7',
        'x' => '\u02E3',
        'y' => '\u02B8',
        'z' => '\u1DBB',

        //uppercase (only the letters listed here are mapped)
        'A' => '\u1D2C',
        'B' => '\u1D2E',
        'D' => '\u1D30',
        'E' => '\u1D31',
        'G' => '\u1D33',
        'H' => '\u1D34',
        'I' => '\u1D35',
        'J' => '\u1D36',
        'K' => '\u1D37',
        'L' => '\u1D38',
        'M' => '\u1D39',
        'N' => '\u1D3A',
        'O' => '\u1D3C',
        'P' => '\u1D3E',
        'R' => '\u1D3F',
        'T' => '\u1D40',
        'U' => '\u1D41',
        'V' => '\u2C7D',
        'W' => '\u1D42',

        //greek
        'α' => '\u1D45',
        'β' => '\u1D5D',
        'γ' => '\u1D5E',
        'δ' => '\u1D5F',
        '∊' => '\u1D4B',
        'θ' => '\u1DBF',
        'ι' => '\u1DA5',
        'Φ' => '\u1DB2',
        'φ' => '\u1D60',
        'χ' => '\u1D61',

        _ => NoMapping,
    };
}
/// <summary>
/// Extracts the text of a DOM node and its descendants into a buffer,
/// collecting the article links it encounters along the way.
/// </summary>
public class TextExtractor : ITextContent
{
    private static readonly Regex whitespace = new Regex(@"\s+", RegexOptions.Compiled);

    private Buffer buffer = new Buffer();

    /// <summary>
    /// The extracted text, with newlines collapsed when
    /// <see cref="ShouldCollapseNewlines"/> is set.
    /// </summary>
    public string Content
        => ShouldCollapseNewlines ?
            CollapseNewlines(buffer.Content) :
            buffer.Content;

    public ArticleLinkCollection Links
        => buffer.Links;

    public bool ShouldCollapseNewlines { get; set; } = false;
    public bool ShouldConvertImages { get; set; } = false;

    //sets the string we use for newline replacement
    public string NewlineReplacement { get; set; } = " ";

    /// <summary>
    /// Extracts from the first non-null node in <paramref name="nodes"/>.
    /// </summary>
    public void Extract(params INode[] nodes)
        => Extract(nodes.Where(x => x != null).FirstOrDefault());

    /// <summary>
    /// Resets the buffer and extracts the text of <paramref name="current"/>.
    /// A null node leaves the buffer empty.
    /// </summary>
    public void Extract(INode current)
    {
        buffer.Reset();
        if (current == null)
        {
            //nothing to do
            return;
        }
        ExtractInnerTextHelper(current);
    }

    private void ExtractInnerTextHelper(INode current)
    {
        switch (current.NodeType)
        {
            case NodeType.Text:
                {
                    string text = current.TextContent;
                    //keep real text, and whitespace-only runs that do not
                    //contain a newline
                    if (text.Trim().Length > 0 || !text.Contains('\n'))
                    {
                        buffer.Append(text);
                    }
                }
                break;

            case NodeType.Element:
                {
                    // NOTE(review): elements that are not HtmlElement leave
                    // element and nodeName null here - confirm the HtmlParser
                    // helpers accept that.
                    HtmlElement element = current as HtmlElement;
                    // Invariant lower-casing: with culture-aware casing "IMG"
                    // becomes "ımg" under a Turkish culture and would never
                    // match the cases below.
                    var nodeName = element?.NodeName.ToLowerInvariant();

                    if (!HtmlParser.ShouldProcessElement(element, nodeName))
                    {
                        return;
                    }

                    switch (nodeName)
                    {
                        case "a":
                            Links.Add(element);
                            ExtractChildrenText(current);
                            break;

                        case "br":
                            buffer.AppendLine();
                            break;

                        case "img":
                            if (ShouldConvertImages)
                            {
                                buffer.Append(ConvertImage(element));
                            }
                            break;

                        default:
                            if (HtmlParser.ShouldDisplayAsBlock(element))
                            {
                                //block content starts and ends on its own line
                                buffer.EnsureAtLineStart();
                                ExtractChildrenText(current);
                                buffer.EnsureAtLineStart();
                            }
                            else
                            {
                                ExtractChildrenText(current);
                            }
                            break;
                    }
                }
                break;
        }
    }

    private void ExtractChildrenText(INode element)
    {
        foreach (var child in element.ChildNodes)
        {
            ExtractInnerTextHelper(child);
        }
    }

    //converts newlines to the replacement string. since that can create runs
    //of whitespace, collapse those if they exist
    private string CollapseNewlines(string s)
        => CollapseSpaces(ConvertNewlines(s));

    private string ConvertNewlines(string s)
        => s.Replace("\n", NewlineReplacement).Trim();

    private string CollapseSpaces(string s)
        => whitespace.Replace(s, " ");

    //text placeholder for an image, built from its alt text (or title)
    private string ConvertImage(HtmlElement element)
    {
        var alt = element.GetAttribute("alt");
        if (string.IsNullOrEmpty(alt))
        {
            alt = element.GetAttribute("title");
        }
        return !string.IsNullOrEmpty(alt) ?
            $"[Image: {alt}] " :
            "";
    }
}
using System;
using System.Collections.Generic;
using System.Globalization;
using System.Net;
using CacheComms;
using Gemipedia.API.Models;

namespace Gemipedia.API;

/// <summary>
/// Wikipedia API client. Contacts the API and gets model objects back
/// </summary>
public class WikipediaApiClient
{
    // path layout the featured-content feed expects for a date
    const string FeedDateFormat = "yyyy/MM/dd";

    HttpRequestor Requestor;
    string Language;

    public long DownloadTimeMs => Requestor.DownloadTimeMs;

    public int DownloadSize => Requestor.BodySize ?? 0;

    /// <param name="lang">Language code used as the Wikipedia sub-domain.</param>
    public WikipediaApiClient(string lang = "en")
    {
        Requestor = new HttpRequestor();
        Language = lang;
    }

    /// <summary>
    /// Finds articles within 5000 of the given coordinates (at most 100 results).
    /// </summary>
    public List<ArticleSummary> GeoSearch(double lat, double lon)
    {
        // Invariant formatting: a culture with a decimal comma would otherwise
        // produce coordinates the API cannot parse.
        string latText = lat.ToString(CultureInfo.InvariantCulture);
        string lonText = lon.ToString(CultureInfo.InvariantCulture);
        var url = new Uri($"https://{Language}.wikipedia.org/w/api.php?action=query&format=json&list=geosearch&gscoord={latText}%7C{lonText}&gsradius=5000&gslimit=100");
        string json = FetchString(url);
        return ResponseParser.ParseGeoSearch(json);
    }

    //Gets the title of a random article
    public string GetRandomArticleTitle()
    {
        var url = new Uri($"https://{Language}.wikipedia.org/w/api.php?action=query&format=json&list=random&rnnamespace=0&rnlimit=1");
        //never cached, otherwise every call would return the same article
        string json = FetchString(url, false);
        return ResponseParser.ParseRandomArticle(json);
    }

    /// <summary>
    /// Gets an article
    /// </summary>
    /// <param name="title">Title of the article.</param>
    /// <returns>The article as produced by the response parser.</returns>
    public Article GetArticle(string title)
    {
        var url = new Uri($"https://{Language}.wikipedia.org/w/api.php?action=parse&page={WebUtility.UrlEncode(title)}&prop=text&format=json");
        string json = FetchString(url);
        return ResponseParser.ParseArticleResponse(json);
    }

    /// <summary>
    /// Gets today's featured content. When today's feed has no popular
    /// articles yet, yesterday's popular articles are used instead.
    /// </summary>
    public FeaturedContent GetFeaturedContent()
    {
        //if you fetch the most popular content early in the day, there aren't any popular articles
        var url = FeaturedUrl(DateTime.Now);
        // NOTE(review): an earlier comment here said not to use the cache, but
        // the call has always used the default (cached) path - confirm which
        // is intended.
        string json = FetchString(url);
        var featured = ResponseParser.ParseFeaturedContentResponse(json);

        if (featured.PopularArticles.Count == 0)
        {
            //fetch yesterday's most popular articles
            var yesterday = DateTime.Now.AddDays(-1);
            var oldFeatured = ResponseParser.ParseFeaturedContentResponse(FetchString(FeaturedUrl(yesterday)));
            featured.PopularArticles = oldFeatured.PopularArticles;
        }

        return featured;
    }

    // The invariant culture keeps '/' as the separator and the Gregorian
    // calendar: with the current culture, "/" in a custom format is replaced
    // by that culture's date separator.
    private Uri FeaturedUrl(DateTime day)
        => new Uri($"https://{Language}.wikipedia.org/api/rest_v1/feed/featured/{day.ToString(FeedDateFormat, CultureInfo.InvariantCulture)}");

    /// <summary>
    /// Gets the versions of an article that exist in other languages.
    /// </summary>
    public List<ArticleSummary> GetOtherLanguages(string title)
    {
        //API wants whitespace encoded as underscores
        title = title.Replace(" ", "_");
        var url = new Uri($"https://{Language}.wikipedia.org/w/rest.php/v1/page/{WebUtility.UrlEncode(title)}/links/language");
        string json = FetchString(url);
        return ResponseParser.ParseOtherLanguagesResponse(json);
    }

    /// <summary>
    /// Performs a search using the "rest.php/v1/search/page" endpoint
    /// </summary>
    /// <param name="query">Search text.</param>
    /// <returns>Up to 25 matching articles.</returns>
    public List<ArticleSummary> Search(string query)
    {
        var url = new Uri($"https://{Language}.wikipedia.org/w/rest.php/v1/search/page?q={WebUtility.UrlEncode(query)}&limit=25");
        string json = FetchString(url);
        return ResponseParser.ParseSearchResponse(json);
    }

    //gets an image
    public byte[] GetMedia(string url)
        => FetchBytes(url);

    //Downloads a string, if its not already cached. Returns an empty string
    //when the request fails
    private string FetchString(Uri url, bool useCache = true)
    {
        var result = Requestor.GetAsString(url, useCache);
        if (!result)
        {
            return "";
        }
        return Requestor.BodyText;
    }

    /// <summary>
    /// Fetches the bytes for a URL. If it exists in the cache, it gets pulled
    /// otherwise a network request happens, and the results are cached
    /// </summary>
    /// <param name="url">Absolute URL to fetch.</param>
    /// <param name="useCache">Passed through to the requestor.</param>
    /// <returns>The body bytes, or null when the request fails.</returns>
    private byte[] FetchBytes(string url, bool useCache = true)
    {
        var result = Requestor.GetAsBytes(new Uri(url), useCache);
        if (!result)
        {
            return null;
        }
        return Requestor.BodyBytes;
    }
}
using System;
using System.Globalization;
using System.IO;
using System.Net;

namespace Gemipedia;

/// <summary>
/// Builds the URLs used to link between views. The base URLs are assigned by
/// the hosting application; unless the Wikipedia version is "en", most routes
/// get the language appended as an extra path segment.
/// </summary>
public static class RouteOptions
{
    // extension used for proxied media when none can be derived from the URL
    private const string DefaultMediaExtension = ".jpg";

    #region base URLs

    /// <summary>
    /// Base URL to use to view an article. Actual article passed via query string
    /// </summary>
    public static string BaseArticleUrl { get; set; }

    public static string BaseFeaturedContenteUrl { get; set; }

    /// <summary>
    /// BaseURL to use to view geographic data.
    /// </summary>
    public static string BaseGeoUrl { get; set; }

    public static string BaseImageGallerUrl { get; set; }

    public static string BaseLanguageUrl { get; set; }

    public static string BaseLonLatUrl { get; set; }

    /// <summary>
    /// URL to use to proxy media. actual media path passed via query string
    /// </summary>
    public static string BaseMediaProxyUrl { get; set; }

    public static string BaseOtherLanguagesUrl { get; set; }

    public static string BaseRandomArticleUrl { get; set; }

    public static string BaseReferencesUrl { get; set; }

    public static string BaseSearchUrl { get; set; }

    public static string BaseSetLanguageUrl { get; set; }

    public static string BaseWelcomeUrl { get; set; }

    #endregion

    public static string ArticleUrl()
        => $"{AddLanguage(BaseArticleUrl)}";

    public static string ArticleUrl(string title)
        => $"{AddLanguage(BaseArticleUrl)}?{WebUtility.UrlEncode(title)}";

    public static string ArticleUrl(string title, string forceInLang)
        => $"{BaseArticleUrl}/{forceInLang}?{WebUtility.UrlEncode(title)}";

    public static string FeaturedContent()
        => $"{AddLanguage(BaseFeaturedContenteUrl)}";

    public static string GeoUrl(string geohackUrl)
        => $"{AddLanguage(BaseGeoUrl)}?{WebUtility.UrlEncode(geohackUrl)}";

    public static string ImageGalleryUrl(string title)
        => $"{AddLanguage(BaseImageGallerUrl)}?{WebUtility.UrlEncode(title)}";

    /// <summary>
    /// URL of the view for a coordinate pair. The numbers are formatted with
    /// the invariant culture so they always use '.' as the decimal separator.
    /// </summary>
    public static string LonLatUrl(double latitude, double longitude, string articleTitle)
    {
        string lat = latitude.ToString(CultureInfo.InvariantCulture);
        string lon = longitude.ToString(CultureInfo.InvariantCulture);
        return $"{AddLanguage(BaseLonLatUrl)}?lat={lat}&lon={lon}&title={WebUtility.UrlEncode(articleTitle)}";
    }

    /// <summary>
    /// URL that proxies a media file.
    /// </summary>
    /// <param name="url">URL of the media to proxy.</param>
    public static string MediaProxyUrl(string url)
    {
        //we need to have an extension on the filename of the media proxy URL, so clients
        //will render it as an inline image. Try and figure out what to use, but fall back
        //to a dummy "jpg" if nothing works
        string ext = DefaultMediaExtension;
        if (Uri.TryCreate(url, UriKind.Absolute, out var uri))
        {
            var found = Path.GetExtension(uri.AbsolutePath);
            if (!string.IsNullOrEmpty(found))
            {
                ext = found;
            }
        }
        return $"{BaseMediaProxyUrl}{ext}?{WebUtility.UrlEncode(url)}";
    }

    public static string OtherLanguagesUrl(string title)
        => $"{AddLanguage(BaseOtherLanguagesUrl)}?{WebUtility.UrlEncode(title)}";

    public static string PdfUrl(string escapedTitle)
        => $"https://{UserOptions.WikipediaVersion}.wikipedia.org/api/rest_v1/page/pdf/{WebUtility.UrlEncode(escapedTitle)}";

    public static string RandomArticleUrl()
        => $"{AddLanguage(BaseRandomArticleUrl)}";

    public static string ReferencesUrl(string title)
        => $"{AddLanguage(BaseReferencesUrl)}?name={WebUtility.UrlEncode(title)}";

    /// <summary>
    /// URL of the references view for a single section of an article.
    /// </summary>
    public static string ReferencesUrl(string title, int sectionNum)
        => $"{AddLanguage(BaseReferencesUrl)}?name={WebUtility.UrlEncode(title)}&section={sectionNum}";

    public static string SearchUrl()
        => $"{AddLanguage(BaseSearchUrl)}";

    public static string SearchUrl(string query)
        => $"{AddLanguage(BaseSearchUrl)}?{WebUtility.UrlEncode(query)}";

    public static string SelectLanguageUrl()
        => $"{AddLanguage(BaseLanguageUrl)}";

    public static string SetLanguageUrl()
        => BaseSetLanguageUrl;

    public static string WelcomeUrl()
        => $"{AddLanguage(BaseWelcomeUrl)}";

    public static string WelcomeUrl(string forceLang)
        => $"{BaseWelcomeUrl}/{forceLang}";

    public static string WikipediaSourceUrl(string escapedTitle)
        => $"https://{UserOptions.WikipediaVersion}.wikipedia.org/wiki/{WebUtility.UrlEncode(escapedTitle)}";

    //if we can help it, avoid adding a language, since it increases the size of the URL
    //which can cause problems if we have to proxy something long
    private static string AddLanguage(string url)
        => (UserOptions.WikipediaVersion == "en") ? url : url + '/' + UserOptions.WikipediaVersion;
}
url : url + '/' + UserOptions.WikipediaVersion;
}
--------------------------------------------------------------------------------
/Gemipedia/API/ResponseParser.cs:
--------------------------------------------------------------------------------
using System;
using System.Collections.Generic;
using System.Linq;
using System.Net;
using System.Text.RegularExpressions;
using Gemipedia.API.Models;
using Newtonsoft.Json.Linq;

namespace Gemipedia.API;

/// <summary>
/// Parses the JSON responses of the Wikipedia API into model objects
/// </summary>
public static class ResponseParser
{
    /// <summary>
    /// Parses a "parse" API response into an <see cref="Article"/>.
    /// </summary>
    /// <returns>The article, or null when the response contains an "error" member.</returns>
    public static Article ParseArticleResponse(string json)
    {
        var response = ParseJson(json);

        if (response["error"] != null)
        {
            //error loading page!
            return null;
        }

        return new Article
        {
            Title = Cleanse(response["parse"]["title"]),
            PageId = Convert.ToInt64(Cleanse(response["parse"]["pageid"])),
            HtmlText = Cleanse(response["parse"]["text"]["*"]),
        };
    }

    /// <summary>
    /// Parses a geosearch response into article summaries with a rounded distance.
    /// Returns an empty list when the response has no geosearch array.
    /// </summary>
    public static List<ArticleSummary> ParseGeoSearch(string json)
    {
        var response = ParseJson(json);
        List<ArticleSummary> ret = new List<ArticleSummary>();

        if (response["query"] != null && response["query"]["geosearch"] is JArray results)
        {
            //skip the first since that's the article we are on

            foreach (JObject result in results.Skip(1))
            {
                ret.Add(new ArticleSummary
                {
                    Title = Cleanse(result["title"]),
                    Distance = (int)Math.Round(Convert.ToDouble(result["dist"]?.ToString() ?? "0"))
                });
            }
        }

        return ret;
    }

    /// <summary>
    /// Parses a search response ("pages" array) into article summaries.
    /// Returns an empty list when the response has no "pages" array.
    /// </summary>
    public static List<ArticleSummary> ParseSearchResponse(string json)
    {
        var response = ParseJson(json);
        List<ArticleSummary> ret = new List<ArticleSummary>();

        //a response without a "pages" array yields no results instead of a NullReferenceException
        if (response["pages"] is JArray pages)
        {
            foreach (JObject result in pages)
            {
                ret.Add(new ArticleSummary
                {
                    Title = StripNewlines(Cleanse(result["title"])),
                    Excerpt = StripNewlines(StripHtml(Cleanse(result["excerpt"]))),
                    Description = StripNewlines(Cleanse(result["description"])),
                    ThumbnailUrl = GetThumbnailUrl(result["thumbnail"] as JObject)
                });
            }
        }
        return ret;
    }

    /// <summary>
    /// Parses a JSON array of language links into summaries carrying title and language code.
    /// </summary>
    public static List<ArticleSummary> ParseOtherLanguagesResponse(string json)
    {
        var response = JArray.Parse(json);
        List<ArticleSummary> ret = new List<ArticleSummary>();
        foreach (JObject result in response)
        {
            ret.Add(new ArticleSummary
            {
                Title = Cleanse(result["title"]),
                LanguageCode = Cleanse(result["code"])
            });
        }
        return ret;
    }

    /// <summary>
    /// Parses the featured content response: the "tfa" article and up to 25 "mostread" articles.
    /// </summary>
    public static FeaturedContent ParseFeaturedContentResponse(string json)
    {
        var response = ParseJson(json);
        return new FeaturedContent
        {
            FeaturedArticle = ParseArticleSummary(response["tfa"] as JObject),
            PopularArticles = ParsePopularArticles(response["mostread"] as JObject)
        };
    }

    /// <summary>
    /// Returns the title of the first entry of query.random in the response.
    /// </summary>
    public static string ParseRandomArticle(string json)
    {
        var response = ParseJson(json);
        return response["query"]["random"][0]["title"].Value<string>();
    }

    private static List<ArticleSummary> ParsePopularArticles(JObject articles)
    {
        List<ArticleSummary> ret = new List<ArticleSummary>();

        //tolerate both a missing "mostread" object and a missing "articles" array
        if (articles?["articles"] is JArray list)
        {
            foreach (JObject article in list.Take(25))
            {
                ret.Add(ParseArticleSummary(article));
            }
        }
        return ret;
    }

    private static ArticleSummary ParseArticleSummary(JObject summary)
        => (summary != null) ?
            new ArticleSummary
            {
                Title = StripNewlines(Cleanse(summary["normalizedtitle"])),
                Description = StripNewlines(Cleanse(summary["description"])),
                //already text formatted!
                Excerpt = StripNewlines(Cleanse(summary["extract"])),
                ThumbnailUrl = GetThumbnailUrl(summary["thumbnail"] as JObject)
            } : null;

    private static string GetThumbnailUrl(JObject thumb)
    {
        //result["thumbnail"]?["url"]? doesn't seem to work
        if (thumb != null)
        {
            var url = thumb["url"]?.ToString() ??
                thumb["source"]?.ToString() ?? "";
            if (url.Length > 0)
            {
                return CommonUtils.EnsureHttps(url);
            }
        }

        return "";
    }

    private static string StripNewlines(string s)
        => s.Replace("\r\n", " ").Replace("\r", " ").Replace("\n", " ").Trim();

    //a missing token becomes an empty string
    private static string Cleanse(JToken token)
        => token?.ToString() ?? "";

    private static JObject ParseJson(string json)
        => JObject.Parse(json);

    //removes tags, decodes entities, and appends an ellipsis
    private static string StripHtml(string s)
        => WebUtility.HtmlDecode(Regex.Replace(s, @"<[^>]*>", "")) + "...";
}

--------------------------------------------------------------------------------
/Gemipedia/Renderer/ArticleRenderer.cs:
--------------------------------------------------------------------------------
using System;
using System.IO;
using System.Linq;
using Gemipedia.Models;

namespace Gemipedia.Renderer;

public class ArticleRenderer
{
    TextWriter Writer;
    ParsedPage Page;
    int sectionID = 0;
    //grab and cache it
    string[] articleLinkSections = UserOptions.ArticleLinkSections;

    public void RenderArticle(ParsedPage parsedPage, TextWriter writer)
    {
        Writer = writer;
        Page = parsedPage;

        RenderArticleHeader();
        foreach (var section in parsedPage.Sections)
        {
            Writer.Write(RenderSection(section));
        }
RenderArticleFooter(parsedPage);
    }

    //writes the title line, an optional gallery link, and a search link
    private void RenderArticleHeader()
    {
        Writer.WriteLine($"# {Page.Title}");

        int imageCount = Page.GetAllImages().Count;
        if (imageCount > 0)
        {
            Writer.WriteLine($"=> {RouteOptions.ImageGalleryUrl(Page.Title)} Gallery: {imageCount} images");
        }

        //TODO: Geo here!
        Writer.WriteLine($"=> {RouteOptions.SearchUrl(Page.Title)} Other articles that mention '{Page.Title}'");
        Writer.WriteLine();
    }

    //writes the "Article Resources" link list that closes every article
    private void RenderArticleFooter(ParsedPage parsedPage)
    {
        string title = Page.Title;
        string escapedTitle = Page.EscapedTitle;

        Writer.WriteLine();
        Writer.WriteLine("## Article Resources");
        Writer.WriteLine($"=> {RouteOptions.ReferencesUrl(title)} List of all {parsedPage.GetReferenceCount()} referenced articles");
        Writer.WriteLine($"=> {RouteOptions.SearchUrl(title)} Search for articles that mention '{title}'");
        Writer.WriteLine($"=> {RouteOptions.OtherLanguagesUrl(title)} Read this article in another language");
        Writer.WriteLine($"=> {RouteOptions.PdfUrl(escapedTitle)} Download article PDF for offline access");
        Writer.WriteLine($"=> {RouteOptions.WikipediaSourceUrl(escapedTitle)} Source on Wikipedia website");
    }

    /// <summary>
    /// Appends an infobox to <paramref name="buffer"/>: heading, navigation
    /// suggestions, geo items, media items, then the text content.
    /// </summary>
    public void RenderInfobox(SimpleBuffer buffer, InfoboxItem infobox)
    {
        string heading = "Quick Facts";
        if (!string.IsNullOrEmpty(infobox.CustomTitle))
        {
            heading = $"Quick Facts: {infobox.CustomTitle}";
        }

        buffer.EnsureAtLineStart();
        buffer.AppendLine($"## {heading}");

        var suggestions = infobox.NavSuggestions;
        if (suggestions.Any())
        {
            //render navigation items at top
            foreach (var suggestion in suggestions)
            {
                ContentRenderer.RenderNavSuggestion(buffer, suggestion);
            }
            //add a blank link, since nav suggestion can be long
            buffer.AppendLine();
        }

        foreach (var geoItem in infobox.GeoItems)
        {
            ContentRenderer.RenderGeo(buffer, geoItem);
        }

        foreach (var mediaItem in infobox.MediaItems)
        {
            ContentRenderer.RenderMedia(buffer, mediaItem as MediaItem);
        }

        buffer.EnsureAtLineStart();
        foreach (var contentItem in infobox.ContentItems)
        {
            buffer.Append(contentItem.Content);
        }
    }

    /// <summary>
    /// Renders a section and, recursively, its subsections.
    /// Returns an empty string when the section produced no content.
    /// </summary>
    public string RenderSection(Section section)
    {
        //every rendered section gets the next number; it is used in the section links URL
        sectionID++;

        SimpleBuffer buffer = new SimpleBuffer();
        if (section.HasNavSuggestions)
        {
            //render navigation items at top
            foreach (var suggestion in section.NavSuggestions)
            {
                ContentRenderer.RenderNavSuggestion(buffer, suggestion);
            }
            //add a blank link, since nav suggestion can be long
            buffer.AppendLine();
        }

        foreach (var geoItem in section.GeoItems)
        {
            ContentRenderer.RenderGeo(buffer, geoItem);
        }

        //other content below, in order
        foreach (SectionItem item in section.GeneralContent)
        {
            switch (item)
            {
                case MediaItem media:
                    ContentRenderer.RenderMedia(buffer, media);
                    break;

                case ContentItem content:
                    buffer.Append(content.Content);
                    break;
            }
        }

        foreach (var box in section.Infoboxes)
        {
            RenderInfobox(buffer, box);
        }

        if (section.Links.HasLinks && !ShouldExcludeSectionIndex(section))
        {
            buffer.EnsureAtLineStart();
            buffer.AppendLine($"=> 
{RouteOptions.ReferencesUrl(Page.Title, sectionID)} Section links: ({section.Links.Count} Articles)");
        }

        foreach (var subSection in section.SubSections)
        {
            buffer.Append(RenderSection(subSection));
        }

        //if a section has no content, don't write anything
        if (!buffer.HasContent)
        {
            return "";
        }

        if (!section.IsSpecial)
        {
            if (section.SectionDepth == 2)
            {
                buffer.PrependLine($"## {section.Title}");
            }
            else
            {
                //all other sections are at a level 3
                buffer.PrependLine($"### {section.Title}");
            }
        }
        return buffer.Content;
    }

    private bool ShouldExcludeSectionIndex(Section section)
        => articleLinkSections.Contains(section.Title?.ToLower());
}
--------------------------------------------------------------------------------
/Gemipedia/Converter/Special/Tables/TableParser.cs:
--------------------------------------------------------------------------------
using System;
using System.Collections.Generic;
using System.Globalization;
using System.Linq;
using System.Text.RegularExpressions;
using AngleSharp.Html.Dom;
using Gemipedia.Models;

namespace Gemipedia.Converter.Special.Tables;

/// <summary>
/// Walks an HTML table element and builds a <see cref="Table"/> model from it,
/// collecting the article links found in captions and cells.
/// </summary>
public class TableParser : IArticleLinks
{

    Row currRow;
    Table table;
    TextExtractor textExtractor;
    //used when adding row/colspans to fix mismatched tables
    int currRowWidth;

    public ArticleLinkCollection Links { get; private set; }

    public TableParser()
    {
        table = new Table();
        textExtractor = new TextExtractor
        {
            ShouldConvertImages = true,
            ShouldCollapseNewlines = true
        };
        Links = new ArticleLinkCollection();
    }

    /// <summary>
    /// Parses the table rooted at <paramref name="element"/> and returns the populated model.
    /// </summary>
    public Table ParseTable(HtmlElement element)
    {
        ParseChildren(element);
        //the last row is still pending, since rows are only appended when the next <tr> starts
        AppendRow();
        //go back and place any rowspan placeholder cells
        UpdateForRowSpans();
        return table;
    }

    private void ParseChildren(HtmlElement element)
    {
        //children that are not HtmlElements are skipped rather than failing the cast
        foreach (var child in element.Children.OfType<HtmlElement>().ToList())
        {
            ParseTag(child);
        }
    }

    private void ParseTag(HtmlElement current)
    {

        switch (current.NodeName.ToLower())
        {
            case "caption":
                textExtractor.Extract(current);
                table.Caption = textExtractor.Content;
                Links.Add(textExtractor);
                break;

            case "tr":
                {
                    //store the previous row (if any) before starting a new one
                    AppendRow();
                    currRow = new Row();
                    ParseChildren(current);
                    break;
                }

            case "td":
            case "th":
                AddCell(current);
                break;

            //pass through
            case "tbody":
            case "tfoot":
            case "thead":
                ParseChildren(current);
                break;
        }
    }

    //adds the current row to the table, unless there is none or it is empty
    private void AppendRow()
    {
        if (currRow != null && !currRow.IsEmpty)
        {
            table.Rows.Add(currRow);
        }
    }

    //cells found outside of a row (currRow == null) are ignored
    private void AddCell(HtmlElement cell)
    {
        if (currRow != null)
        {
            textExtractor.Extract(cell);
            string contents = textExtractor.Content;
            Links.Add(textExtractor);

            currRow.Cells.Add(new Cell
            {
                IsHeader = (cell.NodeName == "TH"),
                Contents = contents,
                ColSpan = ParseSpan(cell.GetAttribute("colspan")),
                RowSpan = ParseSpan(cell.GetAttribute("rowspan")),
                IsRowSpanHolder = false
            });
        }
    }

    //parse the value of a row or column span. Browsers are liberal about this:
    // "3;" works. Defaults to 1 if you can't parse anything
    private int ParseSpan(string attribValue)
    {
        if (attribValue == null)
        {
            return 1;
        }

        var match = Regex.Match(attribValue, @"^(\d+)");
        //TryParse also covers values too large for an int, without using exceptions
        if (match.Success &&
            int.TryParse(match.Groups[1].Value, NumberStyles.None, CultureInfo.InvariantCulture, out int value) &&
            value > 0)
        {
            //colspan and rowspan must be >= 1
            return value;
        }
        return 1;
    }

    //limits a colspan so the running width of the row being rebuilt does not exceed
    //table.MaxColumns; a cell always gets a span of at least 1
    private int RowWidthThrottle(int colSpan)
    {
        if (currRowWidth + colSpan <= table.MaxColumns)
        {
            currRowWidth += colSpan;
            return colSpan;
        }
        var newColspan = Math.Max((table.MaxColumns - currRowWidth), 1);
        currRowWidth += newColspan;
        return newColspan;
    }


    //rebuilds every row after the first: where the cell above spans multiple rows a
    //placeholder is inserted, otherwise cells are taken from the row in their original order
    private void UpdateForRowSpans()
    {
        for (int rowIndex = 1; rowIndex < table.Rows.Count; rowIndex++)
        {
            List<Cell> newRow = new List<Cell>();
            Queue<Cell> oldRow = new Queue<Cell>(table.Rows[rowIndex].Cells);
            Queue<Cell> prevRow = new Queue<Cell>(table.Rows[rowIndex - 1].Cells);
            currRowWidth = 0;
            while (prevRow.Count > 0)
            {
                var prevRowCell = prevRow.Dequeue();

                if (prevRowCell.RowSpan > 1)
                {
                    //push on a placeholder
                    newRow.Add(new Cell
                    {
                        IsRowSpanHolder = true,
                        RowSpan = prevRowCell.RowSpan - 1,
                        ColSpan = RowWidthThrottle(prevRowCell.ColSpan),
                        IsHeader = prevRowCell.IsHeader,
                    });
                }
                else
                {
                    for (int i = 0; i < prevRowCell.ColSpan; i++)
                    {
                        //pull cells from the current row until the colspan of the cell above is covered
                        if (oldRow.Count > 0)
                        {
                            var cell = oldRow.Dequeue();
                            cell.ColSpan = RowWidthThrottle(cell.ColSpan);
                            newRow.Add(cell);
                            //a cell spanning several columns covers several columns of the cell above
                            i += cell.ColSpan - 1;
                        }
                    }
                }
            }
            //There should not be anything left in oldRow. If so, the
            //number of cells in the source table were mismatched, so try
            //and handle that
            while (oldRow.Count > 0)
            {
                newRow.Add(oldRow.Dequeue());
            }
            table.Rows[rowIndex].Cells = newRow;
        }
    }

}
--------------------------------------------------------------------------------
/Gemipedia/Converter/Special/GeohackParser.cs:
--------------------------------------------------------------------------------
using System;
using System.Collections.Specialized;
using System.Text.RegularExpressions;
using System.Web;

namespace Gemipedia.Converter.Special;

/// <summary>
/// Parses the URLs used by Geohack.toolforge.org
/// </summary>
public class GeohackParser
{
    public string ArticleName { get; private set; }

    public bool IsEarth
        => (Globe.ToLower() == "earth");

    public bool IsValid { get; private set; }

    public string Globe { get; private set; }

    public string GeohackUrl { get; private set; }

    public string Language { get; private set; }

    public double Latitude { get; private set; }

    public double Longitude { get; private set; }

    public string Title { get; private set; }

    public string Type { get; private set; }

    public string Coordinates { get; private set; }

    public string GetPrettyName()
        => Title.Length > 0 ?
Title : ArticleName;

    public bool HasTypeDescription
        => GetTypeDescription().Length > 0;

    /// <summary>
    /// Human readable description of the "type" value found in the params,
    /// or an empty string when the type is missing or not recognized.
    /// </summary>
    public string GetTypeDescription()
    {
        switch (Type)
        {
            case "airport":
            case "city":
            case "country":
            case "event":
            case "forest":
            case "glacier":
            case "landmark":
            case "mountain":
            case "river":
            case "satellite":
            case "state":
                //these types are used as-is, with the first letter capitalized
                return Type.Substring(0, 1).ToUpperInvariant() + Type.Substring(1);

            case "edu":
                return "Educational Institute";

            case "railwaystation":
                return "Railway Station";

            case "adm1st":
            case "adm2nd":
            case "adm3rd":
                return "Municipality";

            case "waterbody":
                return "Body of water";

            default:
                return "";

        }
    }

    //degrees, optional minutes, optional seconds and a hemisphere letter, for latitude then longitude
    Regex DegreeMinuteSecondDirection = new Regex(@"([\d\.]+)_+(?:([\d\.]+)_)?(?:([\d\.]+)_+)?([NS])_([\d\.]+)_+(?:([\d\.]+)_+)?(?:([\d\.]+)_)?([EW])", RegexOptions.Compiled | RegexOptions.IgnoreCase);
    //a single (possibly signed) number and a hemisphere letter, for latitude then longitude
    Regex DegreeDirection = new Regex(@"([\-\.\d]+)_([NS])_([\-\.\d]+)_([EW])", RegexOptions.Compiled | RegexOptions.IgnoreCase);

    NameValueCollection QueryString;

    //value of the "params" query string parameter; null when the URL does not have one
    string ParamString => QueryString["params"];

    /// <summary>
    /// Parses a Geohack URL. Check <see cref="IsValid"/> to see if coordinates were found.
    /// </summary>
    /// <exception cref="ArgumentException">The URL is not a Geohack URL.</exception>
    public GeohackParser(string geohackUrl)
    {
        if (!GeoParser.IsGeohackUrl(geohackUrl))
        {
            throw new ArgumentException("Not a Geohack url");
        }

        Uri url = new Uri(geohackUrl);
        GeohackUrl = url.AbsoluteUri;

        QueryString = HttpUtility.ParseQueryString(url.Query);

        //without a "params" value there is nothing to match against; treat the URL as
        //invalid instead of handing null to the regular expressions
        bool hasParams = !string.IsNullOrEmpty(ParamString);

        IsValid = hasParams && ParseLatLon();
        ArticleName = ParseArticleName();
        Globe = (hasParams ? ExtractParam("globe") : null) ?? "earth";
        Language = QueryString["language"] ?? "en";
        Title = QueryString["title"] ?? "";
        Type = hasParams ? ExtractParam("type") : null;
    }

    //tries the degree/minute/second form first, then the plain decimal form
    private bool ParseLatLon()
    {
        if (DegreeMinuteSecondDirection.IsMatch(ParamString))
        {
            ParseDMSD(ParamString);
            return true;
        }
        if (DegreeDirection.IsMatch(ParamString))
        {
            ParseDD(ParamString);
            return true;
        }
        return false;
    }

    private string ParseArticleName()
        => QueryString["pagename"]?.Replace("_", " ") ?? "";

    //numeric value of an optional regex group; a group that did not match counts as 0
    private double NormalizeDMS(Group g)
    {
        var val = g.ToString();
        //the URL always uses '.' as decimal separator, so don't parse with the current culture
        return val.Length > 0
            ? Convert.ToDouble(val, System.Globalization.CultureInfo.InvariantCulture)
            : 0d;
    }

    private void ParseDMSD(string dms)
    {
        var match = DegreeMinuteSecondDirection.Match(dms);

        //DD = d + (min/60) + (sec/3600)
        Latitude = NormalizeDMS(match.Groups[1]) +
            NormalizeDMS(match.Groups[2]) / 60d +
            NormalizeDMS(match.Groups[3]) / 3600d;

        if (match.Groups[4].ToString().ToLower() == "s")
        {
            Latitude *= -1;
        }

        Longitude = NormalizeDMS(match.Groups[5]) +
            NormalizeDMS(match.Groups[6]) / 60d +
            NormalizeDMS(match.Groups[7]) / 3600d;

        if (match.Groups[8].ToString().ToLower() == "w")
        {
            Longitude *= -1;
        }

        Coordinates = string.Format("{0}{1}{2}{3} {4}{5}{6}{7}",
            FormatGroup(match.Groups[1], "°"),
            FormatGroup(match.Groups[2], "′"),
            FormatGroup(match.Groups[3], "″"),
            FormatGroup(match.Groups[4]),
            FormatGroup(match.Groups[5], "°"),
            FormatGroup(match.Groups[6], "′"),
            FormatGroup(match.Groups[7], "″"),
            FormatGroup(match.Groups[8]));
    }

    private string FormatGroup(Group g, string symbol = "")
    {
        var val = g.ToString();
        return val.Length > 0 ?
$"{val}{symbol}" : "";
    }

    private void ParseDD(string dd)
    {
        var match = DegreeDirection.Match(dd);

        //the values are taken as decimal degrees; a S or W hemisphere flips the sign.
        //the URL always uses '.' as decimal separator, so don't parse with the current culture
        Latitude = Convert.ToDouble(match.Groups[1].ToString(), System.Globalization.CultureInfo.InvariantCulture);

        if (match.Groups[2].ToString().ToLower() == "s")
        {
            Latitude *= -1;
        }

        Longitude = Convert.ToDouble(match.Groups[3].ToString(), System.Globalization.CultureInfo.InvariantCulture);
        if (match.Groups[4].ToString().ToLower() == "w")
        {
            Longitude *= -1;
        }
        Coordinates = string.Format("{0}°{1} {2}°{3}",
            match.Groups[1], match.Groups[2],
            match.Groups[3], match.Groups[4]);
    }

    //returns the alphanumeric value of a "name:value" pair in the params, or null if absent
    private string ExtractParam(string paramName)
    {
        if (ParamString == null)
        {
            return null;
        }

        var match = Regex.Match(ParamString, @$"_?{paramName}\:([a-zA-Z0-9]+)_?");
        if (match.Success && match.Groups.Count > 1)
        {
            return match.Groups[1].ToString();
        }
        return null;
    }
}
--------------------------------------------------------------------------------
/Gemipedia/Converter/Special/Tables/TableRenderer.cs:
--------------------------------------------------------------------------------
using System;
using System.Collections.Generic;
using System.Text;

namespace Gemipedia.Converter.Special.Tables;

/// <summary>
/// Renders a <see cref="Table"/> as preformatted text, drawn with '+', '-' and '|'.
/// </summary>
public class TableRenderer
{
    int ColumnWidth = 0;

    Table Table;
    StringBuilder buffer;

    private TableRenderer(Table table)
    {
        Table = table;
        buffer = new StringBuilder();
    }

    //requires FormatContents() to have been called first
    private string Render()
    {
        if (Table.HasCaption)
        {
            buffer.AppendLine($"### Table: {Table.Caption}");
        }
        buffer.AppendLine("```Table");
        buffer.AppendLine(GenerateDividerLine(Table.Rows[0], true));

        for (int i = 0; i < Table.Rows.Count; i++)
        {
            var row = Table.Rows[i];
            RenderRow(row);
            //are we on the last row?
            buffer.AppendLine(GenerateDividerLine(row, (i + 1) == Table.Rows.Count));
        }
        buffer.AppendLine("```");
        return buffer.ToString();
    }

    //writes one text line per formatted line of the row, with cells separated by '|'
    private void RenderRow(Row row)
    {
        for (int lineNum = 0, max = row.LineHeight; lineNum < max; lineNum++)
        {
            StringBuilder lineBuffer = new StringBuilder();
            for (int cellIndex = 0; cellIndex < row.Cells.Count; cellIndex++)
            {
                //leading edge
                if (cellIndex == 0)
                {
                    lineBuffer.Append("|");
                }
                lineBuffer.Append(row.Cells[cellIndex].FormattedLines[lineNum]);
                lineBuffer.Append("|");
            }
            buffer.AppendLine(lineBuffer.ToString());
        }
    }

    private string GenerateDividerLine(Row row, bool IsEdge = false)
    {
        StringBuilder sb = new StringBuilder();
        sb.Append('+');
        for (int i = 0; i < row.Cells.Count; i++)
        {
            //do we need to leave it open or draw a horizontal line?
            //for the top/bottom edges, we always draw the line
            var cell = row.Cells[i];
            if (!IsEdge && cell.RowSpan > 1)
            {
                sb.Append(new string(' ', cell.FormattedWidth));
            }
            else
            {
                sb.Append(new string('-', cell.FormattedWidth));
            }
            //do we need to add some extra for the cells we skipped?
            sb.Append('+');
        }
        return sb.ToString();
    }

    //wraps the contents of every cell and pads all cells of a row to the same number of lines
    private void FormatContents()
    {
        //split 60 characters over the columns, but never go below 15 per column
        ColumnWidth = Math.Max((60 / Table.MaxColumns), 15);

        foreach (var row in Table.Rows)
        {

            foreach (var cell in row.Cells)
            {
                cell.FormattedLines = FormatCell(cell, ColumnWidth);
            }

            int maxHeight = row.LineHeight;

            foreach (var cell in row.Cells)
            {
                VerticalPad(cell, maxHeight, ColumnWidth);
            }
        }
    }

    //word-wraps the contents of a cell into lines that are padded to the width of the cell
    private List<string> FormatCell(Cell cell, int columnWidth)
    {
        //is this a rowspan placeholder?
        if (cell.IsRowSpanHolder)
        {
            return FormatPlaceholder(cell, columnWidth);
        }

        var input = cell.IsHeader ? cell.Contents.ToUpper() : cell.Contents;
        //a cell spanning columns also gets the width of the separators it replaces
        int maxWidth = (columnWidth * cell.ColSpan) + (cell.ColSpan - 1);

        List<string> lines = new List<string>();

        string[] words = input.Split(' ');

        string line = "";
        int lineLength = 0;
        foreach (string word in words)
        {

            int wordLength = UnicodeString.GetWidth(word);
            //do we have extra-wide characters?
            bool hasWideCharacters = (wordLength != word.Length);
            //can the word fit?
            if (wordLength > maxWidth)
            {
                //nope, we are going to need to hard slice this word to fit to the width
                //this is complex if we have wide characters

                //Step 1: flush anything still in the buffer
                if (lineLength > 0)
                {
                    lines.Add(PadCell(line.Trim(), maxWidth, cell.IsHeader));
                    line = "";
                    lineLength = 0;
                }

                //step 2: determine the amount of characters to use in each hard slice
                int substringLength = maxWidth;
                if (hasWideCharacters && word.Length < maxWidth)
                {
                    //if we have wide characters, we need to do a smaller
                    substringLength = word.Length / 2;
                }

                int start = 0;
                while (start < word.Length)
                {
                    lines.Add(PadCell(word.Substring(start, Math.Min(substringLength, word.Length - start)), maxWidth, cell.IsHeader));
                    start += substringLength;
                }
                continue;
            }
            //will the buffer be too big? if so, flush it
            if ((lineLength + wordLength) > maxWidth)
            {
                lines.Add(PadCell(line.Trim(), maxWidth, cell.IsHeader));
                line = "";
                lineLength = 0;
            }
            line += word;
            lineLength += wordLength;
            if (wordLength + 1 <= maxWidth)
            {
                line += " ";
                lineLength += 1;
            }
        }
        //flush any remaining in buffer
        if (lineLength > 0)
        {
            lines.Add(PadCell(line.Trim(), maxWidth, cell.IsHeader));
        }
        return lines;
    }

    //a rowspan placeholder is a single blank line as wide as the cell
    private List<string> FormatPlaceholder(Cell cell, int columnWidth)
    {
        int maxWidth = (columnWidth * cell.ColSpan) + (cell.ColSpan - 1);
        return new List<string> { new string(' ', maxWidth) };
    }

    //pads s with spaces to the given display width. Centered text gets the padding on
    //both sides, with the odd extra space on the left; otherwise it all goes on the right
    private string PadCell(string s, int length, bool center)
    {
        int padding = length - UnicodeString.GetWidth(s);
        if (padding <= 0)
        {
            //already wide enough (or too wide), nothing to add
            return s;
        }

        int left = center ? (padding + 1) / 2 : 0;
        return new string(' ', left) + s + new string(' ', padding - left);
    }

    //adds blank lines to a cell until it has as many lines as the tallest cell in its row
    private void VerticalPad(Cell cell, int lines, int width)
    {
        int maxWidth = (width * cell.ColSpan) + (cell.ColSpan - 1);
        for (; cell.FormattedLines.Count < lines;)
        {
            cell.FormattedLines.Add(new string(' ', maxWidth));
        }
    }

    /// <summary>
    /// Renders the table as text. Returns an empty string for an empty table.
    /// </summary>
    public static string RenderTable(Table Table)
    {
        if (Table.IsEmpty)
        {
            return "";
        }
        var renderer = new TableRenderer(Table);
        renderer.FormatContents();
        return renderer.Render();
    }
}
--------------------------------------------------------------------------------
/Gemipedia/Converter/Sectionizer.cs:
--------------------------------------------------------------------------------
using System;
using System.Collections.Generic;
using System.Linq;
using
System.Xml.Linq;
using AngleSharp.Dom;
using AngleSharp.Html.Dom;
using Gemipedia.Models;

namespace Gemipedia.Converter;

/// <summary>
/// Constructs a tree of sections and subsections for the Wiki content
/// </summary>
public class Sectionizer
{
    //sections that are still open; depths increase from the bottom to the top of the stack
    Stack<Section> SectionStack;

    ParsedPage ParsedPage;
    //grab once and cache
    string[] excludedSections = UserOptions.ExcludedSections;

    /// <summary>
    /// Splits the child nodes of <paramref name="contentRoot"/> into sections, using the
    /// headings as boundaries. Content before the first heading goes into a special section.
    /// </summary>
    public ParsedPage ParseContent(string title, INode contentRoot)
    {
        ParsedPage = new ParsedPage
        {
            Title = title
        };

        SectionStack = new Stack<Section>();

        SectionStack.Push(new Section
        {
            IsSpecial = true,
            SectionDepth = 2,
        });

        var nodeList = contentRoot.ChildNodes.ToArray();

        for (int currIndex = 0, len = nodeList.Length; currIndex < len; currIndex++)
        {
            INode currNode = nodeList[currIndex];

            HeadingInfo? headingInfo = GetIfHeading(currNode);

            //is it a normal node
            if (headingInfo == null)
            {
                if (ShouldAddNode(currNode))
                {
                    SectionStack.Peek().Nodes.Add(currNode);
                }
                continue;
            }

            //we are supposed to skip this?
            if (ShouldSkipSection(headingInfo))
            {
                currIndex = FastForward(currNode, nodeList, currIndex);
                continue;
            }

            //normalize to H2
            if (headingInfo.Level < 2)
            {
                headingInfo.Level = 2;
            }

            //close every open section at the same or a deeper level. A section at a
            //shallower level stays open and becomes the parent of the new section.
            //Checking the count keeps this safe when heading levels were skipped
            //(e.g. H2, H4, H3, H2)
            while (SectionStack.Count > 0 && SectionStack.Peek().SectionDepth >= headingInfo.Level)
            {
                AddCompletedSection(SectionStack.Pop());
            }
            PushNewSection(headingInfo);
        }
        //combine remain stack
        while (SectionStack.Count > 0)
        {
            AddCompletedSection(SectionStack.Pop());
        }
        return ParsedPage;
    }

    private void AddCompletedSection(Section section)
    {
        //if there is still something on the stack, add it as a subsection
        if (SectionStack.Count > 0)
        {
            SectionStack.Peek().SubSections.Add(section);
        }
        else
        {
            ParsedPage.Sections.Add(section);
        }
    }

    //true for element names of the form H + digit (H1, H2, ...)
    private static bool IsHeadingTag(IElement element)
        => element.NodeName.Length == 2 &&
            element.NodeName[0] == 'H' &&
            char.IsDigit(element.NodeName[1]);

    //returns the heading described by the node, or null when the node is not a heading
    private HeadingInfo? GetIfHeading(INode node)
    {

        if (node is not HtmlElement htmlElement)
        {
            return null;
        }

        if (IsHeadingTag(htmlElement))
        {
            //traditional HTML used for a heading: id and title are on a span.mw-headline.
            //Fall back to the heading tag itself when that span is missing
            IElement headline = htmlElement.QuerySelector("span.mw-headline") ?? htmlElement;
            return new HeadingInfo
            {
                ID = headline.GetAttribute("id")?.ToLower() ?? "",
                Level = htmlElement.NodeName[1] - '0',
                Title = headline.TextContent.Trim().Replace("\n", "")
            };
        }
        //2024-07-21 : Sometime recently MediaWiki started output HTML with the header tags
        //wrapped in DIVs
        //TODO: I really should junk all this and operate on the WikiText directly...
        else if (htmlElement.NodeName == "DIV" &&
            htmlElement.ClassName != null &&
            htmlElement.ClassName.Contains("mw-heading") &&
            htmlElement.FirstElementChild != null &&
            IsHeadingTag(htmlElement.FirstElementChild))
        {
            //modern header
            var heading = htmlElement.FirstElementChild;

            return new HeadingInfo
            {
                ID = heading.GetAttribute("id")?.ToLower() ?? "",
                Level = heading.NodeName[1] - '0',
                Title = heading.TextContent.Trim().Replace("\n", "")
            };
        }
        return null;
    }

    private void PushNewSection(HeadingInfo headingInfo)
        => SectionStack.Push(new Section
        {
            Title = headingInfo.Title,
            SectionDepth = headingInfo.Level
        });


    //elements are always kept, text nodes only when not blank, everything else is dropped
    private bool ShouldAddNode(INode node)
    {
        switch (node.NodeType)
        {
            case NodeType.Text:
                if (node.TextContent.Trim().Length == 0)
                {
                    return false;
                }
                return true;

            case NodeType.Element:
                return true;

            default:
                return false;
        }
    }

    private bool ShouldSkipSection(HeadingInfo headingInfo)
        => excludedSections.Contains(headingInfo.ID);

    /// <summary>
    /// Fast forwards past a skipped section: up to the next heading at the same or a
    /// shallower level, so that subsections of the skipped section are skipped too.
    /// </summary>
    /// <param name="element">The heading node of the section being skipped.</param>
    /// <param name="nodeList">All nodes being sectionized.</param>
    /// <param name="currentIndex">Index of <paramref name="element"/> in the list.</param>
    /// <returns>The index of the last skipped node; the caller's loop increment moves on from there.</returns>
    private int FastForward(INode element, INode[] nodeList, int currentIndex)
    {
        //compare heading levels rather than node names: headings wrapped in a DIV
        //would otherwise match the next DIV of any kind
        HeadingInfo? skipped = GetIfHeading(element);

        int skipIndex = currentIndex + 1;
        //fast forward until we get to the next section
        for (; skipIndex < nodeList.Length; skipIndex++)
        {
            INode candidate = nodeList[skipIndex];
            if (skipped != null)
            {
                HeadingInfo? next = GetIfHeading(candidate);
                if (next != null && next.Level <= skipped.Level)
                {
                    break;
                }
            }
            else if (candidate.NodeType == element.NodeType && candidate.NodeName == element.NodeName)
            {
                //not a heading: stop at the next node of the same kind
                break;
            }
        }
        return skipIndex - 1;
    }
}

internal class HeadingInfo
{
public string Title { get; set; } 217 | public string ID { get; set; } 218 | public int Level { get; set; } 219 | } -------------------------------------------------------------------------------- /Gemipedia/Converter/Special/MediaParser.cs: -------------------------------------------------------------------------------- 1 | using System.Collections.Generic; 2 | using System.Linq; 3 | using AngleSharp.Dom; 4 | using Gemipedia.Models; 5 | 6 | namespace Gemipedia.Converter.Special; 7 | 8 | /// 9 | /// Converts the various image widgets 10 | /// 11 | public static class MediaParser 12 | { 13 | static int montageNumber = 1; 14 | static int galleryNumber = 1; 15 | 16 | static TextExtractor textExtractor = new TextExtractor 17 | { 18 | ShouldCollapseNewlines = true 19 | }; 20 | 21 | public static MediaItem ConvertMedia(IElement imageContainer, IElement captionContainer) 22 | => IsVideo(imageContainer) ? 23 | ConvertVideo(imageContainer, captionContainer) : 24 | ConvertImage(imageContainer, captionContainer); 25 | 26 | public static MediaItem ConvertTimelineInTable(IElement element) 27 | { 28 | var timeline = element.QuerySelector("div.timeline-wrapper"); 29 | if (timeline != null) 30 | { 31 | //attempt to get a meaningful title for the timeline from the first cell 32 | textExtractor.Extract(element.QuerySelector("th"), element.QuerySelector("td")); 33 | 34 | return ConvertTimeline(timeline, textExtractor); 35 | } 36 | return null; 37 | } 38 | 39 | public static MediaItem ConvertTimeline(IElement timelineWrapper, ITextContent textContent = null) 40 | { 41 | var img = timelineWrapper.QuerySelector("img[usemap]"); 42 | var title = (textContent != null) ? 
$"Timeline Image: {textContent.Content}" : "Timeline Image"; 43 | 44 | if (img != null) 45 | { 46 | var media = new MediaItem 47 | { 48 | Url = RouteOptions.MediaProxyUrl(CommonUtils.GetImageUrl(img)), 49 | Caption = title 50 | }; 51 | //add anything from 52 | if (textContent != null) 53 | { 54 | media.Links.Add(textContent.Links); 55 | } 56 | //try and add links from any areas to it 57 | timelineWrapper.QuerySelectorAll("map area") 58 | .ToList().ForEach(x => media.Links.Add(x)); 59 | 60 | return media; 61 | 62 | } 63 | return null; 64 | } 65 | 66 | public static IEnumerable ConvertGallery(IElement gallery) 67 | { 68 | List ret = new List(); 69 | int imageNumber = 0; 70 | foreach (var galleryItem in gallery.QuerySelectorAll("li.gallerybox")) 71 | { 72 | imageNumber++; 73 | var media = ConvertImage(galleryItem, galleryItem.QuerySelector(".gallerytext")); 74 | if (media != null) 75 | { 76 | //prefix it 77 | media.Caption = $"Gallery {galleryNumber}, Image {imageNumber}: {media.Caption}"; 78 | ret.Add(media); 79 | } 80 | } 81 | galleryNumber++; 82 | return ret; 83 | } 84 | 85 | private static MediaItem ConvertImage(IElement imageContainer, IElement? captionContainer, string defaultText = "Article Image") 86 | { 87 | //some image holders can contain graphs, charts, etc. 
So escape if you don't find an img 88 | var imgTag = imageContainer.QuerySelector("img"); 89 | if (imgTag == null) 90 | { 91 | return null; 92 | } 93 | var url = CommonUtils.GetImageUrl(imgTag); 94 | if (url == null) 95 | { 96 | return null; 97 | } 98 | 99 | var description = GetImageDescrption(imageContainer, captionContainer, defaultText); 100 | var media = new MediaItem 101 | { 102 | Links = textExtractor.Links, 103 | Caption = description, 104 | Url = RouteOptions.MediaProxyUrl(url), 105 | }; 106 | //if this is an image map, extract those links too 107 | if (imgTag.HasAttribute("usemap")) 108 | { 109 | //look for any maps 110 | //try and add links from any areas to it 111 | imageContainer.QuerySelectorAll("map area") 112 | .ToList().ForEach(x => media.Links.Add(x)); 113 | } 114 | return media; 115 | } 116 | 117 | public static IEnumerable ConvertMontage(IElement tmulti, IElement? captionContainer = null) 118 | { 119 | List ret = new List(); 120 | 121 | int imageNumber = 0; 122 | foreach (var thumb in tmulti.QuerySelectorAll(".thumbimage")) 123 | { 124 | imageNumber++; 125 | var media = ConvertImage(thumb, captionContainer); 126 | if (media != null) 127 | { 128 | //prefix it 129 | media.Caption = $"Montage {montageNumber}, Image {imageNumber}: {media.Caption}"; 130 | ret.Add(media); 131 | } 132 | } 133 | montageNumber++; 134 | return ret; 135 | } 136 | 137 | private static MediaItem ConvertVideo(IElement imageContainer, IElement captionContainer) 138 | { 139 | var videoElement = ParseVideo(imageContainer); 140 | 141 | string imageUrl = GetPosterUrl(videoElement); 142 | string videoUrl = GetVideoUrl(videoElement); 143 | if (imageUrl == null || videoUrl == null) 144 | { 145 | return null; 146 | } 147 | 148 | var description = GetImageDescrption(imageContainer, captionContainer); 149 | 150 | return new VideoItem 151 | { 152 | Links = textExtractor.Links, 153 | Caption = description, 154 | Url = RouteOptions.MediaProxyUrl(imageUrl), 155 | VideoUrl = videoUrl, 156 | 
VideoDescription = GetVideoDescription(videoElement) 157 | }; 158 | } 159 | 160 | private static IElement ParseVideo(IElement imageContainer) 161 | => imageContainer.QuerySelector("video"); 162 | 163 | private static bool IsVideo(IElement imageContainer) 164 | => (imageContainer.QuerySelector("video") != null); 165 | 166 | private static string GetPosterUrl(IElement videoElement) 167 | => CommonUtils.EnsureHttps(videoElement?.GetAttribute("poster") ?? null); 168 | 169 | private static string GetVideoUrl(IElement videoElement) 170 | => CommonUtils.EnsureHttps(videoElement?.QuerySelector("source").GetAttribute("src") ?? null); 171 | 172 | private static string GetVideoDescription(IElement videoElement) 173 | => "🎦 " + (videoElement?.QuerySelector("source").GetAttribute("data-title") ?? "Video File"); 174 | 175 | /// 176 | /// Attempts to get an image's description using a caption element, alt text, or a default string 177 | /// 178 | /// 179 | /// 180 | /// 181 | /// 182 | private static string GetImageDescrption(IElement imageContainer, IElement? captionContainer, string defaultText = "Article Image") 183 | { 184 | if (captionContainer != null) 185 | { 186 | //first see if there is a caption 187 | textExtractor.Extract(captionContainer); 188 | string text = textExtractor.Content; 189 | if (text.Length > 0) 190 | { 191 | return text; 192 | } 193 | } 194 | //fall back to the ALT text 195 | string description = GetImageAlt(imageContainer); 196 | return (description.Length > 0) ? description : defaultText; 197 | } 198 | 199 | private static string GetImageAlt(IElement element) 200 | => StripImageExtensions(element.QuerySelector("img")?.GetAttribute("alt") ?? 
""); 201 | 202 | //For some alt text, sometimes the filename is used, so strip off any trailing extension to improve readability 203 | private static string StripImageExtensions(string alt) 204 | { 205 | alt = StripExtension(alt, "jpeg"); 206 | alt = StripExtension(alt, "jpg"); 207 | alt = StripExtension(alt, "png"); 208 | alt = StripExtension(alt, "gif"); 209 | alt = StripExtension(alt, "svg"); 210 | return alt; 211 | } 212 | 213 | private static string StripExtension(string alt, string ext) 214 | => (alt.Length > (ext.Length) + 1 && 215 | alt.EndsWith($".{ext}")) ? alt.Substring(0, alt.Length - (ext.Length) - 1) : alt; 216 | } -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # globs 2 | Makefile.in 3 | *.userprefs 4 | *.usertasks 5 | config.make 6 | config.status 7 | aclocal.m4 8 | install-sh 9 | autom4te.cache/ 10 | *.tar.gz 11 | tarballs/ 12 | test-results/ 13 | 14 | # Mac bundle stuff 15 | *.dmg 16 | *.app 17 | 18 | # content below from: https://github.com/github/gitignore/blob/main/Global/macOS.gitignore 19 | # General 20 | .DS_Store 21 | .AppleDouble 22 | .LSOverride 23 | 24 | # Icon must end with two \r 25 | Icon 26 | 27 | 28 | # Thumbnails 29 | ._* 30 | 31 | # Files that might appear in the root of a volume 32 | .DocumentRevisions-V100 33 | .fseventsd 34 | .Spotlight-V100 35 | .TemporaryItems 36 | .Trashes 37 | .VolumeIcon.icns 38 | .com.apple.timemachine.donotpresent 39 | 40 | # Directories potentially created on remote AFP share 41 | .AppleDB 42 | .AppleDesktop 43 | Network Trash Folder 44 | Temporary Items 45 | .apdisk 46 | 47 | # content below from: https://github.com/github/gitignore/blob/main/Global/Windows.gitignore 48 | # Windows thumbnail cache files 49 | Thumbs.db 50 | ehthumbs.db 51 | ehthumbs_vista.db 52 | 53 | # Dump file 54 | *.stackdump 55 | 56 | # Folder config file 57 | [Dd]esktop.ini 58 | 59 | # Recycle Bin 
used on file shares 60 | $RECYCLE.BIN/ 61 | 62 | # Windows Installer files 63 | *.cab 64 | *.msi 65 | *.msix 66 | *.msm 67 | *.msp 68 | 69 | # Windows shortcuts 70 | *.lnk 71 | 72 | # content below from: https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 73 | ## Ignore Visual Studio temporary files, build results, and 74 | ## files generated by popular Visual Studio add-ons. 75 | ## 76 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 77 | 78 | # User-specific files 79 | *.suo 80 | *.user 81 | *.userosscache 82 | *.sln.docstates 83 | 84 | # User-specific files (MonoDevelop/Xamarin Studio) 85 | *.userprefs 86 | 87 | # Build results 88 | [Dd]ebug/ 89 | [Dd]ebugPublic/ 90 | [Rr]elease/ 91 | [Rr]eleases/ 92 | x64/ 93 | x86/ 94 | bld/ 95 | [Bb]in/ 96 | [Oo]bj/ 97 | [Ll]og/ 98 | 99 | # Visual Studio 2015/2017 cache/options directory 100 | .vs/ 101 | # Uncomment if you have tasks that create the project's static files in wwwroot 102 | #wwwroot/ 103 | 104 | # Visual Studio 2017 auto generated files 105 | Generated\ Files/ 106 | 107 | # MSTest test Results 108 | [Tt]est[Rr]esult*/ 109 | [Bb]uild[Ll]og.* 110 | 111 | # NUNIT 112 | *.VisualState.xml 113 | TestResult.xml 114 | 115 | # Build Results of an ATL Project 116 | [Dd]ebugPS/ 117 | [Rr]eleasePS/ 118 | dlldata.c 119 | 120 | # Benchmark Results 121 | BenchmarkDotNet.Artifacts/ 122 | 123 | # .NET Core 124 | project.lock.json 125 | project.fragment.lock.json 126 | artifacts/ 127 | 128 | # StyleCop 129 | StyleCopReport.xml 130 | 131 | # Files built by Visual Studio 132 | *_i.c 133 | *_p.c 134 | *_h.h 135 | *.ilk 136 | *.meta 137 | *.obj 138 | *.iobj 139 | *.pch 140 | *.pdb 141 | *.ipdb 142 | *.pgc 143 | *.pgd 144 | *.rsp 145 | *.sbr 146 | *.tlb 147 | *.tli 148 | *.tlh 149 | *.tmp 150 | *.tmp_proj 151 | *_wpftmp.csproj 152 | *.log 153 | *.vspscc 154 | *.vssscc 155 | .builds 156 | *.pidb 157 | *.svclog 158 | *.scc 159 | 160 | # Chutzpah Test files 161 | _Chutzpah* 
162 | 163 | # Visual C++ cache files 164 | ipch/ 165 | *.aps 166 | *.ncb 167 | *.opendb 168 | *.opensdf 169 | *.sdf 170 | *.cachefile 171 | *.VC.db 172 | *.VC.VC.opendb 173 | 174 | # Visual Studio profiler 175 | *.psess 176 | *.vsp 177 | *.vspx 178 | *.sap 179 | 180 | # Visual Studio Trace Files 181 | *.e2e 182 | 183 | # TFS 2012 Local Workspace 184 | $tf/ 185 | 186 | # Guidance Automation Toolkit 187 | *.gpState 188 | 189 | # ReSharper is a .NET coding add-in 190 | _ReSharper*/ 191 | *.[Rr]e[Ss]harper 192 | *.DotSettings.user 193 | 194 | # JustCode is a .NET coding add-in 195 | .JustCode 196 | 197 | # TeamCity is a build add-in 198 | _TeamCity* 199 | 200 | # DotCover is a Code Coverage Tool 201 | *.dotCover 202 | 203 | # AxoCover is a Code Coverage Tool 204 | .axoCover/* 205 | !.axoCover/settings.json 206 | 207 | # Visual Studio code coverage results 208 | *.coverage 209 | *.coveragexml 210 | 211 | # NCrunch 212 | _NCrunch_* 213 | .*crunch*.local.xml 214 | nCrunchTemp_* 215 | 216 | # MightyMoose 217 | *.mm.* 218 | AutoTest.Net/ 219 | 220 | # Web workbench (sass) 221 | .sass-cache/ 222 | 223 | # Installshield output folder 224 | [Ee]xpress/ 225 | 226 | # DocProject is a documentation generator add-in 227 | DocProject/buildhelp/ 228 | DocProject/Help/*.HxT 229 | DocProject/Help/*.HxC 230 | DocProject/Help/*.hhc 231 | DocProject/Help/*.hhk 232 | DocProject/Help/*.hhp 233 | DocProject/Help/Html2 234 | DocProject/Help/html 235 | 236 | # Click-Once directory 237 | publish/ 238 | 239 | # Publish Web Output 240 | *.[Pp]ublish.xml 241 | *.azurePubxml 242 | # Note: Comment the next line if you want to checkin your web deploy settings, 243 | # but database connection strings (with potential passwords) will be unencrypted 244 | *.pubxml 245 | *.publishproj 246 | 247 | # Microsoft Azure Web App publish settings. 
Comment the next line if you want to 248 | # checkin your Azure Web App publish settings, but sensitive information contained 249 | # in these scripts will be unencrypted 250 | PublishScripts/ 251 | 252 | # NuGet Packages 253 | *.nupkg 254 | # The packages folder can be ignored because of Package Restore 255 | **/[Pp]ackages/* 256 | # except build/, which is used as an MSBuild target. 257 | !**/[Pp]ackages/build/ 258 | # Uncomment if necessary however generally it will be regenerated when needed 259 | #!**/[Pp]ackages/repositories.config 260 | # NuGet v3's project.json files produces more ignorable files 261 | *.nuget.props 262 | *.nuget.targets 263 | 264 | # Microsoft Azure Build Output 265 | csx/ 266 | *.build.csdef 267 | 268 | # Microsoft Azure Emulator 269 | ecf/ 270 | rcf/ 271 | 272 | # Windows Store app package directories and files 273 | AppPackages/ 274 | BundleArtifacts/ 275 | Package.StoreAssociation.xml 276 | _pkginfo.txt 277 | *.appx 278 | 279 | # Visual Studio cache files 280 | # files ending in .cache can be ignored 281 | *.[Cc]ache 282 | # but keep track of directories ending in .cache 283 | !*.[Cc]ache/ 284 | 285 | # Others 286 | ClientBin/ 287 | ~$* 288 | *~ 289 | *.dbmdl 290 | *.dbproj.schemaview 291 | *.jfm 292 | *.pfx 293 | *.publishsettings 294 | orleans.codegen.cs 295 | 296 | # Including strong name files can present a security risk 297 | # (https://github.com/github/gitignore/pull/2483#issue-259490424) 298 | #*.snk 299 | 300 | # Since there are multiple workflows, uncomment next line to ignore bower_components 301 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 302 | #bower_components/ 303 | 304 | # RIA/Silverlight projects 305 | Generated_Code/ 306 | 307 | # Backup & report files from converting an old project file 308 | # to a newer Visual Studio version. 
Backup files are not needed, 309 | # because we have git ;-) 310 | _UpgradeReport_Files/ 311 | Backup*/ 312 | UpgradeLog*.XML 313 | UpgradeLog*.htm 314 | ServiceFabricBackup/ 315 | *.rptproj.bak 316 | 317 | # SQL Server files 318 | *.mdf 319 | *.ldf 320 | *.ndf 321 | 322 | # Business Intelligence projects 323 | *.rdl.data 324 | *.bim.layout 325 | *.bim_*.settings 326 | *.rptproj.rsuser 327 | 328 | # Microsoft Fakes 329 | FakesAssemblies/ 330 | 331 | # GhostDoc plugin setting file 332 | *.GhostDoc.xml 333 | 334 | # Node.js Tools for Visual Studio 335 | .ntvs_analysis.dat 336 | node_modules/ 337 | 338 | # Visual Studio 6 build log 339 | *.plg 340 | 341 | # Visual Studio 6 workspace options file 342 | *.opt 343 | 344 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 345 | *.vbw 346 | 347 | # Visual Studio LightSwitch build output 348 | **/*.HTMLClient/GeneratedArtifacts 349 | **/*.DesktopClient/GeneratedArtifacts 350 | **/*.DesktopClient/ModelManifest.xml 351 | **/*.Server/GeneratedArtifacts 352 | **/*.Server/ModelManifest.xml 353 | _Pvt_Extensions 354 | 355 | # Paket dependency manager 356 | .paket/paket.exe 357 | paket-files/ 358 | 359 | # FAKE - F# Make 360 | .fake/ 361 | 362 | # JetBrains Rider 363 | .idea/ 364 | *.sln.iml 365 | 366 | # CodeRush personal settings 367 | .cr/personal 368 | 369 | # Python Tools for Visual Studio (PTVS) 370 | __pycache__/ 371 | *.pyc 372 | 373 | # Cake - Uncomment if you are using it 374 | # tools/** 375 | # !tools/packages.config 376 | 377 | # Tabs Studio 378 | *.tss 379 | 380 | # Telerik's JustMock configuration file 381 | *.jmconfig 382 | 383 | # BizTalk build output 384 | *.btp.cs 385 | *.btm.cs 386 | *.odx.cs 387 | *.xsd.cs 388 | 389 | # OpenCover UI analysis results 390 | OpenCover/ 391 | 392 | # Azure Stream Analytics local run output 393 | ASALocalRun/ 394 | 395 | # MSBuild Binary and Structured Log 396 | *.binlog 397 | 398 | # NVidia Nsight GPU debugger configuration file 399 | *.nvuser 
400 | 401 | # MFractors (Xamarin productivity tool) working folder 402 | .mfractor/ 403 | 404 | # Local History for Visual Studio 405 | .localhistory/ -------------------------------------------------------------------------------- /Gemipedia/Converter/Special/InfoboxParser.cs: --------------------------------------------------------------------------------
using System;
using System.Collections.Generic;
using System.Linq;
using AngleSharp.Dom;
using AngleSharp.Html.Dom;
using Gemipedia.Models;

namespace Gemipedia.Converter.Special;

/// <summary>
/// Turns an infobox table into an InfoboxItem by classifying each row
/// (header, nested table, media, wide value, two cells) and converting it.
/// The instance accumulates state in its fields while parsing.
/// </summary>
public class InfoboxParser
{
    InfoboxItem infobox = new InfoboxItem();
    Buffer buffer = new Buffer();
    //set when the first row was consumed as the infobox title
    bool shouldSkipFirst = false;

    TextExtractor textExtractor = new TextExtractor
    {
        ShouldCollapseNewlines = true
    };

    /// <summary>
    /// Parses the rows of the table's tbody into the infobox.
    /// </summary>
    /// <param name="table">The infobox table element.</param>
    /// <returns>The populated infobox, or null when the table has no tbody.</returns>
    public InfoboxItem Parse(HtmlElement table)
    {
        var tableBodyRows = table.QuerySelector("tbody")?.Children;
        if (tableBodyRows == null)
        {
            return null;
        }

        infobox.CustomTitle = ExtractTitle(table, tableBodyRows);
        ParseTableRows(tableBodyRows);

        return infobox;
    }

    /// <summary>
    /// True when the table's class attribute contains "infobox".
    /// </summary>
    public static bool IsInfobox(HtmlElement table)
        //fr wikipedia uses infobox_v2
        => table.ClassName?.Contains("infobox") ?? false;

    //=====================================================================

    //emits the row's text as a "### " heading line, carrying its links along
    private void AddHeader(IElement row)
    {
        buffer.Reset();
        textExtractor.Extract(row);
        buffer.EnsureAtLineStart();
        buffer.AppendLine($"### {textExtractor.Content}");
        buffer.Links.Add(textExtractor);
        infobox.AddItem(new ContentItem(buffer));
    }

    //adds the row's montage, image or video to the infobox
    private void AddMedia(IElement row)
    {
        //check for a montage
        var multi = row.QuerySelector("div.thumb.tmulti");
        if (multi != null)
        {
            infobox.AddItems(MediaParser.ConvertMontage(multi));
            return;
        }

        var cell = row.Children[0];
        var imgContainer = cell.Children[0];
        var captionContainer = (cell.ChildElementCount >= 2) ? cell.Children[1] : null;

        //ConvertMedia yields null when no usable image/video URL exists; skip those
        var media = MediaParser.ConvertMedia(imgContainer, captionContainer);
        if (media != null)
        {
            infobox.AddItem(media);
        }
    }

    //a two-cell row is a name/value pair unless it is a non-TH side-by-side comparison
    private void AddTwoCells(IElement left, IElement right)
    {
        bool leftIsHeaderCell = string.Equals(left.NodeName, "th", StringComparison.OrdinalIgnoreCase);
        if (!leftIsHeaderCell && IsComparingRow(left, right))
        {
            AddTwoRichCells(left, right);
        }
        else
        {
            AddNameValue(left, right);
        }
    }

    //emits a fixed label followed by the already-parsed cell content
    private void AddRichCell(string label, RichContent content)
    {
        if (content.NoContent)
        {
            //just a placeholder label
            infobox.AddItem(new ContentItem
            {
                Content = label + ":\n"
            });
            return;
        }

        //Should the label and content be on the same line or not?
        var labelSuffix = content.IsSingleLine ? ": " : ":\n";
        infobox.AddItem(new ContentItem
        {
            //NOTE(review): the extractor is not run for these fixed labels, so these
            //appear to be links left over from an earlier extraction - confirm intent
            Links = textExtractor.Links,
            Content = label + labelSuffix
        });

        infobox.AddItems(content.Items);
    }

    //renders a comparison row as "[Left Column]" and "[Right Column]" entries
    private void AddTwoRichCells(IElement leftCell, IElement rightCell)
    {
        AddRichCell("[Left Column]", ParseRichCell(leftCell));
        AddRichCell("[Right Column]", ParseRichCell(rightCell));
    }

    //renders "Label: value"; an empty label emits only the value
    private void AddNameValue(IElement nameCell, IElement valueCell)
    {
        //step 1, extract out the name
        textExtractor.Extract(nameCell);
        var label = CleanLabel(textExtractor.Content);

        var valueContent = ParseRichCell(valueCell);

        if (label.Length > 0)
        {
            if (valueContent.NoContent)
            {
                //just a placeholder label
                infobox.AddItem(new ContentItem
                {
                    Links = textExtractor.Links,
                    Content = label + ":\n"
                });
                return;
            }

            //Should the label and content be on the same line or not?
            var labelSuffix = valueContent.IsSingleLine ? ": " : ":\n";
            infobox.AddItem(new ContentItem
            {
                Links = textExtractor.Links,
                Content = label + labelSuffix
            });
        }

        infobox.AddItems(valueContent.Items);
    }

    /// <summary>
    /// Parses a value cell and classifies the result: no text content, rich
    /// (gemini formatting or several content items), a single line, or several
    /// lines that are rewritten as a bulleted list.
    /// </summary>
    private RichContent ParseRichCell(IElement cell)
    {
        var parser = new HtmlParser
        {
            ConvertListItems = false,
        };
        parser.Parse(cell);

        var items = parser.GetItems();
        var contentItems = items.OfType<ContentItem>().ToList();

        if (contentItems.Count == 0)
        {
            // no interesting content
            return new RichContent
            {
                Items = items,
                NoContent = true
            };
        }
        if (parser.HasGeminiFormatting || contentItems.Count > 1)
        {
            return new RichContent
            {
                Items = EnsureNewline(items)
            };
        }

        //lets see if it has multiple lines or not
        var content = contentItems[0].Content.Trim();

        if (!content.Contains('\n'))
        {
            return new RichContent
            {
                IsSingleLine = true,
                Items = EnsureNewline(items)
            };
        }

        //convert it to lines
        buffer.Reset();
        buffer.Links.Add(contentItems[0]);
        //convert to a list
        foreach (var line in content.Split("\n", StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries))
        {
            buffer.AppendLine($"* {line}");
        }

        //remove all the content items (only 1) since we reformatted that into a list
        items.RemoveAll(x => x is ContentItem);
        items.Add(new ContentItem(buffer));

        return new RichContent
        {
            Items = items
        };
    }

    //ensure that, when a sequence of items is rendered, the last item will make a new line
    //(element type SectionItem is restored from context - TODO confirm against HtmlParser.GetItems)
    private List<SectionItem> EnsureNewline(List<SectionItem> items)
    {
        var lastContent = items.OfType<ContentItem>().LastOrDefault();
        //char overload compares ordinally, so the check does not depend on culture
        if (lastContent != null && !lastContent.Content.EndsWith('\n'))
        {
            lastContent.Content += "\n";
        }
        return items;
    }

    //a single cell spanning the row: emit its parsed content as-is
    private void AddWideValue(IElement valueCell)
    {
        var parser = new HtmlParser
        {
            ConvertListItems = false
        };
        parser.Parse(valueCell);
        infobox.AddItems(EnsureNewline(parser.GetItems()));
    }

    /// <summary>
    /// Normalises a label: trims it, drops a leading "•", drops a trailing ":"
    /// and capitalises the first letter.
    /// </summary>
    private string CleanLabel(string text)
    {
        text = text.Trim();

        //some labels attempt to look like a bulleted list, even though
        //each entry is a different row, and use a "•" character. remove it
        if (text.Length > 1 && text.StartsWith('•'))
        {
            //trim again: "• Foo" would otherwise keep its leading space and
            //escape the capitalisation below
            text = text.Substring(1).TrimStart();
        }
        //some label, especially those that use a TD for the name cell instead of a TH,
        //will include a ":" already in the text, if so remove it
        if (text.Length >= 2 && text.EndsWith(':'))
        {
            text = text.Substring(0, text.Length - 1).TrimEnd();
        }
        if (text.Length > 0)
        {
            //capitalize the first letter
            text = text.Substring(0, 1).ToUpper() + text.Substring(1);
        }
        return text;
    }

    /// <summary>
    /// Picks the infobox title: the table caption if present, otherwise the text
    /// of the first row when that row has exactly one cell (the row is then
    /// flagged to be skipped during row parsing).
    /// </summary>
    /// <returns>The title, or an empty string when none was found.</returns>
    private string ExtractTitle(HtmlElement table, IHtmlCollection<IElement> rows)
    {
        //first check for a caption
        var caption = table.QuerySelector("caption")?.TextContent.Trim();
        if (!String.IsNullOrEmpty(caption))
        {
            return caption;
        }

        if (rows?.Length >= 1 && rows[0].ChildElementCount == 1)
        {
            textExtractor.Extract(rows[0]);

            var title = textExtractor.Content.Trim();
            if (title.Length > 0)
            {
                buffer.Links.Add(textExtractor);
                shouldSkipFirst = true;
                return title;
            }
        }
        return "";
    }

    //are these 2 cells being compared to each other?
    //(detected by a 1px right border in the left cell's inline style)
    private bool IsComparingRow(IElement left, IElement right)
        => left.GetAttribute("style")?.Replace(" ", "").Contains("border-right:1px") ?? false;

    private bool IsHeader(IElement row, int index)
        => (row.ChildElementCount == 1) && row.Children[0].NodeName == "TH";

    private bool IsMedia(IElement row)
        => (row.ChildElementCount == 1) &&
            (row.Children[0].ChildElementCount >= 1) &&
            (row.Children[0].Children?[0].QuerySelector("img") != null);

    private bool IsNestedTable(IElement row)
        => row.QuerySelector("td table tbody") != null;

    private void ParseNestedTable(IElement row)
        => ParseTableRows(row.QuerySelector("td table tbody")?.Children, true);

    /// <summary>
    /// Converts one table row, dispatching on its shape. Rows with no cells, or
    /// with a shape none of the branches handles, are ignored.
    /// </summary>
    /// <exception cref="ApplicationException">The element is not a TR.</exception>
    private void ParseRow(IElement row, int index, bool isNestedTable)
    {
        //the first row of the outer table was already used as the title
        if (!isNestedTable && shouldSkipFirst && index == 0)
        {
            return;
        }
        if (row.NodeName != "TR")
        {
            throw new ApplicationException("Non row in info box");
        }

        if (row.ChildElementCount == 0)
        {
            return;
        }

        if (IsHeader(row, index))
        {
            AddHeader(row);
        }
        else if (IsNestedTable(row))
        {
            ParseNestedTable(row);
        }
        else if (IsMedia(row))
        {
            AddMedia(row);
        }
        else if (row.Children.Length == 1)
        {
            AddWideValue(row.Children[0]);
        }
        else if (row.Children.Length == 2)
        {
            AddTwoCells(row.Children[0], row.Children[1]);
        }
    }

    //parses every row in order; a null collection is treated as empty
    private void ParseTableRows(IHtmlCollection<IElement> rows, bool isNestedTable = false)
    {
        if (rows == null)
        {
            return;
        }
        for (int i = 0; i < rows.Length; i++)
        {
            ParseRow(rows[i], i, isNestedTable);
        }
    }

    //result of parsing one value cell
    private class RichContent
    {
        //true when the cell produced no ContentItem at all
        public bool NoContent = false;
        public List<SectionItem> Items;
        //true when the single content item has no line break
        public bool IsSingleLine = false;
    }

}
-------------------------------------------------------------------------------- /Gemipedia.Cgi/RouteHandler.cs: --------------------------------------------------------------------------------
using System;
using System.Globalization;
using System.IO;
using System.Web;
using Gemini.Cgi;
using Gemipedia.API;
using Gemipedia.API.Models;
using Gemipedia.Converter;
using Gemipedia.Converter.Special;
using Gemipedia.Media;
using Gemipedia.Models;
using Gemipedia.Renderer;

namespace Gemipedia.Cgi;

public static class RouteHandler
{
    #region Routes

    /// <summary>
    /// Full-text article search. Prompts for input when there is no query,
    /// otherwise lists each hit with its optional thumbnail and summary.
    /// </summary>
    public static void Search(CgiWrapper cgi)
    {
        if (!cgi.HasQuery)
        {
            cgi.Input("Search for an Article");
            return;
        }

        cgi.Success($"text/gemini;lang={UserOptions.WikipediaVersion}");
        var outWriter = new CountingTextWriter(cgi.Writer);

        outWriter.WriteLine($"Articles containing '{cgi.SantiziedQuery}'.");
        var searchResults = client.Search(cgi.SantiziedQuery);
        if (searchResults.Count == 0)
        {
            //TODO use "suggest API here
            outWriter.WriteLine("No results found.");
            return;
        }

        int counter = 0;
        foreach (var result in searchResults)
        {
            counter++;
            outWriter.WriteLine($"=> {RouteOptions.ArticleUrl(result.Title)} {counter}. {result.Title}");
            if (!string.IsNullOrEmpty(result.ThumbnailUrl))
            {
                outWriter.WriteLine($"=> {RouteOptions.MediaProxyUrl(result.ThumbnailUrl)} Featured Image: {result.Title}");
            }
            outWriter.WriteLine($">{result.SummaryText}");
            outWriter.WriteLine();
        }
        RenderFooter(outWriter, client.DownloadTimeMs);
    }

    /// <summary>
    /// Lists articles near the "lat"/"lon" given in the query string (each
    /// defaults to 0 when absent). Redirects to the welcome page without a
    /// query and answers with a bad request when a coordinate is not a number.
    /// </summary>
    public static void SearchLatLon(CgiWrapper cgi)
    {
        if (!cgi.HasQuery)
        {
            cgi.Redirect(RouteOptions.WelcomeUrl());
            return;
        }

        var query = HttpUtility.ParseQueryString(cgi.Query);

        //validate before the success header goes out, and parse with the
        //invariant culture so "51.5" means the same under any server locale
        if (!double.TryParse(query["lat"] ?? "0", NumberStyles.Float, CultureInfo.InvariantCulture, out double lat) ||
            !double.TryParse(query["lon"] ?? "0", NumberStyles.Float, CultureInfo.InvariantCulture, out double lon))
        {
            cgi.BadRequest("Invalid latitude or longitude");
            return;
        }
        var title = query["title"] ?? "";

        cgi.Success($"text/gemini;lang={UserOptions.WikipediaVersion}");
        var outWriter = new CountingTextWriter(cgi.Writer);

        outWriter.WriteLine($"Articles near '{title}'");

        var searchResults = client.GeoSearch(lat, lon);
        if (searchResults.Count == 0)
        {
            outWriter.WriteLine("No results found.");
            return;
        }

        int counter = 0;
        foreach (var result in searchResults)
        {
            counter++;
            outWriter.WriteLine($"=> {RouteOptions.ArticleUrl(result.Title)} {counter}. {result.Title}");
            outWriter.WriteLine($"* Distance away: {result.Distance} m");
            outWriter.WriteLine();
        }
        RenderFooter(outWriter, client.DownloadTimeMs);
    }

    /// <summary>
    /// Language picker: shows the current language, one link per common
    /// language and a link to enter a specific code.
    /// </summary>
    public static void SelectLanguage(CgiWrapper cgi)
    {
        cgi.Success();
        var outWriter = new CountingTextWriter(cgi.Writer);

        outWriter.WriteLine("# Gemipedia");
        //NOTE(review): user-facing text contains "choosen" and cites ISO 3166 (a
        //country-code standard) for language codes - left untouched, worth a copy edit
        outWriter.WriteLine("Gemipedia supports all of the languages that have a Wikipedia. The Gemipedia interface will be in English, and all article content, references, images, and featured content will be in the choosen language. You can select a language below, or use a specific language by providing a 2 letter ISO 3166 code");
        //force English for this list
        outWriter.WriteLine($"=> {RouteOptions.ArticleUrl("List of Wikipedias", "en")} List of available Wikipedias");
        outWriter.WriteLine("");
        outWriter.WriteLine($"Current Language: {UserOptions.LangaugeName}");

        foreach (var lang in LanguageUtils.CommonLanguages)
        {
            outWriter.WriteLine($"=> {RouteOptions.WelcomeUrl(lang)} Use {LanguageUtils.GetName(lang)}");
        }
        outWriter.WriteLine($"=> {RouteOptions.SetLanguageUrl()} Set specific language");

        RenderFooter(outWriter);
    }

    /// <summary>
    /// Redirects to the welcome page for the language code in the query when it
    /// is valid; otherwise prompts for a code.
    /// </summary>
    public static void SetLanguage(CgiWrapper cgi)
    {
        if (cgi.HasQuery && LanguageUtils.IsValidCode(cgi.Query))
        {
            cgi.Redirect(RouteOptions.WelcomeUrl(cgi.Query));
            return;
        }
        cgi.Input("Enter 2 letter ISO 3166 language code to use");
    }

    /// <summary>
    /// Landing page: navigation links, the featured-content link and a fixed
    /// list of example articles.
    /// </summary>
    public static void Welcome(CgiWrapper cgi)
    {
        cgi.Success($"text/gemini;lang={UserOptions.WikipediaVersion}");
        var outWriter = new CountingTextWriter(cgi.Writer);

        outWriter.WriteLine("# Gemipedia");
        outWriter.WriteLine("Welcome to Gemipedia: A Gemini frontend to Wikipedia, focused on providing a 1st class reading experience.");
        outWriter.WriteLine("");
        outWriter.WriteLine($"=> {RouteOptions.SelectLanguageUrl()} Using {UserOptions.LangaugeName} Wikipedia. Change Language?");
        outWriter.WriteLine($"=> {RouteOptions.ArticleUrl()} Go to Article");
        outWriter.WriteLine($"=> {RouteOptions.SearchUrl()} Search for Articles containing a phrase");
        outWriter.WriteLine($"=> {RouteOptions.RandomArticleUrl()} 🎲 Random Article");
        outWriter.WriteLine("");

        outWriter.WriteLine("## Featured Content");
        if (UserOptions.WikipediaVersion == "en")
        {
            outWriter.WriteLine($"=> {RouteOptions.FeaturedContent()} Featured Article and 25 most popular articles (updated daily)");
        }
        else
        {
            outWriter.WriteLine($"=> {RouteOptions.FeaturedContent()} Featured Article and 25 most popular articles on {UserOptions.LangaugeName} Wikipedia (updated daily)");
        }

        outWriter.WriteLine("## Article Examples:");
        string[] exampleTitles =
        {
            "History of Apple Inc.",
            "Blue Poles",
            "Gemini (protocol)",
            "Computer network",
            "Interface Message Processor",
            "ALOHAnet",
        };
        foreach (var exampleTitle in exampleTitles)
        {
            outWriter.WriteLine($"=> {RouteOptions.ArticleUrl(exampleTitle)} {exampleTitle}");
        }
    }

    /// <summary>
    /// Shows the daily featured article and the most viewed articles, with a
    /// placeholder line for whichever part is unavailable.
    /// </summary>
    public static void ViewFeatured(CgiWrapper cgi)
    {
        cgi.Success($"text/gemini;lang={UserOptions.WikipediaVersion}");
        var outWriter = new CountingTextWriter(cgi.Writer);

        //invariant culture keeps the date in plain yyyy-MM-dd digits whatever the server locale
        outWriter.WriteLine($"# Gemipedia Featured Content {DateTime.Now.ToString("yyyy-MM-dd", CultureInfo.InvariantCulture)}");
        //NOTE(review): "from the from page" is presumably meant to read "front page"
        outWriter.WriteLine("Compelling content pulled every day from the from page of Wikipedia");

        outWriter.WriteLine("## Daily Featured Article");

        var featured = client.GetFeaturedContent();

        if (featured.FeaturedArticle != null)
        {
            outWriter.WriteLine($"=> {RouteOptions.ArticleUrl(featured.FeaturedArticle.Title)} {featured.FeaturedArticle.Title}");
            if (!string.IsNullOrEmpty(featured.FeaturedArticle.ThumbnailUrl))
            {
                outWriter.WriteLine($"=> {RouteOptions.MediaProxyUrl(featured.FeaturedArticle.ThumbnailUrl)} Featured Image: {featured.FeaturedArticle.Title}");
            }
            outWriter.WriteLine($">{featured.FeaturedArticle.Excerpt}");
            outWriter.WriteLine();
        }
        else
        {
            outWriter.WriteLine("(Featured article was unavailable)");
        }

        outWriter.WriteLine("### 25 most viewed articles on Wikipedia today");

        if (featured.PopularArticles.Count > 0)
        {
            int counter = 0;
            foreach (var article in featured.PopularArticles)
            {
                counter++;
                outWriter.WriteLine($"=> {RouteOptions.ArticleUrl(article.Title)} {counter}. {article.Title}");
                if (!string.IsNullOrEmpty(article.ThumbnailUrl))
                {
                    outWriter.WriteLine($"=> {RouteOptions.MediaProxyUrl(article.ThumbnailUrl)} Featured Image: {article.Title}");
                }
                outWriter.WriteLine($">{article.SummaryText}");
                outWriter.WriteLine();
            }
        }
        else
        {
            outWriter.WriteLine("(Daily popular articles were unavailable)");
        }
        RenderFooter(outWriter, client.DownloadTimeMs);
    }

    /// <summary>
    /// Renders the geo view for the geohack data in the query, or answers with
    /// a bad request when that data is not valid.
    /// </summary>
    public static void ViewGeo(CgiWrapper cgi)
    {
        var geoparser = new GeohackParser(cgi.Query);
        if (!geoparser.IsValid)
        {
            cgi.BadRequest("Invalid geo information");
            return;
        }

        cgi.Success($"text/gemini;lang={UserOptions.WikipediaVersion}");
        var outWriter = new CountingTextWriter(cgi.Writer);

        var renderer = new GeoRenderer();
        renderer.RenderGeo(geoparser, outWriter);
        RenderFooter(outWriter);
    }

    /// <summary>
    /// Redirects to a randomly chosen article.
    /// </summary>
    public static void ViewRandomArticle(CgiWrapper cgi)
    {
        string title = client.GetRandomArticleTitle();
        cgi.Redirect(RouteOptions.ArticleUrl(title));
    }

    public static void ViewArticle(CgiWrapper cgi)
    {
        if (!cgi.HasQuery)
        {
            cgi.Input("Article Name? (doesn't need to be exact)");
            return;
        }

        var outWriter = new CountingTextWriter(cgi.Writer);

        Article article = GetArticle(cgi);
        try
        {
            if (article != null)
            {
                if (RedirectParser.IsArticleRedirect(article.HtmlText))
                {
                    cgi.Redirect(RouteOptions.ArticleUrl(RedirectParser.GetRedirectTitle(article.HtmlText)));
                    return;
                }

                cgi.Success($"text/gemini;lang={UserOptions.WikipediaVersion}");

                ParsedPage parsedPage = converter.Convert(article.Title, article.HtmlText);
                RenderArticle(parsedPage, outWriter);
            }
            else
            {
                //redirect to search...
                cgi.Redirect(RouteOptions.SearchUrl(cgi.Query));
                return;
            }
        }
        catch (Exception ex)
        {
            outWriter.WriteLine("Boom!
Hit Exception!"); 276 | outWriter.WriteLine("```"); 277 | outWriter.WriteLine(ex.Message); 278 | outWriter.WriteLine(ex.Source); 279 | outWriter.WriteLine(ex.StackTrace); 280 | outWriter.WriteLine("```"); 281 | } 282 | RenderFooter(outWriter, client.DownloadTimeMs, converter.ConvertTimeMs); 283 | } 284 | 285 | public static void ViewImages(CgiWrapper cgi) 286 | { 287 | var outWriter = new CountingTextWriter(cgi.Writer); 288 | 289 | Article article = GetArticle(cgi); 290 | 291 | if (article != null) 292 | { 293 | if (RedirectParser.IsArticleRedirect(article.HtmlText)) 294 | { 295 | cgi.Redirect(RouteOptions.ImageGalleryUrl(RedirectParser.GetRedirectTitle(article.HtmlText))); 296 | return; 297 | } 298 | 299 | cgi.Success($"text/gemini;lang={UserOptions.WikipediaVersion}"); 300 | ParsedPage page = converter.Convert(article.Title, article.HtmlText); 301 | var gallery = new GalleryRenderer(); 302 | gallery.RenderGallery(page, outWriter); 303 | } 304 | else 305 | { 306 | cgi.Success(); 307 | outWriter.WriteLine("We could not access that article"); 308 | } 309 | RenderFooter(outWriter, client.DownloadTimeMs, converter.ConvertTimeMs); 310 | } 311 | 312 | public static void ViewOtherLanguages(CgiWrapper cgi) 313 | { 314 | var title = cgi.SantiziedQuery; 315 | var otherLangs = client.GetOtherLanguages(title); 316 | 317 | cgi.Success(); 318 | var outWriter = new CountingTextWriter(cgi.Writer); 319 | 320 | outWriter.WriteLine($"# Other Languages"); 321 | outWriter.WriteLine($"The article '{title}' is available in {otherLangs.Count} other languages"); 322 | if (otherLangs.Count == 0) 323 | { 324 | outWriter.WriteLine("No languages found."); 325 | return; 326 | } 327 | else 328 | { 329 | foreach (var lang in otherLangs) 330 | { 331 | outWriter.WriteLine($"=> {RouteOptions.ArticleUrl(lang.Title, lang.LanguageCode)} {LanguageUtils.GetName(lang.LanguageCode)} - {lang.Title}"); 332 | } 333 | } 334 | RenderFooter(outWriter, client.DownloadTimeMs); 335 | } 336 | 337 | public static 
void ViewRefs(CgiWrapper cgi) 338 | { 339 | var query = HttpUtility.ParseQueryString(cgi.RawQuery); 340 | var title = query["name"] ?? ""; 341 | var section = Convert.ToInt32(query["section"] ?? "-1"); 342 | 343 | Article article = GetArticle(title); 344 | var outWriter = new CountingTextWriter(cgi.Writer); 345 | 346 | if (article != null) 347 | { 348 | cgi.Success($"text/gemini;lang={UserOptions.WikipediaVersion}"); 349 | ParsedPage parsedPage = converter.Convert(article.Title, article.HtmlText); 350 | var refs = new ReferencesRenderer(); 351 | refs.RenderReferences(parsedPage, outWriter, section); 352 | } 353 | else 354 | { 355 | cgi.Success(); 356 | outWriter.WriteLine("We could not access that article"); 357 | } 358 | RenderFooter(outWriter, client.DownloadTimeMs, converter.ConvertTimeMs); 359 | } 360 | 361 | public static void ProxyMedia(CgiWrapper cgi) 362 | { 363 | var url = cgi.Query; 364 | if (!IsSafeMediaUrl(url)) 365 | { 366 | cgi.Missing("cannot fetch media"); 367 | return; 368 | } 369 | MediaContent media = MediaProcessor.ProcessImage(client.GetMedia(url)); 370 | cgi.Success(media.MimeType); 371 | cgi.Out.Write(media.Data); 372 | } 373 | 374 | 375 | #endregion 376 | 377 | static WikipediaApiClient client = new WikipediaApiClient(UserOptions.WikipediaVersion); 378 | static WikiHtmlConverter converter = new WikiHtmlConverter(); 379 | 380 | private static Article GetArticle(CgiWrapper cgi) 381 | => GetArticle(cgi.SantiziedQuery); 382 | 383 | private static Article GetArticle(string title) 384 | => client.GetArticle(title); 385 | 386 | private static void RenderArticle(ParsedPage page, TextWriter output) 387 | { 388 | var renderer = new ArticleRenderer(); 389 | renderer.RenderArticle(page, output); 390 | } 391 | 392 | static bool IsSafeMediaUrl(string url) 393 | { 394 | try 395 | { 396 | var host = (new Uri(url)).Host; ; 397 | return host == "wikimedia.org" || host.EndsWith(".wikimedia.org"); 398 | } 399 | catch (Exception) 400 | { } 401 | 402 | return 
false; 403 | } 404 | 405 | static void RenderFooter(CountingTextWriter outWriter, long? downloadTimeMs = null, long? convertTimeMs = null) 406 | { 407 | outWriter.WriteLine(); 408 | outWriter.WriteLine("--"); 409 | outWriter.WriteLine($"=> {RouteOptions.WelcomeUrl()} Gemipedia Home"); 410 | outWriter.WriteLine($"=> {RouteOptions.ArticleUrl()} Go to Article"); 411 | outWriter.WriteLine($"=> {RouteOptions.SelectLanguageUrl()} Using {UserOptions.LangaugeName} Wikipedia. Change Language?"); 412 | outWriter.WriteLine("--"); 413 | 414 | 415 | if (downloadTimeMs != null) 416 | { 417 | int outputSize = Convert.ToInt32(outWriter.ByteCount); 418 | outWriter.WriteLine($"Size: {ReadableFileSize(outputSize)}. {Savings(outputSize, client.DownloadSize)} smaller than original: {ReadableFileSize(client.DownloadSize)} 🤮"); 419 | } 420 | 421 | if (downloadTimeMs != null || convertTimeMs != null) 422 | { 423 | if (downloadTimeMs != null) 424 | { 425 | outWriter.Write($"Fetched: {downloadTimeMs} ms. "); 426 | } 427 | if (convertTimeMs != null) 428 | { 429 | outWriter.Write($"Converted: {convertTimeMs} ms. 
"); 430 | } 431 | outWriter.WriteLine("🐇"); 432 | } 433 | outWriter.WriteLine("=> mailto:acidus@gemi.dev Made with 📚 and ❤️ by Acidus"); 434 | outWriter.WriteLine("All Wikipedia content is licensed under CC BY-SA 3.0"); 435 | } 436 | 437 | 438 | private static string Savings(int newSize, int originalSize) 439 | => string.Format("{0:0.00}%", (1.0d - (Convert.ToDouble(newSize) / Convert.ToDouble(originalSize))) * 100.0d); 440 | 441 | private static string ReadableFileSize(double size, int unit = 0) 442 | { 443 | string[] units = { "B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB" }; 444 | 445 | while (size >= 1024) 446 | { 447 | size /= 1024; 448 | ++unit; 449 | } 450 | 451 | return string.Format("{0:0.0#} {1}", size, units[unit]); 452 | } 453 | } -------------------------------------------------------------------------------- /Gemipedia/Converter/HtmlParser.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using AngleSharp.Dom; 5 | using AngleSharp.Html.Dom; 6 | using Gemipedia.Converter.Filter; 7 | using Gemipedia.Converter.Special; 8 | using Gemipedia.Models; 9 | 10 | namespace Gemipedia.Converter; 11 | 12 | /// 13 | /// parses HTML nodes into Section Items 14 | /// 15 | public class HtmlParser 16 | { 17 | private static readonly string[] blockElements = new string[] { "address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "p", "pre", "section", "table", "tfoot", "ul", "video" }; 18 | 19 | private List items = new List(); 20 | 21 | private int listDepth = 0; 22 | 23 | Buffer buffer = new Buffer(); 24 | 25 | private bool inPreformatted = false; 26 | private bool inMathformula = false; 27 | 28 | public bool HasGeminiFormatting { get; private set; } = false; 29 | 30 | /// 31 | /// 
should we try and convert list items to links? 32 | /// 33 | public bool ConvertListItems { get; set; } = true; 34 | 35 | public void Parse(INode current) 36 | { 37 | ParseHelper(current); 38 | } 39 | 40 | public List GetItems() 41 | { 42 | FlushBuffer(); 43 | return items; 44 | } 45 | 46 | private void AddItem(SectionItem item) 47 | { 48 | if(item != null) 49 | { 50 | FlushBuffer(); 51 | items.Add(item); 52 | } 53 | } 54 | 55 | private void AddItems(IEnumerable newItems) 56 | { 57 | if (newItems?.Count() > 0) 58 | { 59 | FlushBuffer(); 60 | items.AddRange(newItems); 61 | } 62 | } 63 | 64 | private void FlushBuffer() 65 | { 66 | if (buffer.HasContent) 67 | { 68 | items.Add(new ContentItem(buffer)); 69 | buffer.Reset(); 70 | } 71 | } 72 | 73 | private void ParseHelper(INode current) 74 | { 75 | switch (current.NodeType) 76 | { 77 | case NodeType.Text: 78 | ProcessTextNode(current); 79 | break; 80 | 81 | case NodeType.Element: 82 | ProcessHtmlElement(current as HtmlElement); 83 | break; 84 | } 85 | } 86 | 87 | private void ParseChildern(INode node) 88 | { 89 | foreach (var child in node.ChildNodes) 90 | { 91 | ParseHelper(child); 92 | } 93 | } 94 | 95 | private void ProcessTextNode(INode textNode) 96 | { 97 | if (inPreformatted) 98 | { 99 | buffer.Append(textNode.TextContent); 100 | } 101 | else 102 | { 103 | //if its not only whitespace add it. 
104 | if (textNode.TextContent.Trim().Length > 0) 105 | { 106 | if (buffer.AtLineStart) 107 | { 108 | buffer.Append(textNode.TextContent.TrimStart()); 109 | } 110 | else 111 | { 112 | buffer.Append(textNode.TextContent); 113 | } 114 | } 115 | //if its whitepsace, but doesn't have a newline 116 | else if (!textNode.TextContent.Contains('\n')) 117 | { 118 | if (buffer.AtLineStart) 119 | { 120 | buffer.Append(textNode.TextContent.TrimStart()); 121 | } 122 | else 123 | { 124 | buffer.Append(textNode.TextContent); 125 | } 126 | } 127 | } 128 | } 129 | 130 | private void ProcessHtmlElement(HtmlElement element) 131 | { 132 | var nodeName = element?.NodeName.ToLower(); 133 | 134 | if (!ShouldProcessElement(element, nodeName)) 135 | { 136 | return; 137 | } 138 | 139 | switch (nodeName) 140 | { 141 | case "a": 142 | ProcessAnchor(element); 143 | break; 144 | 145 | case "blockquote": 146 | HasGeminiFormatting = true; 147 | buffer.EnsureAtLineStart(); 148 | buffer.InBlockquote = true; 149 | ParseChildern(element); 150 | buffer.InBlockquote = false; 151 | break; 152 | 153 | case "br": 154 | buffer.AppendLine(); 155 | break; 156 | 157 | case "dd": 158 | HasGeminiFormatting = true; 159 | buffer.EnsureAtLineStart(); 160 | buffer.SetLineStart("* "); 161 | ParseChildern(element); 162 | buffer.EnsureAtLineStart(); 163 | break; 164 | 165 | case "div": 166 | ProcessDiv(element); 167 | break; 168 | 169 | case "dt": 170 | buffer.EnsureAtLineStart(); 171 | ParseChildern(element); 172 | if (!buffer.AtLineStart) 173 | { 174 | buffer.AppendLine(":"); 175 | } 176 | break; 177 | 178 | case "figure": 179 | ProcessFigure(element); 180 | break; 181 | 182 | case "i": 183 | if (ShouldUseItalics(element)) 184 | { 185 | buffer.Append("\""); 186 | ParseChildern(element); 187 | buffer.Append("\""); 188 | } 189 | else 190 | { 191 | ParseChildern(element); 192 | } 193 | break; 194 | 195 | case "li": 196 | ProcessLi(element); 197 | break; 198 | 199 | case "ol": 200 | ProcessList(element); 201 | break; 202 
| 203 | case "p": 204 | buffer.EnsureAtLineStart(); 205 | int size = buffer.Content.Length; 206 | ParseChildern(element); 207 | //make sure the paragraph ends with a new line 208 | buffer.EnsureAtLineStart(); 209 | if (buffer.Content.Length > size) 210 | { 211 | //add another blank line if this paragraph had content 212 | buffer.AppendLine(); 213 | } 214 | break; 215 | 216 | case "pre": 217 | HasGeminiFormatting = true; 218 | buffer.EnsureAtLineStart(); 219 | buffer.AppendLine("```"); 220 | inPreformatted = true; 221 | ParseChildern(element); 222 | buffer.EnsureAtLineStart(); 223 | inPreformatted = false; 224 | buffer.AppendLine("```"); 225 | break; 226 | 227 | case "sub": 228 | ProcessSub(element); 229 | break; 230 | 231 | case "sup": 232 | ProcessSup(element); 233 | break; 234 | 235 | case "table": 236 | ProcessTable(element); 237 | break; 238 | 239 | case "u": 240 | buffer.Append("_"); 241 | ParseChildern(element); 242 | buffer.Append("_"); 243 | break; 244 | 245 | case "ul": 246 | ProcessUl(element); 247 | break; 248 | 249 | default: 250 | ProcessGenericTag(element); 251 | break; 252 | } 253 | } 254 | 255 | public static bool ShouldProcessElement(HtmlElement element,string normalizedTagName) 256 | { 257 | //A MathElement is of type element, but it not an HtmlElement 258 | //so it will be null 259 | if (element == null) 260 | { 261 | return false; 262 | } 263 | 264 | if(element.ClassName?.Contains("navigation") ?? false) 265 | { 266 | return false; 267 | } 268 | 269 | //see if we are explicitly filtering 270 | if (!DomFilter.Global.IsElementAllowed(element, normalizedTagName)) 271 | { 272 | return false; 273 | } 274 | 275 | //is it visible? 276 | if (IsInvisible(element)) 277 | { 278 | return false; 279 | } 280 | 281 | return true; 282 | } 283 | 284 | //should we use apply italic formatting around this element? 
/// <summary>
/// Decides whether an italic element should get italic-style emphasis
/// (the caller wraps the content in double quotes when this returns true).
/// </summary>
/// <param name="element">The italic element being processed.</param>
/// <returns>
/// False while inside a math formula, or when the next sibling element is a
/// sub/sup tag; true otherwise.
/// </returns>
private bool ShouldUseItalics(HtmlElement element)
{
    //if we are already inside a math formula, don't do italics
    if (inMathformula)
    {
        return false;
    }

    //a variable immediately followed by a subscript/superscript is left bare.
    //Ordinal comparison avoids culture-specific casing rules on tag names.
    var siblingTag = element.NextElementSibling?.NodeName;
    return !string.Equals(siblingTag, "sub", StringComparison.OrdinalIgnoreCase)
        && !string.Equals(siblingTag, "sup", StringComparison.OrdinalIgnoreCase);
}

/// <summary>
/// Checks whether the element's inline style attribute contains the given
/// declaration, ignoring whitespace and letter case so that
/// "display: none" and "DISPLAY:NONE" both match "display:none".
/// </summary>
/// <param name="element">Element whose style attribute is inspected.</param>
/// <param name="declaration">Whitespace-free declaration text to look for.</param>
private static bool HasStyleDeclaration(HtmlElement element, string declaration)
{
    var style = element.GetAttribute("style");
    if (string.IsNullOrEmpty(style))
    {
        return false;
    }

    var compact = string.Concat(style.Where(c => !char.IsWhiteSpace(c)));
    return compact.Contains(declaration, StringComparison.OrdinalIgnoreCase);
}

/// <summary>
/// True when the element's inline style hides it with display:none.
/// </summary>
private static bool IsInvisible(HtmlElement element)
    => HasStyleDeclaration(element, "display:none");

/// <summary>
/// Handles an anchor: geo links become their own item, Wikidata links are
/// dropped together with their children, and every other anchor is recorded
/// in the buffer's link collection. Children are parsed for geo and regular
/// anchors.
/// </summary>
private void ProcessAnchor(HtmlElement anchor)
{
    if (GeoParser.IsGeoLink(anchor))
    {
        AddItem(GeoParser.ParseGeo(anchor));
    }
    else if (IsWikiDataLink(anchor))
    {
        //we don't want to process the children if this links to Wikidata
        return;
    }
    else
    {
        buffer.Links.Add(anchor);
    }

    ParseChildern(anchor);
}

/// <summary>
/// Handles a div, dispatching the special Wikipedia constructs (legacy media
/// thumbnails, montages, navigation notes, timelines) to their converters
/// and falling back to generic tag handling for everything else.
/// </summary>
private void ProcessDiv(HtmlElement div)
{
    // Is this a legacy media div, that is also not a location map
    // https://www.mediawiki.org/wiki/Parsoid/Parser_Unification/Media_structure/FAQ
    if (div.ClassList.Contains("thumb") && !div.ClassList.Contains("locmap"))
    {
        var caption = div.QuerySelector(".thumbcaption");
        if (div.ClassList.Contains("tmulti"))
        {
            AddItems(MediaParser.ConvertMontage(div, caption));
        }
        else
        {
            AddItem(MediaParser.ConvertMedia(div, caption));
        }
        return;
    }

    //a navigation note?
    if (div.GetAttribute("role") == "note" && div.ClassList.Contains("navigation-not-searchable"))
    {
        AddItem(NavigationParser.ConvertNavigationNote(div));
        return;
    }

    if (div.ClassList.Contains("timeline-wrapper"))
    {
        AddItem(MediaParser.ConvertTimeline(div));
        return;
    }

    //fall through to generic handling
    ProcessGenericTag(div);
}

/// <summary>
/// Handles a figure element. Only thumbnails in the newer media markup
/// (typeof="mw:File/Thumb") are converted; any other figure is skipped
/// and its children are not parsed.
/// </summary>
private void ProcessFigure(HtmlElement figure)
{
    //Support the new markup output for images
    //see: https://www.mediawiki.org/wiki/Parsoid/Parser_Unification/Media_structure/FAQ
    if (figure.GetAttribute("typeof") == "mw:File/Thumb")
    {
        AddItem(MediaParser.ConvertMedia(figure, figure.QuerySelector("figcaption")));
    }
}

/// <summary>
/// Default handling for any element without a dedicated case: math elements
/// are converted on their own line, "texhtml" formulas are wrapped in double
/// quotes, and everything else has its children parsed either as a block
/// (surrounded by line starts) or inline.
/// </summary>
private void ProcessGenericTag(HtmlElement element)
{
    //is this a math element?
    if (element.ClassList.Contains("mwe-math-element"))
    {
        HasGeminiFormatting = true;
        //math elements have to be displayed at the start of the line
        buffer.EnsureAtLineStart();
        buffer.AppendLine(MathConverter.ConvertMath(element));
        return;
    }

    //the flag stops nested texhtml/italic elements from adding more quotes
    if (element.ClassList.Contains("texhtml") && !inMathformula)
    {
        inMathformula = true;
        buffer.Append("\"");
        ParseChildern(element);
        buffer.Append("\"");
        inMathformula = false;
        return;
    }

    if (ShouldDisplayAsBlock(element))
    {
        buffer.EnsureAtLineStart();
        ParseChildern(element);
        buffer.EnsureAtLineStart();
    }
    else
    {
        ParseChildern(element);
    }
}

/// <summary>
/// Handles a list item. The item is emitted as a link line when possible;
/// otherwise it becomes a bullet whose prefix is "* " at list depth 1 and
/// "* * " at any other depth.
/// </summary>
private void ProcessLi(HtmlElement li)
{
    if (TryConvertingToLink(li))
    {
        return;
    }

    HasGeminiFormatting = true;
    buffer.EnsureAtLineStart();
    buffer.SetLineStart(listDepth == 1 ? "* " : "* * ");
    ParseChildern(li);
    buffer.EnsureAtLineStart();
}

/// <summary>
/// Handles a list container as a block element, tracking nesting depth while
/// its children are parsed.
/// </summary>
private void ProcessList(HtmlElement element)
{
    //block element
    buffer.EnsureAtLineStart();
    listDepth++;
    ParseChildern(element);
    listDepth--;
    buffer.EnsureAtLineStart();
}

/// <summary>
/// Extracts the text of a sub/sup element with newlines collapsed and images
/// not converted.
/// </summary>
/// <param name="element">The sub or sup element.</param>
/// <param name="content">Receives the extracted text, trimmed.</param>
/// <returns>The extractor, so the caller can register the links it collected.</returns>
private static TextExtractor ExtractScriptText(HtmlElement element, out string content)
{
    var textExtractor = new TextExtractor
    {
        ShouldCollapseNewlines = true,
        ShouldConvertImages = false,
    };
    textExtractor.Extract(element);
    content = textExtractor.Content.Trim();
    return textExtractor;
}

/// <summary>
/// Writes script text that could not be converted to dedicated characters:
/// the marker followed by the text, parenthesised when longer than one character.
/// </summary>
private void AppendScriptFallback(string marker, string content)
{
    if (content.Length == 1)
    {
        buffer.Append(marker);
        buffer.Append(content);
    }
    else
    {
        buffer.Append(marker + "(");
        buffer.Append(content);
        buffer.Append(")");
    }
}

/// <summary>
/// Handles a sub element: appends the converted subscript text when the
/// converter succeeds, otherwise falls back to a "˅" marker. Empty
/// subscripts produce no output.
/// </summary>
private void ProcessSub(HtmlElement element)
{
    var textExtractor = ExtractScriptText(element, out var content);
    if (content.Length == 0)
    {
        return;
    }

    var subConverter = new SubscriptConverter();
    if (subConverter.Convert(content))
    {
        //we successfully converted everything
        buffer.Append(subConverter.Converted);
    }
    else
    {
        //couldn't convert, fall back to using ˅...
        AppendScriptFallback("˅", content);
    }
    buffer.Links.Add(textExtractor);
}

/// <summary>
/// Handles a sup element: appends the converted superscript text when the
/// converter succeeds, otherwise falls back to a "^" marker. Empty
/// superscripts produce no output.
/// </summary>
private void ProcessSup(HtmlElement element)
{
    var textExtractor = ExtractScriptText(element, out var content);
    if (content.Length == 0)
    {
        return;
    }

    var supConverter = new SuperscriptConverter();
    if (supConverter.Convert(content))
    {
        //we successfully converted everything
        buffer.Append(supConverter.Converted);
    }
    else
    {
        //couldn't convert, fall back to using ^...
        AppendScriptFallback("^", content);
    }
    buffer.Links.Add(textExtractor);
}

/// <summary>
/// Handles a table: infoboxes go to the infobox parser, multi-column layout
/// tables have their cells parsed as normal content, and every other table
/// is converted as a data table.
/// </summary>
private void ProcessTable(HtmlElement table)
{
    if (InfoboxParser.IsInfobox(table))
    {
        InfoboxParser parser = new InfoboxParser();
        AddItem(parser.Parse(table));
        return;
    }

    //is it a table just used to create a multicolumn view?
    if (IsMulticolumnLayoutTable(table))
    {
        ParseMulticolmnTable(table);
        return;
    }

    HasGeminiFormatting = true;
    //treat everything else like a table
    AddItem(WikiTableConverter.ConvertWikiTable(table));
}

/// <summary>
/// Handles an unordered list, converting it as a media gallery when it
/// carries the "gallery" class and as a regular list otherwise.
/// </summary>
private void ProcessUl(HtmlElement ul)
{
    //gallery?
    if (ul.ClassList.Contains("gallery"))
    {
        AddItems(MediaParser.ConvertGallery(ul));
        return;
    }

    ProcessList(ul);
}

/// <summary>
/// True when the element's tag is in the block element list and its inline
/// style does not force it to display inline.
/// </summary>
public static bool ShouldDisplayAsBlock(HtmlElement element)
{
    //invariant casing: culture-sensitive lowering can turn "DIV" into "dıv"
    var nodeName = element.NodeName.ToLowerInvariant();
    if (!blockElements.Contains(nodeName))
    {
        return false;
    }
    //its a block, display it as inline?
    return !IsInline(element);
}

/// <summary>
/// True when the element's inline style contains a display:inline declaration
/// (this substring match also covers values such as display:inline-block).
/// </summary>
private static bool IsInline(HtmlElement element)
    => HasStyleDeclaration(element, "display:inline");

/// <summary>
/// True when the table is a presentational "multicol" table whose first
/// element child is a TBODY that in turn starts with a TR.
/// </summary>
private bool IsMulticolumnLayoutTable(HtmlElement element)
{
    if (element.GetAttribute("role") != "presentation" || !element.ClassList.Contains("multicol"))
    {
        return false;
    }

    //use element children rather than HasChildNodes: a node list made only of
    //text/comment nodes has no element to index into
    var body = element.FirstElementChild;
    return body?.NodeName == "TBODY"
        && body.FirstElementChild?.NodeName == "TR";
}

//does an anchor point to Wikidata?
/// <summary>
/// True when the anchor's href contains the Wikidata host path.
/// </summary>
private bool IsWikiDataLink(HtmlElement element)
    => element.GetAttribute("href")?.Contains("//www.wikidata.org/") ?? false;

/// <summary>
/// Parses the content of every TD cell in the first row of a multi-column
/// layout table. Expects the table to have already passed
/// IsMulticolumnLayoutTable, which guarantees the TBODY/TR structure
/// indexed here.
/// </summary>
private void ParseMulticolmnTable(HtmlElement table)
{
    //snapshot the cells before parsing, as the original ToList() did
    var cells = table.Children[0].Children[0].Children
        .Where(x => x.NodeName == "TD")
        .ToList();

    foreach (var cell in cells)
    {
        ParseChildern(cell);
    }
}

/// <summary>
/// See if a list element can be converted to a link and output it to the buffer.
/// Returns if list items was converted to a link or not
/// </summary>
/// <param name="li">The list item element to inspect.</param>
/// <returns>True when the item was written as a link line; false when nothing was written.</returns>
private bool TryConvertingToLink(HtmlElement li)
{
    if (!ConvertListItems)
    {
        return false;
    }

    //if an list item starts with a link, make it a link
    var firstLink = li.QuerySelector("a");
    if (firstLink == null || !ArticleLinkCollection.ShouldUseLink(firstLink))
    {
        return false;
    }

    //ordinal: a culture-sensitive prefix test ignores zero-width/ignorable
    //characters and could match text that does not really start with the link
    if (!li.TextContent.StartsWith(firstLink.TextContent, StringComparison.Ordinal))
    {
        return false;
    }

    HasGeminiFormatting = true;
    buffer.EnsureAtLineStart();
    buffer.SetLineStart($"=> {RouteOptions.ArticleUrl(firstLink.GetAttribute("title"))} ");
    ParseChildern(li);
    buffer.EnsureAtLineStart();
    return true;
}
}

--------------------------------------------------------------------------------