");
15 |
/// <summary>
/// Runs the <c>redirectTitle</c> pattern against the supplied HTML.
/// </summary>
/// <param name="html">HTML text to search.</param>
/// <returns>
/// The value of the first capture group when the pattern matches;
/// otherwise an empty string.
/// </returns>
public static string GetRedirectTitle(string html)
{
    Match found = redirectTitle.Match(html);
    return found.Success ? found.Groups[1].Value : "";
}
25 | }
--------------------------------------------------------------------------------
/Gemipedia/Converter/Special/NavigationParser.cs:
--------------------------------------------------------------------------------
1 | using AngleSharp.Html.Dom;
2 | using Gemipedia.Models;
3 |
4 | namespace Gemipedia.Converter.Special;
5 |
/// <summary>
/// Parses navigation notes.
/// </summary>
public class NavigationParser
{
    /// <summary>
    /// Converts a navigation note in a section into a <see cref="NavSuggestionsItem"/>.
    /// </summary>
    /// <param name="element">Element holding the navigation note.</param>
    /// <returns>
    /// An item constructed from the text extractor after it has processed
    /// <paramref name="element"/> with newline collapsing enabled.
    /// </returns>
    public static NavSuggestionsItem ConvertNavigationNote(HtmlElement element)
    {
        TextExtractor extractor = new() { ShouldCollapseNewlines = true };
        extractor.Extract(element);

        NavSuggestionsItem suggestion = new(extractor);
        return suggestion;
    }
}
--------------------------------------------------------------------------------
/Gemipedia/API/Models/ArticleSummary.cs:
--------------------------------------------------------------------------------
1 | using System;
2 |
3 | namespace Gemipedia.API.Models;
4 |
/// <summary>
/// Summary information describing a single article.
/// </summary>
public class ArticleSummary
{
    public string Title { get; set; }

    public long PageId { get; set; }

    public string Description { get; set; }

    public string ThumbnailUrl { get; set; }

    /// <summary>
    /// Distance in meters from where you were searching. Defaults to -1.
    /// </summary>
    public int Distance { get; set; } = -1;

    /// <summary>
    /// Only used when looking for the same article on other Wikipedias.
    /// </summary>
    public string LanguageCode { get; set; }

    /// <summary>
    /// Snippet of text where the search term was found. Usually less helpful than the description.
    /// </summary>
    public string Excerpt { get; set; }

    /// <summary>
    /// True when <see cref="SummaryText"/> is neither null nor empty.
    /// </summary>
    public bool HasSummary
    {
        get { return string.IsNullOrEmpty(SummaryText) == false; }
    }

    /// <summary>
    /// The description when it has content; otherwise the excerpt (which may itself be null or empty).
    /// </summary>
    public string SummaryText
    {
        get
        {
            if (string.IsNullOrEmpty(Description))
            {
                return Excerpt;
            }
            return Description;
        }
    }
}
33 |
--------------------------------------------------------------------------------
/Test Cases.txt:
--------------------------------------------------------------------------------
1 | Area of a circle
2 | Math in MathML
3 | Math in texhtml
4 | SUP
5 | SUB
6 |
7 | Broadway theatre
8 | image map
9 |
10 | Chip 'n Dale: Rescue Rangers (film)
11 | Montage images (also WW II)
12 |
13 | Cohomology
14 | Math (in DIVs not spans)
15 | SUP
16 |
17 | "Hello, World!" program
18 | PRE tags
19 |
20 | iPad
21 | Multiple navigation suggestions
22 | Timeline
23 |
24 |
25 | Minor League Baseball
26 | Image with overlays
27 | tables
28 | tables for layout
29 | nested lists
30 |
31 | Pablo Picasso
32 | Galleries
33 |
34 | Physical constant
35 | SUB that doesn't directly translate
36 |
next to to be avoided
37 |
38 | Schitt's Creek
39 | table with colspans and rowspans
40 |
41 | Signal (software)
42 | nested tables in infobox
43 | montage with single caption
44 | links to wikidata
45 |
46 | Unicode subscripts and superscripts
47 | nested tables
48 |
49 | List of Wikipedias
50 | Has table with wide unicode characters
51 |
--------------------------------------------------------------------------------
/Gemipedia/Renderer/SimpleBuffer.cs:
--------------------------------------------------------------------------------
1 | using System.Text;
2 |
3 | namespace Gemipedia.Renderer;
4 |
/// <summary>
/// Thin wrapper around a <see cref="StringBuilder"/> that tracks whether the
/// accumulated text currently ends at the start of a new line.
/// </summary>
public class SimpleBuffer
{
    private readonly StringBuilder sb;

    /// <summary>The text accumulated so far.</summary>
    public string Content => sb.ToString();

    /// <summary>True when at least one character has been appended.</summary>
    public bool HasContent => (sb.Length > 0);

    /// <summary>
    /// True when the buffer is empty or its last character is a line feed.
    /// </summary>
    // Inspect the last character directly instead of materializing the whole
    // buffer as a string on every check.
    public bool AtLineStart
        => sb.Length == 0 || sb[sb.Length - 1] == '\n';

    public SimpleBuffer()
    {
        sb = new StringBuilder();
    }

    /// <summary>Discards all accumulated text.</summary>
    public void Reset()
        => sb.Clear();

    /// <summary>Appends <paramref name="s"/> without a line terminator.</summary>
    public void Append(string s)
        => sb.Append(s);

    /// <summary>Appends <paramref name="s"/> followed by the default line terminator.</summary>
    public void AppendLine(string s = "")
        => sb.AppendLine(s);

    /// <summary>
    /// Inserts <paramref name="s"/> followed by the default line terminator
    /// in front of the existing content.
    /// </summary>
    public void PrependLine(string s = "")
    {
        // Insert the terminator first, then the text before it, so the result
        // is "<s><newline><existing>" without copying the buffer to a string.
        sb.Insert(0, System.Environment.NewLine);
        sb.Insert(0, s);
    }

    /// <summary>
    /// Appends a line terminator unless the buffer is already at a line start.
    /// </summary>
    public void EnsureAtLineStart()
    {
        if (!AtLineStart)
        {
            sb.AppendLine();
        }
    }
}
--------------------------------------------------------------------------------
/Gemipedia/Renderer/GalleryRenderer.cs:
--------------------------------------------------------------------------------
1 | using System.IO;
2 | using Gemipedia.Models;
3 |
4 | namespace Gemipedia.Renderer;
5 |
/// <summary>
/// Writes the image gallery page for a parsed article.
/// </summary>
public class GalleryRenderer
{
    TextWriter Writer;
    ParsedPage Page;

    /// <summary>
    /// Writes a heading, a link back to the article, a blank line, and then
    /// link lines for every item returned by <c>parsedPage.GetAllImages()</c>.
    /// Video items produce two lines (still image and source video); all other
    /// items produce one.
    /// </summary>
    /// <param name="parsedPage">Page whose images are listed.</param>
    /// <param name="writer">Destination for the generated lines.</param>
    public void RenderGallery(ParsedPage parsedPage, TextWriter writer)
    {
        Writer = writer;
        Page = parsedPage;

        Writer.WriteLine($"# Image Gallery: {Page.Title}");
        Writer.WriteLine($"=> {RouteOptions.ArticleUrl(Page.Title)} Back to article");
        Writer.WriteLine();

        foreach (var item in Page.GetAllImages())
        {
            if (item is not VideoItem clip)
            {
                Writer.WriteLine($"=> {item.Url} {item.Caption}");
                continue;
            }

            Writer.WriteLine($"=> {clip.Url} Video Still: {clip.Caption}");
            Writer.WriteLine($"=> {clip.VideoUrl} Source Video: {clip.VideoDescription}");
        }
    }
}
33 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Acidus
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Gemipedia/Converter/Special/WikiTableConverter.cs:
--------------------------------------------------------------------------------
1 | using AngleSharp.Html.Dom;
2 | using Gemipedia.Converter.Special.Tables;
3 | using Gemipedia.Models;
4 |
5 | namespace Gemipedia.Converter.Special;
6 |
public static class WikiTableConverter
{
    /// <summary>
    /// Converts a data table.
    /// </summary>
    /// <param name="element">The table element to convert.</param>
    /// <returns>
    /// The timeline media item when <c>MediaParser.ConvertTimelineInTable</c>
    /// finds one; otherwise a <see cref="ContentItem"/> holding the rendered
    /// table and the links collected while parsing it, or <c>null</c> when the
    /// rendered table is empty.
    /// </returns>
    public static SectionItem ConvertWikiTable(HtmlElement element)
    {
        // A timeline inside the table takes precedence over table rendering.
        var timeline = MediaParser.ConvertTimelineInTable(element);
        if (timeline != null)
        {
            return timeline;
        }

        var parser = new TableParser();
        var rendered = TableRenderer.RenderTable(parser.ParseTable(element));

        if (!(rendered.Length > 0))
        {
            return null;
        }

        var item = new ContentItem
        {
            Content = rendered,
            Links = parser.Links
        };
        return item;
    }
}
39 |
--------------------------------------------------------------------------------
/Gemipedia/UserOptions.cs:
--------------------------------------------------------------------------------
1 | namespace Gemipedia;
2 |
/// <summary>
/// Process-wide options selecting which Wikipedia is used and which
/// sections receive special treatment.
/// </summary>
public static class UserOptions
{
    /// <summary>
    /// Set which version of Wikipedia we should use. Defaults to EN
    /// </summary>
    public static string WikipediaVersion { get; set; } = "en";

    /// <summary>
    /// Display name for <see cref="WikipediaVersion"/>, as produced by
    /// <c>LanguageUtils.GetName</c>.
    /// </summary>
    public static string LanguageName => LanguageUtils.GetName(WikipediaVersion);

    /// <summary>
    /// Misspelled original name of <see cref="LanguageName"/>; kept so that
    /// existing callers continue to compile.
    /// </summary>
    public static string LangaugeName => LanguageName;

    //these will depend on the language
    /// <summary>Section names listed as excluded for the current language.</summary>
    public static string[] ExcludedSections
        => GetExcludedSections(WikipediaVersion);

    /// <summary>Section names listed as article-link sections for the current language.</summary>
    public static string[] ArticleLinkSections
        => GetArticleLinkSections(WikipediaVersion);

    // Only a default list exists today; the switch is the extension point for
    // per-language lists. A fresh array is returned on every call.
    private static string[] GetExcludedSections(string language)
        => language switch
        {
            _ => new string[] { "bibliography", "citations", "external_links", "notes", "references", "further_reading" },
        };

    // Only a default list exists today; see GetExcludedSections.
    private static string[] GetArticleLinkSections(string language)
        => language switch
        {
            _ => new string[] { "see also" },
        };
}
--------------------------------------------------------------------------------
/Gemipedia/Converter/Special/MathConverter.cs:
--------------------------------------------------------------------------------
1 | using AngleSharp.Html.Dom;
2 |
3 | namespace Gemipedia.Converter.Special;
4 |
public static class MathConverter
{
    /// <summary>
    /// Attempts to convert an inline Math element into a linkable image.
    /// Math formulas are in SVG, so link to our converter.
    /// </summary>
    /// <param name="element">Element expected to contain an <c>img</c> child.</param>
    /// <returns>
    /// A link line pointing at the proxied PNG with the cleaned <c>alt</c> text
    /// as its label, or an empty string when the image, its <c>src</c>, or its
    /// <c>alt</c> text is missing or empty.
    /// </returns>
    public static string ConvertMath(HtmlElement element)
    {
        var img = element.QuerySelector("img");
        var url = img?.GetAttribute("src") ?? "";
        // An <img> without an alt attribute yields null here, so the chain must
        // stay null-conditional to avoid a NullReferenceException on Trim().
        var caption = img?.GetAttribute("alt")?.Trim().Replace("\n", "") ?? "";

        if (url.Length > 0 && caption.Length > 0)
        {
            //not a media item, since it shouldn't be moved
            return $"=> {RouteOptions.MediaProxyUrl(MathSvgUrlAsPng(url))} Math Formula: {CleanLatex(caption)}";
        }
        return "";
    }

    //wikipedia has direct PNG versions of the SVG math images
    private static string MathSvgUrlAsPng(string url)
        => url.Replace("/svg/", "/png/");

    // Drops the "\displaystyle " prefix command from the caption text.
    private static string CleanLatex(string latex)
        => latex.Replace(@"\displaystyle ", "");
}
35 |
--------------------------------------------------------------------------------
/TODOs.txt:
--------------------------------------------------------------------------------
1 | Features:
2 |
3 | - requestor only works on happy path. no HTTP/connection/DNS error handling
4 | - stream media bytes directly to client instead of current "store and forward"
5 | - Definitions via Wiktionary: https://en.wiktionary.org/api/rest_v1/#/Page%20content/get_page_definition__term_
6 | - Landing page:
7 | - Random articles in topics?
8 |
9 | Better Rendering:
10 | - don't add links to list unless it's a "See Also"
11 | - add "related articles" into content (separate page?)
12 | - Better selection of default video type (lots of devices can't do ogg, etc)
13 | - table horizontal lines don't have "+" in right place for colspans
14 | - Article name is wrong (IPad instead of iPad). This is because of the API I am using
15 |
16 | Rendering bugs:
17 | - Empty list entries (Karl Marx infobox)
18 | - tables nested inside tables (Vietnam administrative districts)
19 |
20 | "Sources" section not removed
21 | - "apollo" article
22 |
23 | Blockquote bug:
24 | "Hercule Poirot" in "Appearance and proclivities"
25 |
26 | Infobox:
27 | - handle places that use single row tables as multi columns (band album chronologies use this to show prev, current and next album)
28 |
29 | Eve Jobs
30 | -> photo seems small
31 |
32 |
33 |
34 |
35 | https://www.mediawiki.org/wiki/Specs/HTML/2.7.0#Wiki_links
--------------------------------------------------------------------------------
/Gemipedia/Models/InfoboxItem.cs:
--------------------------------------------------------------------------------
1 | using System.Collections.Generic;
2 | using System.Linq;
3 |
4 | namespace Gemipedia.Models;
5 |
/// <summary>
/// A section item that groups other section items and aggregates their article links.
/// </summary>
public class InfoboxItem : SectionItem, IArticleLinks
{
    public string CustomTitle { get; set; } = "";

    /// <summary>
    /// Links gathered from every added item that exposes links, except navigation suggestions.
    /// </summary>
    public ArticleLinkCollection Links { get; private set; } = new ArticleLinkCollection();

    /// <summary>Added items that are <see cref="ContentItem"/>s, in insertion order.</summary>
    public IEnumerable<ContentItem> ContentItems
        => Items.OfType<ContentItem>();

    /// <summary>Added items that are <see cref="GeoItem"/>s, in insertion order.</summary>
    public IEnumerable<GeoItem> GeoItems
        => Items.OfType<GeoItem>();

    /// <summary>Added items that are <see cref="MediaItem"/>s, in insertion order.</summary>
    public IEnumerable<MediaItem> MediaItems
        => Items.OfType<MediaItem>();

    /// <summary>Added items that are <see cref="NavSuggestionsItem"/>s, in insertion order.</summary>
    public IEnumerable<NavSuggestionsItem> NavSuggestions
        => Items.OfType<NavSuggestionsItem>();

    private readonly List<SectionItem> Items = new List<SectionItem>();

    //force processing
    /// <summary>
    /// Fully enumerates <paramref name="items"/> and then adds each one via <see cref="AddItem"/>.
    /// </summary>
    public void AddItems(IEnumerable<SectionItem> items)
        => items.ToList().ForEach(AddItem);

    /// <summary>
    /// Adds a single item; null items are ignored.
    /// </summary>
    /// <param name="item">Item to add, may be null.</param>
    public void AddItem(SectionItem item)
    {
        if (item == null)
        {
            return;
        }

        // Navigation suggestions carry links too, but those are not merged
        // into this infobox's link collection.
        if (item is IArticleLinks linked && linked.Links != null && item is not NavSuggestionsItem)
        {
            Links.Add(linked.Links);
        }
        Items.Add(item);
    }
}
--------------------------------------------------------------------------------
/Gemipedia/Renderer/ContentRenderer.cs:
--------------------------------------------------------------------------------
1 | using Gemipedia.Models;
2 |
3 | namespace Gemipedia.Renderer;
4 |
/// <summary>
/// Helpers that append link lines for geo, media and navigation items to a buffer.
/// </summary>
public static class ContentRenderer
{
    /// <summary>
    /// Appends a single link line for <paramref name="geo"/>, starting on a fresh line.
    /// </summary>
    public static void RenderGeo(SimpleBuffer buffer, GeoItem geo)
    {
        buffer.EnsureAtLineStart();
        var line = $"=> {geo.Url} 🌍 {geo.Title}";
        buffer.AppendLine(line);
    }

    /// <summary>
    /// Appends link lines for <paramref name="media"/>, starting on a fresh line.
    /// Video items produce two lines (still image and source video); any other
    /// media item produces one.
    /// </summary>
    public static void RenderMedia(SimpleBuffer buffer, MediaItem media)
    {
        buffer.EnsureAtLineStart();

        if (media is not VideoItem clip)
        {
            buffer.AppendLine($"=> {media.Url} {media.Caption}");
            return;
        }

        buffer.AppendLine($"=> {clip.Url} Video Still: {clip.Caption}");
        buffer.AppendLine($"=> {clip.VideoUrl} Source Video: {clip.VideoDescription}");
    }

    /// <summary>
    /// Appends a navigation suggestion, starting on a fresh line. With exactly
    /// one link the suggestion text itself becomes the link label; otherwise the
    /// text is written in parentheses followed by one link line per target.
    /// </summary>
    public static void RenderNavSuggestion(SimpleBuffer buffer, NavSuggestionsItem nav)
    {
        var targets = nav.Links.GetLinks();
        buffer.EnsureAtLineStart();

        if (targets.Count == 1)
        {
            buffer.AppendLine($"=> {RouteOptions.ArticleUrl(targets[0])} {nav.Content}");
            return;
        }

        buffer.AppendLine($"({nav.Content})");
        foreach (var title in targets)
        {
            buffer.AppendLine($"=> {RouteOptions.ArticleUrl(title)} {title}");
        }
    }
}
48 |
--------------------------------------------------------------------------------
/Gemipedia/LanguageUtils.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Globalization;
3 |
4 | namespace Gemipedia;
5 |
/// <summary>
/// Helpers for validating and naming Wikipedia language codes.
/// </summary>
public static class LanguageUtils
{
    public static readonly string[] CommonLanguages = new string[] { "simple", "ar", "bg", "ca", "ce", "cs", "da", "nl", "en", "eo", "fi", "fr", "de", "he", "hu", "id", "it", "ja", "ko", "ms", "zh", "no", "ga", "pl", "pt", "ro", "ru", "sr", "sh", "es", "sv", "tr", "uk", "vi" };

    /// <summary>
    /// Gets a display name for a language code.
    /// </summary>
    /// <param name="langCode">Language code; may be null.</param>
    /// <returns>
    /// "Simple English" for the special "simple" code; the culture's native
    /// name (followed by its display name in parentheses when the two differ)
    /// when a culture can be created from the code; otherwise the code wrapped
    /// in single quotes.
    /// </returns>
    public static string GetName(string langCode)
    {
        // Special logic for Simple English Wikipedia.
        if (IsSimpleEnglish(langCode))
        {
            return "Simple English";
        }

        try
        {
            var ci = new CultureInfo(langCode);
            return ci.NativeName == ci.DisplayName ?
                ci.NativeName :
                $"{ci.NativeName} ({ci.DisplayName})";
        }
        catch (ArgumentException)
        {
            // Null or unknown culture name (ArgumentNullException and
            // CultureNotFoundException both derive from ArgumentException):
            // fall back to the quoted code below.
        }
        return $"'{langCode}'";
    }

    /// <summary>
    /// Checks whether a language code is usable.
    /// </summary>
    /// <param name="langCode">Language code; may be null.</param>
    /// <returns>
    /// True for the special "simple" code, or when a culture can be created
    /// from the code and its display name differs from the code itself;
    /// false otherwise, including for null.
    /// </returns>
    public static bool IsValidCode(string langCode)
    {
        // Special logic for Simple English Wikipedia.
        if (IsSimpleEnglish(langCode))
        {
            return true;
        }

        try
        {
            var ci = new CultureInfo(langCode);
            return ci.DisplayName != langCode;
        }
        catch (ArgumentException)
        {
            // Null or unknown culture name: not a valid code.
        }
        return false;
    }

    // Ordinal, case-insensitive comparison: independent of the current culture
    // (e.g. Turkish casing rules) and safe for a null code.
    private static bool IsSimpleEnglish(string langCode)
        => string.Equals(langCode, "simple", StringComparison.OrdinalIgnoreCase);
}
--------------------------------------------------------------------------------
/Gemipedia/Gemipedia.csproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | net8.0
5 |
6 |
7 |
8 | False
9 | None
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
--------------------------------------------------------------------------------
/Gemipedia/CommonUtils.cs:
--------------------------------------------------------------------------------
1 | using System.Linq;
2 | using System.Text.RegularExpressions;
3 | using AngleSharp.Dom;
4 |
5 | namespace Gemipedia;
6 |
/// <summary>
/// Small text and URL helpers shared across the converter.
/// </summary>
public static class CommonUtils
{
    // Splits a srcset value into candidates: group 1 captures the URL and
    // group 2 the optional descriptor (with its leading whitespace).
    // Built once instead of on every call.
    private static readonly Regex SrcsetParser = new Regex(@"(\S*[^,\s])(\s+([\d.]+)(x|w))?");

    // Descriptors tried by GetImageUrl, in order of preference.
    private static readonly string[] PreferredSizes = { "2x", "1.5x" };

    /// <summary>
    /// Trims the text and removes every line feed character from it.
    /// </summary>
    public static string PrepareTextContent(string s)
        => s.Trim().Replace("\n", "");

    /// <summary>
    /// Gets a properly formatted image URL from an IMG object
    /// </summary>
    /// <param name="img">Image element; may be null.</param>
    /// <returns>
    /// The "2x" srcset candidate if present, else the "1.5x" candidate, else
    /// the <c>src</c> attribute, passed through <see cref="EnsureHttps"/>.
    /// Null when <paramref name="img"/> is null or no URL is available.
    /// </returns>
    public static string GetImageUrl(IElement img)
    {
        var srcset = img?.GetAttribute("srcset") ?? "";

        foreach (var size in PreferredSizes)
        {
            var candidate = GetImageFromSrcset(srcset, size);
            if (!string.IsNullOrEmpty(candidate))
            {
                return EnsureHttps(candidate);
            }
        }

        // img is guarded here as well, matching the srcset lookup above.
        return EnsureHttps(img?.GetAttribute("src"));
    }

    /// <summary>
    /// Makes sure a URL uses the https scheme.
    /// </summary>
    /// <param name="url">URL to normalize; may be null.</param>
    /// <returns>
    /// Null for null input; the URL unchanged when it already starts with
    /// "https:"; the URL with "http:" replaced by "https:" when it starts with
    /// "http:"; otherwise the URL prefixed with "https:".
    /// </returns>
    public static string EnsureHttps(string url)
    {
        if (url == null || url.StartsWith("https:", System.StringComparison.Ordinal))
        {
            return url;
        }

        // Upgrade plain http instead of producing "https:http://...".
        if (url.StartsWith("http:", System.StringComparison.Ordinal))
        {
            return "https:" + url.Substring("http:".Length);
        }

        return "https:" + url;
    }

    // Returns the URL of the first srcset candidate whose descriptor equals
    // size (e.g. "2x"), or null when srcset is empty or nothing matches.
    private static string GetImageFromSrcset(string srcset, string size)
    {
        if (string.IsNullOrEmpty(srcset))
        {
            return null;
        }

        return SrcsetParser.Matches(srcset)
            .Where(m => m.Success && m.Groups[2].Value.Trim() == size)
            .Select(m => m.Groups[1].Value)
            .FirstOrDefault();
    }
}
55 |
--------------------------------------------------------------------------------
/Gemipedia/Media/MediaProcessor.cs:
--------------------------------------------------------------------------------
1 | using ImageMagick;
2 |
3 | namespace Gemipedia.Media;
4 |
/// <summary>
/// Reformats media from Wikipedia to better suit Gemini clients
/// </summary>
public static class MediaProcessor
{
    /// <summary>
    /// Loads the image bytes and decides how to hand them on: SVG input is
    /// re-encoded as PNG, non-opaque images get a white background with alpha
    /// removed, and everything else is passed through untouched.
    /// </summary>
    /// <param name="data">Raw image bytes.</param>
    /// <returns>The resulting bytes together with their MIME type.</returns>
    public static MediaContent ProcessImage(byte[] data)
    {
        using var image = new MagickImage(data);

        if (image.Format == MagickFormat.Svg)
        {
            //convert it to PNG
            image.Format = MagickFormat.Png;
            return ToContent(image);
        }

        if (image.IsOpaque)
        {
            //nothing needed (e.g. JPG, etc) so pass the original bytes through
            return new MediaContent
            {
                Data = data,
                MimeType = GetMime(image)
            };
        }

        //add a white background to transparent images to
        //make them visible on clients with a dark theme
        image.BackgroundColor = new MagickColor("white");
        image.Alpha(AlphaOption.Remove);
        return ToContent(image);
    }

    // MIME type reported for the image's format; "image/png" when none is available.
    private static string GetMime(MagickImage image)
        => MagickFormatInfo.Create(image.Format)?.MimeType ?? "image/png";

    // Serializes the (possibly modified) image and pairs it with its MIME type.
    private static MediaContent ToContent(MagickImage image)
    {
        var bytes = image.ToByteArray();
        return new MediaContent
        {
            Data = bytes,
            MimeType = GetMime(image)
        };
    }
}
54 |
--------------------------------------------------------------------------------
/Gemipedia/Converter/Special/GeoParser.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using AngleSharp.Dom;
3 | using Gemipedia.Models;
4 |
5 | namespace Gemipedia.Converter.Special;
6 |
/// <summary>
/// Detects links to the GeoHack server and converts them into <see cref="GeoItem"/>s.
/// </summary>
public static class GeoParser
{
    public const string GeohackHostname = "geohack.toolforge.org";

    /// <summary>
    /// Is this anchor an external link pointing at the GeoHack server?
    /// </summary>
    /// <param name="anchor">Anchor element to inspect.</param>
    /// <returns>
    /// False unless the anchor's <c>class</c> attribute contains "external"
    /// and its <c>href</c> passes <see cref="IsGeohackUrl"/>.
    /// </returns>
    public static bool IsGeoLink(IElement anchor)
    {
        //only external links can be a link to geohack.
        //This fast-fails so we don't parse a bunch of relative, local, URLs
        if (!(anchor.GetAttribute("class")?.Contains("external") ?? false))
        {
            return false;
        }
        return IsGeohackUrl(anchor.GetAttribute("href"));
    }

    /// <summary>
    /// Is this url a valid link to the Wikipedia Geohack server?
    /// </summary>
    /// <param name="url">URL text; may be null.</param>
    /// <returns>
    /// True when <paramref name="url"/> parses as an absolute URI whose host
    /// equals <see cref="GeohackHostname"/>; false otherwise.
    /// </returns>
    public static bool IsGeohackUrl(string? url)
    {
        if (url == null)
        {
            return false;
        }

        // TryCreate reports malformed or relative input through its return
        // value, so no exception handling is needed for the expected failures.
        return Uri.TryCreate(url, UriKind.Absolute, out Uri? parsedUrl)
            && parsedUrl.Host == GeohackHostname;
    }

    /// <summary>
    /// Builds a <see cref="GeoItem"/> from a GeoHack anchor.
    /// </summary>
    /// <param name="anchor">Anchor whose <c>href</c> points at GeoHack.</param>
    /// <returns>
    /// A geo item titled with the pretty name and coordinates reported by the
    /// GeoHack parser, or null when that parser does not consider the URL valid.
    /// </returns>
    public static GeoItem ParseGeo(IElement anchor)
    {
        string url = anchor.GetAttribute("href");
        url = CommonUtils.EnsureHttps(url);

        GeohackParser geohack = new GeohackParser(url);
        if (geohack.IsValid)
        {
            return new GeoItem
            {
                Title = $"View Geographic Info: {geohack.GetPrettyName()} ({geohack.Coordinates})",
                Url = RouteOptions.GeoUrl(geohack.GeohackUrl)
            };
        }
        return null;
    }
}
62 |
63 |
--------------------------------------------------------------------------------
/Gemipedia/Models/Section.cs:
--------------------------------------------------------------------------------
1 | using System.Collections.Generic;
2 | using System.Diagnostics;
3 | using System.Linq;
4 | using AngleSharp.Dom;
5 |
6 | namespace Gemipedia.Models;
7 |
/// <summary>
/// One section of an article: its pending DOM nodes, the items converted from
/// them (sorted into buckets by kind), and any nested sub-sections.
/// </summary>
[DebuggerDisplay("Section '{Title}'")]
public class Section : IArticleLinks
{
    // NOTE(review): the element type was lost in this copy of the file; INode
    // (from AngleSharp.Dom, which this file imports) is assumed - confirm.
    public List<INode> Nodes = new List<INode>();

    /// <summary>
    /// Links gathered from every added item that exposes links, except navigation suggestions.
    /// </summary>
    public ArticleLinkCollection Links { get; private set; } = new ArticleLinkCollection();

    public bool HasSubSections => (SubSections.Count > 0);

    //infoboxes
    public List<InfoboxItem> Infoboxes = new List<InfoboxItem>();

    //geographic items
    public List<GeoItem> GeoItems = new List<GeoItem>();

    //content and images
    public List<SectionItem> GeneralContent = new List<SectionItem>();

    public bool HasNavSuggestions
        => NavSuggestions.Count > 0;

    public List<NavSuggestionsItem> NavSuggestions = new List<NavSuggestionsItem>();

    public List<Section> SubSections { get; set; } = new List<Section>();

    //force processing
    /// <summary>
    /// Fully enumerates <paramref name="items"/> and then files each one into
    /// the matching bucket.
    /// </summary>
    public void AddItems(IEnumerable<SectionItem> items)
        => items.ToList().ForEach(AddItem);

    // Files one item into its bucket and merges its links. Null items and
    // null link collections are ignored, matching InfoboxItem.AddItem.
    private void AddItem(SectionItem item)
    {
        if (item == null)
        {
            return;
        }

        // Navigation suggestions carry links too, but those are not merged
        // into the section's link collection.
        if (item is IArticleLinks linked && linked.Links != null && item is not NavSuggestionsItem)
        {
            Links.Add(linked.Links);
        }

        switch (item)
        {
            case InfoboxItem infobox:
                Infoboxes.Add(infobox);
                break;
            case NavSuggestionsItem nav:
                NavSuggestions.Add(nav);
                break;
            case GeoItem geo:
                GeoItems.Add(geo);
                break;
            default:
                GeneralContent.Add(item);
                break;
        }
    }

    //special sections don't have titles. the intro section is a special section
    public bool IsSpecial { get; set; } = false;

    public int SectionDepth { get; set; }
    public string Title { get; set; }
}
--------------------------------------------------------------------------------
/Gemipedia/Converter/Buffer.cs:
--------------------------------------------------------------------------------
1 | using System.Text;
2 | using Gemipedia.Models;
3 |
4 | namespace Gemipedia.Converter;
5 |
/// <summary>
/// Text accumulator used during conversion. Besides the text it tracks the
/// article links of appended content, an optional one-shot line prefix, and
/// whether blockquote markers should be emitted.
/// </summary>
public class Buffer : ITextContent
{
    /// <summary>Links merged in from appended <see cref="ITextContent"/> values.</summary>
    public ArticleLinkCollection Links { get; private set; }

    /// <summary>The text accumulated so far.</summary>
    public string Content => sb.ToString();

    public bool HasContent => (sb.Length > 0);

    /// <summary>
    /// True when the buffer is empty or its last character is a line feed.
    /// </summary>
    // Inspect the last character directly instead of materializing the whole
    // buffer as a string on every check.
    public bool AtLineStart
        => sb.Length == 0 || sb[sb.Length - 1] == '\n';

    /// <summary>
    /// When true, ">" is written before non-blank text that starts a line.
    /// </summary>
    public bool InBlockquote { get; set; } = false;

    private readonly StringBuilder sb;

    // Pending prefix written once before the next string append, then cleared.
    private string lineStart = null;

    public Buffer()
    {
        sb = new StringBuilder();
        Links = new ArticleLinkCollection();
    }

    /// <summary>Clears the text, the collected links, and any pending line prefix.</summary>
    public void Reset()
    {
        sb.Clear();
        Links = new ArticleLinkCollection();
        lineStart = null;
    }

    /// <summary>Sets the prefix to be written before the next string append.</summary>
    public void SetLineStart(string s)
    {
        lineStart = s;
    }

    /// <summary>
    /// Appends the content of <paramref name="textContent"/> verbatim (no
    /// prefix or blockquote handling) and merges its links.
    /// </summary>
    public void Append(ITextContent textContent)
    {
        //start consume the data
        sb.Append(textContent.Content);
        Links.Add(textContent.Links);
    }

    /// <summary>
    /// Appends <paramref name="s"/> after emitting any pending line prefix and,
    /// if applicable, a blockquote marker.
    /// </summary>
    public void Append(string s)
    {
        HandleLineStart(s);
        HandleBlockQuote(s);
        sb.Append(s);
    }

    /// <summary>
    /// Same as <see cref="Append(string)"/>, followed by the default line terminator.
    /// </summary>
    public void AppendLine(string s = "")
    {
        HandleLineStart(s);
        HandleBlockQuote(s);
        sb.AppendLine(s);
    }

    /// <summary>
    /// Appends a line terminator unless the buffer is already at a line start.
    /// A pending line prefix is dropped when the buffer is already at a line start.
    /// </summary>
    public void EnsureAtLineStart()
    {
        if (AtLineStart && lineStart != null)
        {
            lineStart = null;
        }

        if (!AtLineStart)
        {
            sb.AppendLine();
        }
    }

    /// <summary>
    /// Writes the pending line prefix, if any, and clears it.
    /// </summary>
    /// <param name="s">
    /// The text about to be appended. It is not inspected: the prefix is
    /// written regardless of what <paramref name="s"/> contains.
    /// </param>
    public void HandleLineStart(string s)
    {
        if (lineStart != null)
        {
            sb.Append(lineStart);
            lineStart = null;
        }
    }

    // Emits the ">" marker when in a blockquote, at the start of a line, and
    // the text about to be written is not blank.
    private void HandleBlockQuote(string s)
    {
        if (InBlockquote && AtLineStart && s.Trim().Length > 0)
        {
            sb.Append(">");
        }
    }
}
93 |
--------------------------------------------------------------------------------
/Gemipedia/Renderer/GeoRenderer.cs:
--------------------------------------------------------------------------------
1 | using System.IO;
2 | using Gemipedia.Converter.Special;
3 |
4 | namespace Gemipedia.Renderer;
5 |
/// <summary>
/// Writes the geographic information page for a parsed GeoHack link.
/// </summary>
public class GeoRenderer
{
    TextWriter Writer;

    /// <summary>
    /// Writes the place summary, its coordinates, mapping links (only when the
    /// location is on Earth) and extra links to <paramref name="writer"/>.
    /// </summary>
    /// <param name="geohack">Parsed GeoHack data to render.</param>
    /// <param name="writer">Destination for the generated lines.</param>
    public void RenderGeo(GeohackParser geohack, TextWriter writer)
    {
        Writer = writer;

        Writer.WriteLine($"# Geographic Info for {geohack.GetPrettyName()}");
        Writer.WriteLine($"=> {RouteOptions.ArticleUrl(geohack.ArticleName)} Back to article");
        Writer.WriteLine();
        Writer.WriteLine($"Place: {geohack.GetPrettyName()}");
        if (!geohack.IsEarth)
        {
            Writer.WriteLine($"Globe: {geohack.Globe}");
        }
        if (geohack.HasTypeDescription)
        {
            Writer.WriteLine($"Type: {geohack.GetTypeDescription()}");
        }
        Writer.WriteLine($"Coordinates:");
        Writer.WriteLine($"* Latitude: {FormatCoordinate(geohack.Latitude)}");
        Writer.WriteLine($"* Longitude: {FormatCoordinate(geohack.Longitude)}");
        Writer.WriteLine();

        if (geohack.IsEarth)
        {
            Writer.WriteLine("## Mapping");
            Writer.WriteLine($"=> {OpenStreetMapUrl(geohack)} Open in OpenStreetMaps.org");
            Writer.WriteLine($"=> {AppleMapsUrl(geohack)} Open in Apple Maps app");
            Writer.WriteLine($"=> {GeoUrl(geohack)} Open in default Android Maps app (uses geo: URI)");
            Writer.WriteLine();
        }

        Writer.WriteLine("## Extras");
        Writer.WriteLine($"=> {geohack.GeohackUrl} Open in GeoHack Launcher");
        Writer.WriteLine($"=> {RouteOptions.LonLatUrl(geohack.Latitude, geohack.Longitude, geohack.ArticleName)} Search for nearby articles");
    }

    // Up to four decimals with an invariant "." separator. The "0" placeholder
    // keeps a leading zero, so 0 renders as "0" rather than an empty string and
    // 0.5 as "0.5" rather than ".5".
    // NOTE(review): assumes the coordinate is a built-in numeric type that
    // accepts (format, provider) - confirm against GeohackParser.
    private static string FormatCoordinate(System.IFormattable value)
        => value.ToString("0.####", System.Globalization.CultureInfo.InvariantCulture);

    // The URLs below embed coordinates, so they are formatted with the
    // invariant culture: a culture using "," as decimal separator would
    // otherwise corrupt the "lat,lon" pair.
    private string AppleMapsUrl(GeohackParser geohack)
        => System.FormattableString.Invariant($"https://maps.apple.com/?q={geohack.Latitude},{geohack.Longitude}&t=m");

    private string GeoUrl(GeohackParser geohack)
        => System.FormattableString.Invariant($"geo:{geohack.Latitude},{geohack.Longitude}?z=5");

    private string OpenStreetMapUrl(GeohackParser geohack)
        => System.FormattableString.Invariant($"https://www.openstreetmap.org/?mlat={geohack.Latitude}&mlon={geohack.Longitude}&zoom=15");
}
--------------------------------------------------------------------------------
/Gemipedia/Converter/WikiHtmlConverter.cs:
--------------------------------------------------------------------------------
1 | using System.Collections.Generic;
2 | using System.Diagnostics;
3 | using Gemipedia.Converter.Filter;
4 | using Gemipedia.Models;
5 |
6 | namespace Gemipedia.Converter;
7 |
/// <summary>
/// Takes HTML generated by media wiki, and converts it into GemText
/// </summary>
public class WikiHtmlConverter
{
    /// <summary>
    /// Milliseconds measured by the conversion stopwatch. The stopwatch is
    /// started (not reset) by each <see cref="Convert"/> call, so the value
    /// accumulates across calls on the same instance.
    /// </summary>
    public long ConvertTimeMs
        => convertTimer.ElapsedMilliseconds;

    private readonly Stopwatch convertTimer;

    public WikiHtmlConverter()
    {
        convertTimer = new Stopwatch();
        LoadDomFilters();
    }

    // Replaces the global DOM filter with a fresh one holding the rules below.
    private void LoadDomFilters()
    {
        DomFilter.Global = new DomFilter();
        //locmaps have overlays we can't display
        DomFilter.Global.AddRule("div.locmap");
        //if its not for mobile don't display
        DomFilter.Global.AddRule(".nomobile");
        //side category and meta index tables
        DomFilter.Global.AddRule("table.sidebar");
        DomFilter.Global.AddRule("table.navbox-vertical");
        //dialogs at top that say something is wrong with the article
        DomFilter.Global.AddRule(".metadata");
        DomFilter.Global.AddRule("div.navbox");
        //geo
        DomFilter.Global.AddRule("span#coordinates");
        //hidden content
        DomFilter.Global.AddRule("div.mw-collapsed");
    }

    /// <summary>
    /// Prepares the HTML, splits it into sections, and converts every section
    /// (including nested sub-sections) into section items.
    /// </summary>
    /// <param name="title">Article title handed to the sectionizer.</param>
    /// <param name="wikiHtml">Raw article HTML.</param>
    /// <returns>The parsed page with its sections converted.</returns>
    public ParsedPage Convert(string title, string wikiHtml)
    {
        convertTimer.Start();
        try
        {
            var contentRoot = Preparer.PrepareHtml(wikiHtml);

            Sectionizer sectionizer = new Sectionizer();
            var parsedPage = sectionizer.ParseContent(title, contentRoot);

            ConvertSections(parsedPage.Sections);
            return parsedPage;
        }
        finally
        {
            // Stop even when conversion throws, so the timer is never left running.
            convertTimer.Stop();
        }
    }

    private void ConvertSections(List<Section> sections)
        => sections.ForEach(x => ConvertSection(x));

    // Parses all pending nodes of a section, empties the node list, hands the
    // resulting items to the section, then recurses into its sub-sections.
    private void ConvertSection(Section section)
    {
        HtmlParser htmlParser = new HtmlParser();

        // Parse in order and clear once: removing from the front of a list
        // one element at a time shifts the remainder on every iteration.
        foreach (var node in section.Nodes)
        {
            htmlParser.Parse(node);
        }
        section.Nodes.Clear();

        section.AddItems(htmlParser.GetItems());
        ConvertSections(section.SubSections);
    }
}
74 |
75 |
--------------------------------------------------------------------------------
/Gemipedia.Cgi/Program.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using Gemini.Cgi;
3 |
4 | namespace Gemipedia.Cgi;
5 |
/// <summary>
/// CGI entry point: configures route base paths, registers the request
/// handlers, and processes the current request.
/// </summary>
class Program
{
    static void Main(string[] args)
    {
        SetPaths();

        CgiRouter router = new CgiRouter(ParseWikiLanguage);
        router.OnRequest("/search", RouteHandler.Search);
        router.OnRequest("/view", RouteHandler.ViewArticle);
        router.OnRequest("/images", RouteHandler.ViewImages);
        router.OnRequest("/media", RouteHandler.ProxyMedia);
        router.OnRequest("/refs", RouteHandler.ViewRefs);
        router.OnRequest("/featured", RouteHandler.ViewFeatured);
        router.OnRequest("/geo", RouteHandler.ViewGeo);
        router.OnRequest("/latlon", RouteHandler.SearchLatLon);
        router.OnRequest("/lang", RouteHandler.SelectLanguage);
        router.OnRequest("/otherlang", RouteHandler.ViewOtherLanguages);
        router.OnRequest("/setlang", RouteHandler.SetLanguage);
        router.OnRequest("/random", RouteHandler.ViewRandomArticle);
        router.OnRequest("/", RouteHandler.Welcome);
        router.ProcessRequest();
    }

    // When the path info has exactly two non-empty segments and the second is
    // a valid language code, selects that Wikipedia version.
    static void ParseWikiLanguage(CgiWrapper cgi)
    {
        var parts = cgi.PathInfo.Split('/', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);
        if (parts.Length == 2 && LanguageUtils.IsValidCode(parts[1]))
        {
            // Invariant lower-casing: the code becomes part of identifiers/URLs
            // and must not depend on the server's current culture.
            UserOptions.WikipediaVersion = parts[1].ToLowerInvariant();
        }
    }

    // Points every route base URL at the CGI script.
    static void SetPaths()
    {
        RouteOptions.BaseArticleUrl = "/cgi-bin/wp.cgi/view";
        RouteOptions.BaseFeaturedContenteUrl = "/cgi-bin/wp.cgi/featured";
        RouteOptions.BaseGeoUrl = "/cgi-bin/wp.cgi/geo";
        RouteOptions.BaseImageGallerUrl = "/cgi-bin/wp.cgi/images";
        RouteOptions.BaseLanguageUrl = "/cgi-bin/wp.cgi/lang";
        RouteOptions.BaseLonLatUrl = "/cgi-bin/wp.cgi/latlon";
        RouteOptions.BaseMediaProxyUrl = "/cgi-bin/wp.cgi/media/media";
        RouteOptions.BaseOtherLanguagesUrl = "/cgi-bin/wp.cgi/otherlang";
        RouteOptions.BaseRandomArticleUrl = "/cgi-bin/wp.cgi/random";
        RouteOptions.BaseReferencesUrl = "/cgi-bin/wp.cgi/refs";
        RouteOptions.BaseSearchUrl = "/cgi-bin/wp.cgi/search";
        RouteOptions.BaseSetLanguageUrl = "/cgi-bin/wp.cgi/setlang";
        RouteOptions.BaseWelcomeUrl = "/cgi-bin/wp.cgi/welcome";
    }
}
--------------------------------------------------------------------------------
/Gemipedia/Converter/Preparer.cs:
--------------------------------------------------------------------------------
1 | using System.Linq;
2 | using AngleSharp;
3 | using AngleSharp.Dom;
4 | using AngleSharp.Html.Parser;
5 |
6 | namespace Gemipedia.Converter;
7 |
/// <summary>
/// Reads in the raw HTML, converts it to a DOM, and strips out
/// tags that we don't want before proper parsing.
/// </summary>
public static class Preparer
{
    // Selectors for elements that are removed before conversion, applied in
    // this order. We remove these tags completely here, instead of skipping
    // them later with the filter, since the InfoBox parser won't visit
    // every element.
    private static readonly string[] UnwantedSelectors =
    {
        //the table of contents
        "#toc",
        //<sup> tags that are used to link to references
        "sup.reference",
        //span holders for flag icons
        "span.flagicon",
        //all <link> tags
        "link",
        //all <style> tags
        "style",
        //geo meta data
        "span.geo-nondefault",
        "span.geo-multi-punct",
        //"citation needed" and other maintenance tags
        ".noprint",
        ".mbox",
        ".mbox-small",
        //the "V T E" meta navbars on certain items
        ".navbar",
        //interactive elements
        "div.switcher-container",
    };

    /// <summary>
    /// Parses the HTML, scopes it to the article content element and removes
    /// the known unwanted elements from it.
    /// </summary>
    /// <param name="wikiHtml">HTML of the article.</param>
    /// <returns>The cleaned-up "div.mw-parser-output" element.</returns>
    public static IElement PrepareHtml(string wikiHtml)
    {
        //step 1: scope Html just to article content
        var contentRoot = GetContentRoot(wikiHtml);

        //step 2: remove known bad/unneeded tags
        RemoveTags(contentRoot);

        return contentRoot;
    }

    /// <summary>
    /// Parses the HTML into a document and returns the first element
    /// matching "div.mw-parser-output".
    /// </summary>
    private static IElement GetContentRoot(string wikiHtml)
    {
        var context = BrowsingContext.New(Configuration.Default);
        var parser = context.GetService<IHtmlParser>();
        var document = parser.ParseDocument(wikiHtml);
        // NOTE(review): QuerySelector presumably yields null when the page has
        // no such div, in which case RemoveTags would fail - confirm callers
        // only pass article HTML.
        return document.QuerySelector("div.mw-parser-output");
    }

    /// <summary>
    /// Removes tags we know we won't need, and which make rendering harder.
    /// </summary>
    private static void RemoveTags(IElement contentRoot)
    {
        foreach (var selector in UnwantedSelectors)
        {
            RemoveMatchingTags(contentRoot, selector);
        }
    }

    /// <summary>
    /// Removes every descendant of <paramref name="element"/> that matches
    /// <paramref name="selector"/>.
    /// </summary>
    private static void RemoveMatchingTags(IElement element, string selector)
    {
        // Snapshot the matches first so removal never runs against the
        // collection being enumerated.
        foreach (var match in element.QuerySelectorAll(selector).ToList())
        {
            match.Remove();
        }
    }
}
--------------------------------------------------------------------------------
/Gemipedia/Converter/Special/Tables/Table.cs:
--------------------------------------------------------------------------------
1 | using System.Collections.Generic;
2 | using System.Linq;
3 | using System.Text;
4 |
5 | namespace Gemipedia.Converter.Special.Tables;
6 |
7 |
/// <summary>
/// A table: an optional caption plus its rows.
/// </summary>
public class Table
{
    // Caption text; empty when the table has no caption.
    public string Caption = "";

    // Rows of the table, in order. The element type was lost in the
    // original text; Row is what MaxColumns reads from each entry.
    public List<Row> Rows = new List<Row>();

    // True when a non-empty caption has been set.
    public bool HasCaption
        => Caption.Length > 0;

    // True when the table has no rows at all.
    public bool IsEmpty
        => Rows.Count == 0;

    /// <summary>
    /// How many column units wide is this table. Many tables have rows with
    /// a mismatched number of columns, or too many colspans. Assume that
    /// the number of column units in the first row is accurate, and trim
    /// other rows accordingly.
    /// </summary>
    public int MaxColumns
        => IsEmpty ? 0 : Rows[0].Cells.Sum(x => x.ColSpan);
}
30 |
/// <summary>
/// A single table row: an ordered list of cells.
/// </summary>
public class Row
{
    // Cells of this row, in order. The element type was lost in the
    // original text; Cell is what LineHeight reads from each entry.
    public List<Cell> Cells = new List<Cell>();

    // True when the row has no cells.
    public bool IsEmpty => Cells.Count == 0;

    // Height of the row in lines: the tallest cell, or 0 for an empty row
    // (Max would throw on an empty sequence).
    public int LineHeight => IsEmpty
        ? 0 : Cells.Max(x => x.LineHeight);
}
40 |
/// <summary>
/// A single table cell: its text, its spans, and its formatted lines.
/// </summary>
public class Cell
{
    public bool IsHeader = false;

    private string contents = "";

    /// <summary>
    /// Text of the cell. Zero-width characters are stripped on assignment;
    /// assigning null stores an empty string.
    /// </summary>
    public string Contents
    {
        get => contents;
        set
        {
            contents = StripZeroWidth(value);
        }
    }

    public int ColSpan = 1;

    public int RowSpan = 1;

    //is this a dummy cell, only present to hold open a row spanning cell from a row above?
    public bool IsRowSpanHolder = false;

    // Formatted lines for this cell; null until something assigns them.
    // NOTE(review): element type restored as string (the type argument was
    // lost in the original text) - confirm against UnicodeString.GetWidth.
    public List<string> FormattedLines;

    // Number of formatted lines, or 0 when none have been assigned.
    public int LineHeight
        => FormattedLines?.Count ?? 0;

    // Width of the first formatted line, or 0 when there are no lines.
    public int FormattedWidth
        => (FormattedLines?.Count > 0) ? UnicodeString.GetWidth(FormattedLines[0]) : 0;

    /// <summary>
    /// Removes zero-width characters (U+200B and U+FEFF) from the string.
    /// These mess with our column layout since .Length will return a number
    /// longer than the number of characters that are rendered.
    /// </summary>
    /// <param name="s">Text to clean; null is treated as empty.</param>
    /// <returns>The text without those characters; never null.</returns>
    private static string StripZeroWidth(string s)
    {
        if (string.IsNullOrEmpty(s))
        {
            return "";
        }

        //Replace("\u200b", "") does not appear to work for these unicode characters
        //do it char by char
        var sb = new StringBuilder(s.Length);
        foreach (char c in s)
        {
            if (c != '\u200b' && c != '\ufeff')
            {
                sb.Append(c);
            }
        }
        return sb.ToString();
    }
}
--------------------------------------------------------------------------------
/Gemipedia/Models/ParsedPage.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Linq;
4 |
5 | namespace Gemipedia.Models;
6 |
/// <summary>
/// A converted article: its title plus its tree of sections.
/// </summary>
public class ParsedPage
{
    public String Title { get; set; }

    /// <summary>
    /// Returns an underscore-escaped version of the title (spaces become
    /// underscores), used by various APIs.
    /// </summary>
    public string EscapedTitle
        => Title.Replace(" ", "_");

    // Top-level sections; each may carry nested sub-sections.
    public List<Section> Sections { get; set; } = new List<Section>();

    /// <summary>
    /// Collects every media item of the page: those in the general content
    /// and in the infoboxes of each section, recursing into sub-sections.
    /// </summary>
    public List<MediaItem> GetAllImages()
    {
        var ret = new List<MediaItem>();
        foreach (var section in Sections)
        {
            CollectorHelper(section, ret);
        }
        return ret;
    }

    /// <summary>
    /// Total number of links over all sections and their sub-sections.
    /// </summary>
    public int GetReferenceCount()
    {
        int count = 0;
        foreach (var section in Sections)
        {
            count += GetSectionCount(section);
        }
        return count;
    }

    // Link count of one section plus, recursively, all of its sub-sections.
    private int GetSectionCount(Section section)
    {
        int count = section.Links.Count;
        foreach (var sub in section.SubSections)
        {
            count += GetSectionCount(sub);
        }
        return count;
    }

    /// <summary>
    /// Finds a section by its 1-based position in a depth-first, parent-first
    /// walk of the section tree.
    /// </summary>
    /// <param name="sectionNum">Position of the wanted section, starting at 1.</param>
    /// <returns>The section, or null when no section has that position.</returns>
    public Section GetSection(int sectionNum)
    {
        // The running position lives in a local so each lookup is
        // self-contained and leaves no state behind on the page.
        int visited = 0;
        foreach (var sub in Sections)
        {
            var section = GetSectionHelper(sub, sectionNum, ref visited);
            if (section != null)
            {
                return section;
            }
        }
        return null;
    }

    // Visits curr, then its sub-sections, advancing the shared counter once
    // per visited section; returns the section whose position matches.
    private Section GetSectionHelper(Section curr, int lookingFor, ref int visited)
    {
        visited++;
        if (visited == lookingFor)
        {
            return curr;
        }
        if (curr.HasSubSections)
        {
            foreach (var sub in curr.SubSections)
            {
                var section = GetSectionHelper(sub, lookingFor, ref visited);
                if (section != null)
                {
                    return section;
                }
            }
        }
        return null;
    }

    // Appends the media of one section (general content first, then
    // infoboxes) and then recurses into its sub-sections.
    private void CollectorHelper(Section section, List<MediaItem> images)
    {
        images.AddRange(section.GeneralContent.OfType<MediaItem>());
        foreach (var infobox in section.Infoboxes)
        {
            images.AddRange(infobox.MediaItems);
        }
        foreach (var subSection in section.SubSections)
        {
            CollectorHelper(subSection, images);
        }
    }
}
--------------------------------------------------------------------------------
/Gemipedia.Cgi/CountingTextWriter.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.IO;
3 | using System.Text;
4 |
5 | namespace Gemipedia.Cgi;
6 |
/// <summary>
/// A <see cref="TextWriter"/> decorator that forwards everything to an inner
/// writer while counting the characters written and the bytes those
/// characters occupy in the inner writer's encoding.
/// </summary>
public class CountingTextWriter : TextWriter
{
    private readonly TextWriter _innerWriter;
    private int _characterCount;
    private long _byteCount;

    /// <summary>Wraps <paramref name="innerWriter"/>.</summary>
    /// <exception cref="ArgumentNullException">innerWriter is null.</exception>
    public CountingTextWriter(TextWriter innerWriter)
    {
        _innerWriter = innerWriter ?? throw new ArgumentNullException(nameof(innerWriter));
    }

    /// <summary>Encoding of the inner writer; used for byte counting.</summary>
    public override Encoding Encoding => _innerWriter.Encoding;

    /// <summary>Number of UTF-16 chars written so far, line terminators included.</summary>
    public int CharacterCount => _characterCount;

    /// <summary>Number of bytes the written chars occupy in <see cref="Encoding"/>.</summary>
    public long ByteCount => _byteCount;

    public override void Write(char value)
    {
        _innerWriter.Write(value);
        _characterCount++;
        _byteCount += Encoding.GetByteCount(new[] { value });
    }

    public override void Write(char[] buffer, int index, int count)
    {
        _innerWriter.Write(buffer, index, count);
        _characterCount += count;
        _byteCount += Encoding.GetByteCount(buffer, index, count);
    }

    /// <summary>Writes the string; a null string writes and counts nothing.</summary>
    public override void Write(string value)
    {
        if (value == null)
        {
            return;
        }

        _innerWriter.Write(value);
        _characterCount += value.Length;
        _byteCount += Encoding.GetByteCount(value);
    }

    public override void WriteLine()
    {
        _innerWriter.WriteLine();
        CountLineTerminator();
    }

    public override void WriteLine(string value)
    {
        if (value == null)
        {
            WriteLine();
            return;
        }

        _innerWriter.WriteLine(value);
        _characterCount += value.Length;
        _byteCount += Encoding.GetByteCount(value);
        CountLineTerminator();
    }

    public override void WriteLine(char[] buffer, int index, int count)
    {
        _innerWriter.WriteLine(buffer, index, count);
        _characterCount += count;
        _byteCount += Encoding.GetByteCount(buffer, index, count);
        CountLineTerminator();
    }

    public override void WriteLine(char value)
    {
        Write(value);
        WriteLine();
    }

    /// <summary>
    /// Forwards the flush to the inner writer; without this override a flush
    /// on the wrapper would never reach it.
    /// </summary>
    public override void Flush() => _innerWriter.Flush();

    protected override void Dispose(bool disposing)
    {
        if (disposing)
        {
            _innerWriter.Dispose();
        }
        base.Dispose(disposing);
    }

    // The inner writer emits ITS OWN NewLine on WriteLine, which is not
    // necessarily Environment.NewLine, so count that terminator to keep the
    // totals equal to what was really written.
    private void CountLineTerminator()
    {
        string terminator = _innerWriter.NewLine;
        _characterCount += terminator.Length;
        _byteCount += Encoding.GetByteCount(terminator);
    }
}
93 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Gemipedia
2 | Gemipedia is a [Gemini](https://en.wikipedia.org/wiki/Gemini_(protocol)) frontend to Wikipedia, focused on providing a delightful reading experience. Gemipedia supports accessing all language-specific Wikipedias.
3 |
4 | 
5 | *Rendering main article with navigation suggestions*
6 |
7 | 
8 | *Handling sections, references, and tables*
9 |
10 | 
11 | *Accessing Polish Wikipedia*
12 |
13 | ## Demo
14 | Visit `gemini://gemi.dev/cgi-bin/wp.cgi/` with a [Gemini client](https://github.com/kr1sp1n/awesome-gemini) or [via an HTTP-to-Gemini proxy](https://portal.mozz.us/gemini/gemi.dev/cgi-bin/wp.cgi/)
15 |
16 | ## Features
17 |
18 | ### Content Discovery
19 | * Access all language-specific versions of Wikipedia
20 | * Search results with article descriptions, excerpts, and feature image links to find correct content more quickly
21 | * Supports fuzzy matching for finding articles via "Go to article"
22 | * Featured Content, updated daily, which displays the article of the day and the 25 most popular stories on Wikipedia from the previous day (same as the front page of Wikipedia)
23 | * "Other Articles" feature. Finds other articles that reference the current article
24 | * "Articles near this article" feature. Finds other articles that are geographically close to the current article
25 | * Supports disambiguation and "see other" navigation links
26 |
27 | ### Usability
28 |
29 | * Groups all the links to additional articles by section, and separate "References" pages for each section
30 | * Gallery View, which pulls all media like images and video out into a separate view
31 | * Images/Media is displayed with intelligently created captions
32 | * Geographic view! Extracts geographic coordinates and provides links to OpenStreetMap and native Map apps
33 | * Removes superfluous content for Gemini like references, citations, bibliographies, and links to external websites
34 | * Links to source article on Wikipedia over HTTPS
35 | * Caches calls to Wikipedia to speed up viewing sub sections or page refreshing
36 |
37 | ### Rendering
38 | * Supports tables, including cells that span multiple rows or columns, by converting them to ASCII art tables inside of preformatted sections
39 | * Selects high resolution images while still using appropriate size for Gemini/smolweb ethos
40 | * Supports math formulas by displaying them as link lines to PNG images
41 | * Supports chemical and physics formulas by converting subscript and superscript tags into Unicode Subscript/Superscript characters!
42 | * Supports image maps
43 | * Supports Infobox rendering
44 | * Supports timeline images and tables
45 | * Supports image galleries
46 | * Add White background to transparent images for better reading on clients with dark mode
47 |
48 | ### Offline Support
49 | * Images are proxied from Wikipedia and rendered with appropriate file extension and MIME type for better offline rendering
50 | * PDF export for offline reading
51 |
--------------------------------------------------------------------------------
/Gemipedia/Models/ArticleLinkCollection.cs:
--------------------------------------------------------------------------------
1 | using System.Collections.Generic;
2 | using System.Linq;
3 | using AngleSharp.Dom;
4 |
5 | namespace Gemipedia.Models;
6 |
/// <summary>
/// A set of links to other articles, keyed case-insensitively by title,
/// that tracks how often each title was added.
/// </summary>
public class ArticleLinkCollection
{
    // Lower-cased title -> link. The type arguments were lost in the
    // original text; they follow from how the dictionary is used below.
    Dictionary<string, ArticleLink> articles;

    public ArticleLinkCollection()
    {
        articles = new Dictionary<string, ArticleLink>();
    }

    public void Clear()
        => articles.Clear();

    public bool HasLinks
        => (Count > 0);

    public int Count
        => articles.Count;

    /// <summary>
    /// Merges another collection into this one. Titles not yet present are
    /// taken over as-is; for titles already present the existing entry's
    /// occurrence counter is increased by one.
    /// </summary>
    public void Add(ArticleLinkCollection collection)
    {
        foreach (var entry in collection.articles)
        {
            if (articles.TryGetValue(entry.Key, out var existing))
            {
                existing.Occurences++;
            }
            else
            {
                articles[entry.Key] = entry.Value;
            }
        }
    }

    /// <summary>Merges the links of an item that exposes a link collection.</summary>
    public void Add(IArticleLinks itemWithLinks)
        => Add(itemWithLinks.Links);

    /// <summary>
    /// Adds a link by article title; null or empty titles are ignored.
    /// Adding a title that is already present (ignoring case) only bumps
    /// its occurrence counter.
    /// </summary>
    public void Add(string title)
    {
        if (string.IsNullOrEmpty(title))
        {
            return;
        }

        // The key is an identifier, so lower-case it independently of the
        // current culture.
        var key = title.ToLowerInvariant();

        if (articles.TryGetValue(key, out var existing))
        {
            existing.Occurences++;
        }
        else
        {
            articles[key] = new ArticleLink(title);
        }
    }

    /// <summary>
    /// Adds the article an anchor element links to, if it passes
    /// <see cref="ShouldUseLink"/>; any "#fragment" in its title is dropped.
    /// </summary>
    public void Add(IElement element)
    {
        if (ShouldUseLink(element))
        {
            Add(RemoveFragment(element.GetAttribute("title")));
        }
    }

    // Cuts the title at the first '#'. A title that starts with '#' is
    // returned unchanged, so the result is never emptied by the cut.
    private string RemoveFragment(string title)
    {
        var index = title.IndexOf('#');
        return index > 0 ? title.Substring(0, index) : title;
    }

    /// <summary>Titles of all links, ordered by their lower-cased key.</summary>
    public List<string> GetLinks()
        => articles.OrderBy(x => x.Key).Select(x => x.Value.Title).ToList();

    /// <summary>
    /// Decides whether an element is a link to a regular wiki article that
    /// should be collected.
    /// </summary>
    public static bool ShouldUseLink(IElement element)
    {
        //wiki articles have a title attribute
        if (!element.HasAttribute("title"))
        {
            return false;
        }
        //links to pages that don't exist have a "new" class;
        //links with an "internal" class are skipped as well
        if (element.ClassList.Contains("new") || element.ClassList.Contains("internal"))
        {
            return false;
        }
        //hyperlinks should be relative, and start with "/wiki/"
        //(ordinal comparison: these are URL/identifier prefixes, not text)
        if (!(element.GetAttribute("href") ?? "").StartsWith("/wiki/", System.StringComparison.Ordinal))
        {
            return false;
        }
        //should not be a link to a special or help page
        var title = element.GetAttribute("title");
        if (title.StartsWith("Special:", System.StringComparison.Ordinal))
        {
            return false;
        }
        if (title.StartsWith("Help:", System.StringComparison.Ordinal))
        {
            return false;
        }

        return true;
    }
}
--------------------------------------------------------------------------------
/Gemipedia/Converter/Special/SubscriptConverter.cs:
--------------------------------------------------------------------------------
1 | using System.Text;
2 |
3 | namespace Gemipedia.Converter.Special;
4 |
/// <summary>
/// Converts text to Unicode subscript characters, remembering whether every
/// character of the last conversion had a subscript form.
/// </summary>
public class SubscriptConverter
{
    // Input of the most recent Convert call.
    public string Original { get; private set; }

    // Output of the most recent Convert call.
    public string Converted { get; private set; }

    // Scratch buffer reused between conversions.
    StringBuilder _scratch = new StringBuilder();

    // False once a character without a subscript form has been seen;
    // reset to true at the start of every Convert call.
    public bool IsFullyConverted { get; private set; } = true;

    /// <summary>
    /// Converts every character of <paramref name="s"/>; characters without
    /// a subscript form are copied through unchanged.
    /// </summary>
    /// <returns>True when every character was converted.</returns>
    public bool Convert(string s)
    {
        Original = s;
        Converted = "";
        _scratch.Clear();
        IsFullyConverted = true;

        for (int i = 0; i < s.Length; i++)
        {
            _scratch.Append(ConvertChar(s[i]));
        }

        Converted = _scratch.ToString();
        return IsFullyConverted;
    }

    /// <summary>
    /// Maps one character to its subscript form. A character without one is
    /// returned unchanged and clears <see cref="IsFullyConverted"/>.
    /// </summary>
    public char ConvertChar(char c)
    {
        // Sentinel for "no subscript form"; no mapping below produces it.
        const char NoMapping = '\0';

        char subscript = c switch
        {
            //digits
            '0' => '\u2080',
            '1' => '\u2081',
            '2' => '\u2082',
            '3' => '\u2083',
            '4' => '\u2084',
            '5' => '\u2085',
            '6' => '\u2086',
            '7' => '\u2087',
            '8' => '\u2088',
            '9' => '\u2089',

            //ASCII plus, small plus sign, full width plus sign
            '+' or '\uFE62' or '\uFF0B' => '\u208A',
            //ASCII minus, small hyphen-minus, full width hyphen-minus, minus sign
            '-' or '\uFE63' or '\uFF0D' or '\u2212' => '\u208B',
            //ASCII equals, small equals sign, full width equals sign
            '=' or '\uFE66' or '\uFF1D' => '\u208C',

            '(' => '\u208D',
            ')' => '\u208E',

            //some letters; both cases share the one subscript form
            'a' or 'A' => '\u2090',
            'e' or 'E' => '\u2091',
            'h' or 'H' => '\u2095',
            'i' or 'I' => '\u1D62',
            'j' or 'J' => '\u2C7C',
            'k' or 'K' => '\u2096',
            'l' or 'L' => '\u2097',
            'm' or 'M' => '\u2098',
            'n' or 'N' => '\u2099',
            'o' or 'O' => '\u2092',
            'p' or 'P' => '\u209A',
            'r' or 'R' => '\u1D63',
            's' or 'S' => '\u209B',
            't' or 'T' => '\u209C',
            'u' or 'U' => '\u1D64',
            'v' or 'V' => '\u1D65',
            'x' or 'X' => '\u2093',

            //greek
            'β' => '\u1D66',
            'γ' => '\u1D67',
            'ρ' => '\u1D68',
            'φ' => '\u1D69',
            'χ' => '\u1D6A',

            _ => NoMapping,
        };

        if (subscript == NoMapping)
        {
            IsFullyConverted = false;
            return c;
        }
        return subscript;
    }
}
155 |
156 |
--------------------------------------------------------------------------------
/Gemipedia.sln:
--------------------------------------------------------------------------------
1 |
2 | Microsoft Visual Studio Solution File, Format Version 12.00
3 | # Visual Studio Version 16
4 | VisualStudioVersion = 16.0.810.19
5 | MinimumVisualStudioVersion = 10.0.40219.1
6 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Gemipedia", "Gemipedia\Gemipedia.csproj", "{39080D45-3B8E-421D-8558-24AD7F0E448F}"
7 | EndProject
8 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Gemini.Cgi", "..\Gemini.Cgi\Gemini.Cgi.csproj", "{3BA62C3F-DA9C-48EA-AC3A-DA9C17DB593C}"
9 | EndProject
10 | Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{519ED394-B5EA-433F-B298-27949E27193F}"
11 | ProjectSection(SolutionItems) = preProject
12 | TODOs.txt = TODOs.txt
13 | Changelog.txt = Changelog.txt
14 | Test Cases.txt = Test Cases.txt
15 | EndProjectSection
16 | EndProject
17 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Gemipedia.Cgi", "Gemipedia.Cgi\Gemipedia.Cgi.csproj", "{678D5030-1D4C-4FF7-ACE3-D5E67DA5F9BA}"
18 | EndProject
19 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Gemipedia.Console", "Gemipedia.Console\Gemipedia.Console.csproj", "{FF353EC3-647B-40A2-9055-2E96AE127AB2}"
20 | EndProject
21 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "CacheComms", "..\CacheComms\CacheComms.csproj", "{88AB92F5-A6BD-49DD-BCD3-6766FA2A2EC2}"
22 | EndProject
23 | Global
24 | GlobalSection(SolutionConfigurationPlatforms) = preSolution
25 | Debug|Any CPU = Debug|Any CPU
26 | Release|Any CPU = Release|Any CPU
27 | EndGlobalSection
28 | GlobalSection(ProjectConfigurationPlatforms) = postSolution
29 | {39080D45-3B8E-421D-8558-24AD7F0E448F}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
30 | {39080D45-3B8E-421D-8558-24AD7F0E448F}.Debug|Any CPU.Build.0 = Debug|Any CPU
31 | {39080D45-3B8E-421D-8558-24AD7F0E448F}.Release|Any CPU.ActiveCfg = Release|Any CPU
32 | {39080D45-3B8E-421D-8558-24AD7F0E448F}.Release|Any CPU.Build.0 = Release|Any CPU
33 | {3BA62C3F-DA9C-48EA-AC3A-DA9C17DB593C}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
34 | {3BA62C3F-DA9C-48EA-AC3A-DA9C17DB593C}.Debug|Any CPU.Build.0 = Debug|Any CPU
35 | {3BA62C3F-DA9C-48EA-AC3A-DA9C17DB593C}.Release|Any CPU.ActiveCfg = Release|Any CPU
36 | {3BA62C3F-DA9C-48EA-AC3A-DA9C17DB593C}.Release|Any CPU.Build.0 = Release|Any CPU
37 | {678D5030-1D4C-4FF7-ACE3-D5E67DA5F9BA}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
38 | {678D5030-1D4C-4FF7-ACE3-D5E67DA5F9BA}.Debug|Any CPU.Build.0 = Debug|Any CPU
39 | {678D5030-1D4C-4FF7-ACE3-D5E67DA5F9BA}.Release|Any CPU.ActiveCfg = Release|Any CPU
40 | {678D5030-1D4C-4FF7-ACE3-D5E67DA5F9BA}.Release|Any CPU.Build.0 = Release|Any CPU
41 | {FF353EC3-647B-40A2-9055-2E96AE127AB2}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
42 | {FF353EC3-647B-40A2-9055-2E96AE127AB2}.Debug|Any CPU.Build.0 = Debug|Any CPU
43 | {FF353EC3-647B-40A2-9055-2E96AE127AB2}.Release|Any CPU.ActiveCfg = Release|Any CPU
44 | {FF353EC3-647B-40A2-9055-2E96AE127AB2}.Release|Any CPU.Build.0 = Release|Any CPU
45 | {88AB92F5-A6BD-49DD-BCD3-6766FA2A2EC2}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
46 | {88AB92F5-A6BD-49DD-BCD3-6766FA2A2EC2}.Debug|Any CPU.Build.0 = Debug|Any CPU
47 | {88AB92F5-A6BD-49DD-BCD3-6766FA2A2EC2}.Release|Any CPU.ActiveCfg = Release|Any CPU
48 | {88AB92F5-A6BD-49DD-BCD3-6766FA2A2EC2}.Release|Any CPU.Build.0 = Release|Any CPU
49 | EndGlobalSection
50 | GlobalSection(SolutionProperties) = preSolution
51 | HideSolutionNode = FALSE
52 | EndGlobalSection
53 | GlobalSection(ExtensibilityGlobals) = postSolution
54 | SolutionGuid = {41FF9B16-4FB0-4875-A3D2-7D214963C613}
55 | EndGlobalSection
56 | EndGlobal
57 |
--------------------------------------------------------------------------------
/Gemipedia.Console/Program.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.IO;
3 | using System.Threading;
4 | using Gemipedia.API;
5 | using Gemipedia.API.Models;
6 | using Gemipedia.Converter;
7 | using Gemipedia.Converter.Special;
8 | using Gemipedia.Models;
9 | using Gemipedia.Renderer;
10 |
11 | namespace Gemipedia.Console;
12 |
/// <summary>
/// Console front end: asks for an article title, fetches the article,
/// converts it and renders it to standard output. Also hosts a
/// multi-threaded stress test that is normally left disabled.
/// </summary>
class Program
{
    // Upper bound on redirect hops followed for one article, so a redirect
    // cycle cannot spin forever.
    const int MaxRedirects = 10;

    static ThreadSafeCounter counter = new ThreadSafeCounter();

    /// <summary>
    /// Prompt loop. Ends on an empty line or when standard input is closed.
    /// </summary>
    static void Main(string[] args)
    {
        //StressTest();

        while (true)
        {
            System.Console.WriteLine("Article?");
            string name = System.Console.ReadLine();

            // ReadLine returns null at end of input (closed pipe, Ctrl-D);
            // treat that like an empty line instead of prompting forever.
            if (string.IsNullOrEmpty(name))
            {
                return;
            }

            var article = GetArticle(name);
            if (article == null)
            {
                System.Console.WriteLine("error fetching article");
                continue;
            }

            var newConverter = new WikiHtmlConverter();
            ParsedPage page = newConverter.Convert(article.Title, article.HtmlText);

            var renderer = new ArticleRenderer();
            renderer.RenderArticle(page, System.Console.Out);
        }
    }

    /// <summary>
    /// Starts 20 worker threads running <see cref="DoStressWork"/> and then
    /// keeps the calling thread asleep forever.
    /// </summary>
    static void StressTest()
    {
        int workers = 20;
        for (int i = 0; i < workers; i++)
        {
            var thread = new Thread(new ThreadStart(DoStressWork));
            thread.Start();
        }

        while (true)
        {
            Thread.Sleep(30000);
        }
    }

    /// <summary>
    /// Worker loop: fetches a random article, converts and renders it into a
    /// throw-away buffer, and appends any exception to an error file.
    /// </summary>
    static void DoStressWork()
    {
        var converter = new WikiHtmlConverter();

        var client = new WikipediaApiClient(UserOptions.WikipediaVersion);

        while (true)
        {
            var count = counter.Increment();
            var title = client.GetRandomArticleTitle();

            try
            {
                var article = GetArticle(title);
                System.Console.WriteLine($"{count}\t{title}");
                ParsedPage page = converter.Convert(article.Title, article.HtmlText);

                StringWriter fout = new StringWriter();
                var renderer = new ArticleRenderer();
                renderer.RenderArticle(page, fout);
            }
            catch (Exception ex)
            {
                // NOTE(review): developer-machine path; the directory must
                // exist or this call itself throws on the worker thread.
                System.IO.File.AppendAllText("/Users/billy/tmp/ERRORS.txt", $"\"{title}\"\t{ex.Message} - {ex.Source}\n==={ex.StackTrace}===");
            }
            System.Threading.Thread.Sleep(100);
        }
    }

    /// <summary>
    /// Fetches an article by title, following redirect pages to their target.
    /// </summary>
    /// <param name="title">Title of the article to fetch.</param>
    /// <returns>
    /// The first non-redirect article reached, or null when a fetch returns
    /// null or more than <see cref="MaxRedirects"/> redirects were followed.
    /// </returns>
    static Article GetArticle(string title)
    {
        var client = new WikipediaApiClient(UserOptions.WikipediaVersion);

        for (int hop = 0; hop <= MaxRedirects; hop++)
        {
            Article article = client.GetArticle(title);
            if (article == null || !RedirectParser.IsArticleRedirect(article.HtmlText))
            {
                return article;
            }

            // A redirect page: continue with the title it points to.
            title = RedirectParser.GetRedirectTitle(article.HtmlText);
        }

        // Too many hops: most likely a redirect cycle.
        return null;
    }
}
--------------------------------------------------------------------------------
/Gemipedia/Renderer/ReferencesRenderer.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.IO;
3 | using System.Linq;
4 | using System.Net;
5 | using Gemipedia.Models;
6 |
7 | namespace Gemipedia.Renderer;
8 |
/// <summary>
/// Renders the "references" pages: the links to other articles found in a
/// page, either for one section or for the whole page grouped by section.
/// </summary>
public class ReferencesRenderer
{
    TextWriter Writer;
    ParsedPage Page;

    /// <summary>
    /// Writes the references page.
    /// </summary>
    /// <param name="parsedPage">The converted article.</param>
    /// <param name="writer">Destination of the output.</param>
    /// <param name="section">
    /// Section number as understood by ParsedPage.GetSection; a value of 0
    /// or less renders the references of all sections.
    /// </param>
    public void RenderReferences(ParsedPage parsedPage, TextWriter writer, int section)
    {
        Writer = writer;
        Page = parsedPage;

        if (section > 0)
        {
            RenderSectionReferences(section);
        }
        else
        {
            RenderAllReferences();
        }
    }

    // References of a single section. When the section number does not
    // exist only the trailing source link is written.
    private void RenderSectionReferences(int sectionNum)
    {
        var section = Page.GetSection(sectionNum);
        if (section != null)
        {
            var title = SectionName(section);

            Writer.WriteLine($"# References for {Page.Title}: {title}");
            Writer.WriteLine($"=> {RouteOptions.ArticleUrl(Page.Title)} Back to article");
            Writer.WriteLine($"=> {RouteOptions.ReferencesUrl(Page.Title)} See all references for article");
            Writer.WriteLine();
            Writer.WriteLine($"References to other articles in the '{title}' section");
            RenderLinks(section);
        }
        RenderSourceLink();
    }

    private string SectionName(Section section)
        => section.IsSpecial ? "Summary Section" : section.Title;

    // References of the whole page, organized by section.
    private void RenderAllReferences()
    {
        Writer.WriteLine($"# References for {Page.Title}");
        Writer.WriteLine($"=> {RouteOptions.ArticleUrl(Page.Title)} Back to article");
        Writer.WriteLine();
        Writer.WriteLine("References to other articles, organized by section");
        foreach (var subSection in Page.Sections.Where(x => !ShouldExcludeSectionIndex(x)))
        {
            RenderIndexForSection(subSection);
        }
        RenderSourceLink();
    }

    // Heading plus links for one section, then the same for its
    // sub-sections, skipping excluded sections.
    private void RenderIndexForSection(Section section)
    {
        //only display the section title if this section has links
        if (HasLinks(section))
        {
            if (!section.IsSpecial)
            {
                //depth 2 gets a level 2 heading, all other sections are at a level 3
                var heading = (section.SectionDepth == 2) ? "##" : "###";
                Writer.WriteLine($"{heading} {section.Title}");
            }
            RenderLinks(section);
        }
        if (section.HasSubSections)
        {
            foreach (var subSection in section.SubSections.Where(x => !ShouldExcludeSectionIndex(x)))
            {
                RenderIndexForSection(subSection);
            }
        }
    }

    // One link line per article referenced by the section.
    private void RenderLinks(Section section)
    {
        foreach (var linkTitle in section.Links.GetLinks())
        {
            Writer.WriteLine($"=> {RouteOptions.ArticleUrl(linkTitle)} {linkTitle}");
        }
    }

    // Blank line followed by a link to the article on Wikipedia itself.
    private void RenderSourceLink()
    {
        // Link to the Wikipedia the user is reading, not always the English
        // one; fall back to "en" when no language is set.
        var language = string.IsNullOrEmpty(UserOptions.WikipediaVersion)
            ? "en"
            : UserOptions.WikipediaVersion;

        // Wikipedia paths use underscores for spaces. WebUtility.UrlEncode
        // would turn spaces into '+', which is a literal plus in a URL path
        // and so points at a different title.
        var pathTitle = Uri.EscapeDataString((Page.Title ?? "").Replace(' ', '_'));

        Writer.WriteLine();
        Writer.WriteLine($"=> https://{language}.wikipedia.org/wiki/{pathTitle} Source on Wikipedia");
    }

    //does this section have any links to render?
    private bool HasLinks(Section section)
        => section.Links.HasLinks;

    // Sections whose (lower-cased) title is listed in
    // UserOptions.ArticleLinkSections are left out of the index.
    private bool ShouldExcludeSectionIndex(Section section)
        => UserOptions.ArticleLinkSections.Contains(section.Title?.ToLowerInvariant());
}
--------------------------------------------------------------------------------
/Gemipedia/Converter/Filter/DomFilter.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using AngleSharp.Html.Dom;
4 |
5 | namespace Gemipedia.Converter.Filter;
6 |
/// <summary>
/// Decides whether DOM elements should be dropped, based on simple
/// "tag", "tag.class", "tag#id", ".class" and "#id" rules.
/// </summary>
public class DomFilter
{
    public static DomFilter Global = new DomFilter();

    // Rules that name a tag, grouped by (lower-cased) tag name. The type
    // arguments of these three fields were lost in the original text; they
    // follow from how the fields are used below.
    Dictionary<string, List<FilterRule>> TagFilters;

    // Rules that consist of a class name only.
    List<FilterRule> JustClassRules;

    // Rules that consist of an ID only.
    List<FilterRule> JustIDs;

    public DomFilter()
    {
        TagFilters = new Dictionary<string, List<FilterRule>>();
        JustClassRules = new List<FilterRule>();
        JustIDs = new List<FilterRule>();
    }

    /// <summary>
    /// Checks an element against all rules.
    /// </summary>
    /// <param name="element">Element to check.</param>
    /// <param name="normalizedTagName">
    /// Tag name of the element, used as the lookup key for tag rules
    /// (rules are stored under lower-cased tag names).
    /// </param>
    /// <returns>False when a rule matches the element, true otherwise.</returns>
    public bool IsElementAllowed(HtmlElement element, string normalizedTagName)
    {
        //check for tag-specific rules
        // NOTE(review): a rule that names only a tag (no class, no ID) is
        // stored by AddRule but never rejects anything here - confirm
        // whether that is intended.
        if (TagFilters.TryGetValue(normalizedTagName, out var tagRules))
        {
            foreach (var rule in tagRules)
            {
                if (rule.HasClass)
                {
                    if (element.ClassList.Contains(rule.ClassName))
                    {
                        return false;
                    }
                }
                else if (rule.HasID)
                {
                    if ((element.Id ?? "") == rule.ID)
                    {
                        return false;
                    }
                }
            }
        }

        //class-only rules
        if (element.ClassList.Length > 0)
        {
            foreach (var rule in JustClassRules)
            {
                if (element.ClassList.Contains(rule.ClassName))
                {
                    return false;
                }
            }
        }

        //ID-only rules
        if (!string.IsNullOrEmpty(element.Id))
        {
            foreach (var rule in JustIDs)
            {
                if (element.Id == rule.ID)
                {
                    return false;
                }
            }
        }
        return true;
    }

    /// <summary>
    /// Adds a rule from a selector of the form "tag", "tag.class", "tag#id",
    /// ".class" or "#id". A selector containing '.' is always read as
    /// tag-plus-class, even when it also contains '#'.
    /// </summary>
    public void AddRule(string selector)
    {
        string tag = "";
        string cls = "";
        string id = "";

        // Tag names are identifiers, so lower-case them independently of the
        // current culture.
        if (selector.Contains("."))
        {
            tag = ClipBefore(selector, ".").ToLowerInvariant();
            cls = ClipAfter(selector, ".");
        }
        else if (selector.Contains("#"))
        {
            tag = ClipBefore(selector, "#").ToLowerInvariant();
            id = ClipAfter(selector, "#");
        }
        else
        {
            tag = selector.ToLowerInvariant();
        }

        var rule = new FilterRule
        {
            TagName = tag,
            ClassName = cls,
            ID = id
        };

        if (rule.HasTag)
        {
            if (!TagFilters.TryGetValue(rule.TagName, out var tagRules))
            {
                tagRules = new List<FilterRule>();
                TagFilters[rule.TagName] = tagRules;
            }
            tagRules.Add(rule);
        }
        else if (rule.HasClass)
        {
            JustClassRules.Add(rule);
        }
        else if (rule.HasID)
        {
            JustIDs.Add(rule);
        }
    }

    // Text after the first occurrence of c; empty when c is missing or is
    // the last character.
    private string ClipAfter(string s, string c)
    {
        int x = s.IndexOf(c);
        if (x >= 0 && x + 1 != s.Length)
        {
            return s.Substring(x + 1);
        }
        return string.Empty;
    }

    // Text before the first occurrence of c; empty when c is missing or is
    // the first character.
    private string ClipBefore(string s, string c)
    {
        int x = s.IndexOf(c);
        return x > 0 ? s.Substring(0, x) : String.Empty;
    }
}
--------------------------------------------------------------------------------
/Changelog.txt:
--------------------------------------------------------------------------------
1 | Changelog:
2 | - Feature: Organize referenced links by section (+ don't show duplicates of links that appear in more than 1 section)
3 | - Feature: separate image gallery for article showing all the media
4 | - Feature: Don't show a section if it doesn't have any content
5 | - Feature: Render tables as ASCII tables
6 | - Feature: Support math elements (SVG images transcoded on the fly as PNG)
7 | - Feature: Use the 2x image for better resolution
8 | - Feature: Include video links in Image gallery
9 | - Feature: Show an image for video media using poster attribute
10 | - Fix: Better support for nested tables
11 | - Fix: Skip audio pronunciation links
12 | - Fix: Skip pronunciation helper links
13 | - Create separate page for reference links, include specific link for each section to show links for that section
14 | - Feature: show count for reference links per section, and for all references
15 | - Feature: support SUP tag
16 | - Fix: remove meta data text (e.g. "Citation needed" and "original research" text/links)
17 | - Fix: Only display 1 format for geo coordinates
18 | - Feature: PDF for article
19 | - Fix: Link to original article on Wikipedia (needed underscore escaping)
20 | - Fix: Table padding bug (Zero Width space characters!) - German submarine U-48 (1939)
21 | - Fix: Tables don't support Row spans
22 | - Fix: Collect reference links from table captions
23 | - Fix: Navigation suggestions not rendered if more than 1 hyperlink in a sentence
24 | - Fix: attempt to support tables with malformed colspan/rowspan values (iPod article)
25 | - Fix: properly render tables with incorrect/excessive colspan values (iPod article)
26 | - Feature: Support timelines (extract the image, properly extract article links from MAP tag)
27 | - Fix: exclude navigation sections from section's references
28 | - Fix: exclude fragment from article title when collecting references
29 | - Feature: Speed up views with Disk cache for Wikipedia content
30 | (especially important for sub-pages like gallery and references, since those would otherwise need to refetch the article JSON)
31 | - Feature: Sharper looking math formulas by referencing Wikipedia's PNGs directly
32 | - Feature: Add White background to transparent images for better reading on clients with dark mode
33 | - Feature: Serve media with proper extension, mimetype, for better downloading (easier to tell if something is an animated GIF, etc)
34 | - Fix: Newlines in captions (Project Gutenberg article)
35 | - Feature: support image maps on all images, not just timelines (Broadway theatre article)
36 | - Fix: complete parser rewrite that supports more content without special case handling, and fixes rendering errors
37 | - Feature: Includes search snippet in search results
38 | - Feature: Use the 1.5x image if 2x image is not available
39 | - Feature: Support for Image Galleries (images would show up using generic media finder, but wasn't getting the appropriate caption)
40 | - Feature: Add links to search for other pages about an article
41 | - Feature: Added Featured Content view from front page of Wikipedia. Contains Featured Article and most popular pages
42 | - Feature: Geographic view! Extracts coordinates and provides links to OpenStreetMaps and native Map apps
43 | - Feature: Find articles near another article
44 | - Feature: Support chemical and physics formulas by converting subscripts and superscript tags into Unicode Subscript/Superscript characters!
45 | - Feature: Support side-by-side comparisons in Infoboxes (basically any article about a conflict, e.g. World War II)
46 | - Fix: Better handling of nested tables in Infoboxes
47 | - Fix: Ignore links to Wikidata
48 | - Feature: Support multiple Wikipedia languages
49 | - Fix: Handle empty infoboxes
50 | - Fix: Handle empty rows/empty tables
51 | - Fix: handle malformed colspans
52 | - Fix: handle incorrect nested headers (H1 in output)
53 | - Fix: handle geo coordinates with missing levels of precision
54 | - Feature: Added support for Simple English Wikipedia
55 | - Fix: Added support for new media HTML structure. See: https://www.mediawiki.org/wiki/Parsoid/Parser_Unification/Media_structure/FAQ
56 | - Fix: Better detection of geohack URLs allowed for more links to be displayed
57 | - Feature: Better captions on galleries and montages
58 | - Fix: Crashing on wide unicode in tables
59 | - Feature: Added download and convert times to footer
60 | - Feature: Added original and converted size to footer
--------------------------------------------------------------------------------
/Gemipedia/Converter/Special/SuperscriptConverter.cs:
--------------------------------------------------------------------------------
1 | using System.Text;
2 |
3 | namespace Gemipedia.Converter.Special;
4 |
/// <summary>
/// Rewrites text using Unicode superscript characters, one character at a time.
/// Conversion stops at the first character that has no superscript form.
/// </summary>
public class SuperscriptConverter
{
    //marker returned by the lookup table for characters without a superscript form
    private const char NoMapping = '\0';

    StringBuilder buffer = new StringBuilder();

    /// <summary>The string passed to the most recent <see cref="Convert"/> call.</summary>
    public string Original { get; private set; }

    /// <summary>The superscript text, or an empty string if the last conversion failed.</summary>
    public string Converted { get; private set; }

    /// <summary>False once a character without a superscript form has been seen.</summary>
    public bool IsFullyConverted { get; private set; } = true;

    /// <summary>
    /// Converts every character of <paramref name="s"/> to superscript.
    /// </summary>
    /// <param name="s">Text to convert.</param>
    /// <returns>
    /// True when all characters were converted and <see cref="Converted"/> holds the result;
    /// false as soon as one character could not be converted (<see cref="Converted"/> stays empty).
    /// </returns>
    public bool Convert(string s)
    {
        Original = s;
        Converted = "";
        buffer.Clear();
        IsFullyConverted = true;

        for (int i = 0; i < s.Length; i++)
        {
            buffer.Append(ConvertChar(s[i]));
            if (!IsFullyConverted)
            {
                //bail out early, leaving Converted empty
                return false;
            }
        }

        Converted = buffer.ToString();
        return IsFullyConverted;
    }

    /// <summary>
    /// Converts a single character to its superscript form.
    /// </summary>
    /// <param name="c">Character to convert.</param>
    /// <returns>
    /// The superscript character, or <paramref name="c"/> unchanged when no mapping exists,
    /// in which case <see cref="IsFullyConverted"/> is set to false.
    /// </returns>
    public char ConvertChar(char c)
    {
        char mapped = c switch
        {
            //digits
            '0' => '\u2070',
            '1' => '\u00B9',
            '2' => '\u00B2',
            '3' => '\u00B3',
            '4' => '\u2074',
            '5' => '\u2075',
            '6' => '\u2076',
            '7' => '\u2077',
            '8' => '\u2078',
            '9' => '\u2079',

            //plus: ASCII, small plus sign, full width plus sign
            '+' or '\uFE62' or '\uFF0B' => '\u207A',

            //minus: ASCII, small hyphen-minus, full width hyphen-minus, minus sign
            '-' or '\uFE63' or '\uFF0D' or '\u2212' => '\u207B',

            //equals: ASCII, small equals sign, full width equals sign
            '=' or '\uFE66' or '\uFF1D' => '\u207C',

            '(' => '\u207D',
            ')' => '\u207E',

            //lowercase ('q' is omitted: there is no widely supported superscript q)
            'a' => '\u1D43',
            'b' => '\u1D47',
            'c' => '\u1D9C',
            'd' => '\u1D48',
            'e' => '\u1D49',
            'f' => '\u1DA0',
            'g' => '\u1D4D',
            'h' => '\u02B0',
            'i' => '\u2071',
            'j' => '\u02B2',
            'k' => '\u1D4F',
            'l' => '\u02E1',
            'm' => '\u1D50',
            'n' => '\u207F',
            'o' => '\u1D52',
            'p' => '\u1D56',
            'r' => '\u02B3',
            's' => '\u02E2',
            't' => '\u1D57',
            'u' => '\u1D58',
            'v' => '\u1D5B',
            'w' => '\u02B7',
            'x' => '\u02E3',
            'y' => '\u02B8',
            'z' => '\u1DBB',

            //uppercase (only the letters that have a mapping)
            'A' => '\u1D2C',
            'B' => '\u1D2E',
            'D' => '\u1D30',
            'E' => '\u1D31',
            'G' => '\u1D33',
            'H' => '\u1D34',
            'I' => '\u1D35',
            'J' => '\u1D36',
            'K' => '\u1D37',
            'L' => '\u1D38',
            'M' => '\u1D39',
            'N' => '\u1D3A',
            'O' => '\u1D3C',
            'P' => '\u1D3E',
            'R' => '\u1D3F',
            'T' => '\u1D40',
            'U' => '\u1D41',
            'V' => '\u2C7D',
            'W' => '\u1D42',

            //greek
            'α' => '\u1D45',
            'β' => '\u1D5D',
            'γ' => '\u1D5E',
            'δ' => '\u1D5F',
            '∊' => '\u1D4B',
            'θ' => '\u1DBF',
            'ι' => '\u1DA5',
            'Φ' => '\u1DB2',
            'φ' => '\u1D60',
            'χ' => '\u1D61',

            _ => NoMapping,
        };

        if (mapped == NoMapping)
        {
            IsFullyConverted = false;
            return c;
        }
        return mapped;
    }
}
210 |
211 |
--------------------------------------------------------------------------------
/Gemipedia/Converter/Special/TextExtractor.cs:
--------------------------------------------------------------------------------
1 | using System.Linq;
2 | using System.Text.RegularExpressions;
3 | using AngleSharp.Dom;
4 | using AngleSharp.Html.Dom;
5 | using Gemipedia.Models;
6 |
7 | namespace Gemipedia.Converter.Special;
8 |
/// <summary>
/// Extracts text
/// </summary>
public class TextExtractor : ITextContent
{
    //any run of whitespace; used to squeeze runs down to a single space
    private static readonly Regex whitespace = new Regex(@"\s+", RegexOptions.Compiled);

    private Buffer buffer = new Buffer();

    public bool ShouldCollapseNewlines { get; set; } = false;
    public bool ShouldConvertImages { get; set; } = false;

    //sets the character we use for newline replacement
    public string NewlineReplacement { get; set; } = " ";

    /// <summary>
    /// Text gathered by the last extraction, with newlines collapsed
    /// when <see cref="ShouldCollapseNewlines"/> is set.
    /// </summary>
    public string Content
    {
        get
        {
            if (!ShouldCollapseNewlines)
            {
                return buffer.Content;
            }
            return CollapseNewlines(buffer.Content);
        }
    }

    /// <summary>Links collected by the underlying buffer.</summary>
    public ArticleLinkCollection Links
        => buffer.Links;

    /// <summary>
    /// Extracts from the first non-null node in <paramref name="nodes"/> (only that one).
    /// </summary>
    public void Extract(params INode[] nodes)
        => Extract(nodes.FirstOrDefault(node => node != null));

    /// <summary>
    /// Resets the buffer, then extracts the text under <paramref name="current"/>.
    /// A null node just leaves the buffer reset.
    /// </summary>
    public void Extract(INode current)
    {
        buffer.Reset();
        if (current != null)
        {
            ExtractInnerTextHelper(current);
        }
    }

    //walks a node: text nodes are appended, elements are dispatched by tag name,
    //every other node type is ignored
    private void ExtractInnerTextHelper(INode current)
    {
        if (current.NodeType == NodeType.Text)
        {
            string text = current.TextContent;
            //keep anything with visible characters; keep pure whitespace only
            //when it does not contain a newline
            if (text.Trim().Length > 0 || !text.Contains('\n'))
            {
                buffer.Append(text);
            }
            return;
        }

        if (current.NodeType != NodeType.Element)
        {
            return;
        }

        HtmlElement element = current as HtmlElement;
        var nodeName = element?.NodeName.ToLower();

        if (!HtmlParser.ShouldProcessElement(element, nodeName))
        {
            return;
        }

        switch (nodeName)
        {
            case "a":
                //record the link, then keep its visible text
                Links.Add(element);
                ExtractChildrenText(current);
                break;

            case "br":
                buffer.AppendLine();
                break;

            case "img":
                if (ShouldConvertImages)
                {
                    buffer.Append(ConvertImage(element));
                }
                break;

            default:
                {
                    //block elements get a line of their own, before and after
                    bool isBlock = HtmlParser.ShouldDisplayAsBlock(element);
                    if (isBlock)
                    {
                        buffer.EnsureAtLineStart();
                    }
                    ExtractChildrenText(current);
                    if (isBlock)
                    {
                        buffer.EnsureAtLineStart();
                    }
                    break;
                }
        }
    }

    private void ExtractChildrenText(INode element)
    {
        foreach (var child in element.ChildNodes.ToList())
        {
            ExtractInnerTextHelper(child);
        }
    }

    //replaces newlines with NewlineReplacement and trims; since that can create
    //runs of whitespace, each run is then reduced to a single space
    private string CollapseNewlines(string s)
    {
        string withoutNewlines = s.Replace("\n", NewlineReplacement).Trim();
        return whitespace.Replace(withoutNewlines, " ");
    }

    //textual stand-in for an image: uses alt, falling back to title; empty when neither is set
    private string ConvertImage(HtmlElement element)
    {
        var label = element.GetAttribute("alt");
        if (string.IsNullOrEmpty(label))
        {
            label = element.GetAttribute("title");
        }
        if (string.IsNullOrEmpty(label))
        {
            return "";
        }
        return $"[Image: {label}] ";
    }
}
--------------------------------------------------------------------------------
/Gemipedia/API/WikipediaApiClient.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Net;
4 | using CacheComms;
5 | using Gemipedia.API.Models;
6 |
7 | namespace Gemipedia.API;
8 |
/// <summary>
/// Wikipedia API client. Contacts the API and gets model objects back
/// </summary>
public class WikipediaApiClient
{
    HttpRequestor Requestor;
    string Language;

    public long DownloadTimeMs => Requestor.DownloadTimeMs;

    public int DownloadSize => Requestor.BodySize ?? 0;

    /// <summary>
    /// Creates a client for one Wikipedia language edition.
    /// </summary>
    /// <param name="lang">Language code used as the wikipedia.org subdomain.</param>
    public WikipediaApiClient(string lang = "en")
    {
        Requestor = new HttpRequestor();
        Language = lang;
    }

    /// <summary>
    /// Runs a "geosearch" query (radius 5000, limit 100) around a coordinate.
    /// </summary>
    /// <param name="lat">Latitude.</param>
    /// <param name="lon">Longitude.</param>
    /// <returns>The summaries produced by <c>ResponseParser.ParseGeoSearch</c>.</returns>
    public List<ArticleSummary> GeoSearch(double lat, double lon)
    {
        //coordinates must be formatted with the invariant culture, otherwise a
        //comma-decimal culture would put "48,85" into the URL
        var url = new Uri(FormattableString.Invariant($"https://{Language}.wikipedia.org/w/api.php?action=query&format=json&list=geosearch&gscoord={lat}%7C{lon}&gsradius=5000&gslimit=100"));
        string json = FetchString(url);
        return ResponseParser.ParseGeoSearch(json);
    }

    //Gets the title of a random article
    public string GetRandomArticleTitle()
    {
        var url = new Uri($"https://{Language}.wikipedia.org/w/api.php?action=query&format=json&list=random&rnnamespace=0&rnlimit=1");
        //never cached, otherwise the same "random" article would come back
        string json = FetchString(url, false);
        return ResponseParser.ParseRandomArticle(json);
    }

    /// <summary>
    /// Gets an article
    /// </summary>
    /// <param name="title">Article title; it is URL-encoded into the "page" parameter.</param>
    /// <returns>The result of <c>ResponseParser.ParseArticleResponse</c> for the response.</returns>
    public Article GetArticle(string title)
    {
        var url = new Uri($"https://{Language}.wikipedia.org/w/api.php?action=parse&page={WebUtility.UrlEncode(title)}&prop=text&format=json");
        string json = FetchString(url);
        return ResponseParser.ParseArticleResponse(json);
    }

    /// <summary>
    /// Gets today's featured content. When today's feed has no popular articles,
    /// the popular articles of the previous day are used instead.
    /// </summary>
    public FeaturedContent GetFeaturedContent()
    {
        //if you fetch the most popular content early in the day, there aren't any popular articles
        //NOTE(review): an older comment here said not to use the cache for this request,
        //but the request has always gone through the cache - confirm which is intended
        var now = DateTime.Now;
        var featured = ResponseParser.ParseFeaturedContentResponse(FetchString(FeaturedFeedUrl(now)));

        if (featured.PopularArticles.Count == 0)
        {
            //fetch yesterday's most popular articles
            var yesterday = now.Subtract(new TimeSpan(24, 0, 0));
            var oldFeatured = ResponseParser.ParseFeaturedContentResponse(FetchString(FeaturedFeedUrl(yesterday)));
            featured.PopularArticles = oldFeatured.PopularArticles;
        }

        return featured;
    }

    //builds the featured feed URL for a day. The date is formatted with the invariant
    //culture: with the current culture "/" is replaced by that culture's date separator
    //and the year may come from a non-Gregorian calendar
    private Uri FeaturedFeedUrl(DateTime day)
        => new Uri($"https://{Language}.wikipedia.org/api/rest_v1/feed/featured/{day.ToString("yyyy/MM/dd", System.Globalization.CultureInfo.InvariantCulture)}");

    /// <summary>
    /// Gets the versions of an article that exist on other language editions.
    /// </summary>
    /// <param name="title">Article title.</param>
    public List<ArticleSummary> GetOtherLanguages(string title)
    {
        //API wants whitespace encoded as underscores
        title = title.Replace(" ", "_");
        var url = new Uri($"https://{Language}.wikipedia.org/w/rest.php/v1/page/{WebUtility.UrlEncode(title)}/links/language");
        string json = FetchString(url);
        return ResponseParser.ParseOtherLanguagesResponse(json);
    }

    /// <summary>
    /// Performs a search using the "rest.php/v1/search/page" endpoint
    /// </summary>
    /// <param name="query">Search text; it is URL-encoded into the "q" parameter.</param>
    /// <returns>Up to 25 results, as parsed by <c>ResponseParser.ParseSearchResponse</c>.</returns>
    public List<ArticleSummary> Search(string query)
    {
        var url = new Uri($"https://{Language}.wikipedia.org/w/rest.php/v1/search/page?q={WebUtility.UrlEncode(query)}&limit=25");
        string json = FetchString(url);
        return ResponseParser.ParseSearchResponse(json);
    }

    //gets an image
    public byte[] GetMedia(string url)
        => FetchBytes(url);

    //Downloads a string, if its not already cached. Returns "" when the request fails
    private string FetchString(Uri url, bool useCache = true)
    {
        var result = Requestor.GetAsString(url, useCache);
        if (!result)
        {
            return "";
        }
        return Requestor.BodyText;
    }

    /// <summary>
    /// Fetches the bytes for a URL. If it exists in the cache, it gets pulled
    /// otherwise a network request happens, and the results are cached
    /// </summary>
    /// <param name="url">Absolute URL to fetch.</param>
    /// <param name="useCache">Passed through to the requestor.</param>
    /// <returns>The response body, or null when the request fails.</returns>
    private byte[] FetchBytes(string url, bool useCache = true)
    {
        var result = Requestor.GetAsBytes(new Uri(url), useCache);
        if (!result)
        {
            return null;
        }
        return Requestor.BodyBytes;
    }
}
--------------------------------------------------------------------------------
/Gemipedia/RouteOptions.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.IO;
3 | using System.Net;
4 |
5 | namespace Gemipedia;
6 |
/// <summary>
/// Builds the URLs the application links to. The Base* properties hold the route
/// for each view; the methods append the language (when it is not "en") and the
/// URL-encoded arguments.
/// </summary>
public static class RouteOptions
{
    #region base URLs

    /// <summary>
    /// Base URL to use to view an article. Actual article passed via query string
    /// </summary>
    public static string BaseArticleUrl { get; set; }

    public static string BaseFeaturedContenteUrl { get; set; }

    /// <summary>
    /// BaseURL to use to view geographic data.
    /// </summary>
    public static string BaseGeoUrl { get; set; }

    public static string BaseImageGallerUrl { get; set; }

    public static string BaseLanguageUrl { get; set; }

    public static string BaseLonLatUrl { get; set; }

    /// <summary>
    /// URL to use to proxy media. actual media path passed via query string
    /// </summary>
    public static string BaseMediaProxyUrl { get; set; }

    public static string BaseOtherLanguagesUrl { get; set; }

    public static string BaseRandomArticleUrl { get; set; }

    public static string BaseReferencesUrl { get; set; }

    public static string BaseSearchUrl { get; set; }

    public static string BaseSetLanguageUrl { get; set; }

    public static string BaseWelcomeUrl { get; set; }

    #endregion

    public static string ArticleUrl()
        => $"{AddLanguage(BaseArticleUrl)}";

    public static string ArticleUrl(string title)
        => $"{AddLanguage(BaseArticleUrl)}?{WebUtility.UrlEncode(title)}";

    //always includes the language segment, even for "en"
    public static string ArticleUrl(string title, string forceInLang)
        => $"{BaseArticleUrl}/{forceInLang}?{WebUtility.UrlEncode(title)}";

    public static string FeaturedContent()
        => $"{AddLanguage(BaseFeaturedContenteUrl)}";

    public static string GeoUrl(string geohackUrl)
        => $"{AddLanguage(BaseGeoUrl)}?{WebUtility.UrlEncode(geohackUrl)}";

    public static string ImageGalleryUrl(string title)
        => $"{AddLanguage(BaseImageGallerUrl)}?{WebUtility.UrlEncode(title)}";

    //NOTE(review): latitude/longitude are formatted with the current culture here;
    //confirm the handler that parses "lat"/"lon" before switching this to invariant formatting
    public static string LonLatUrl(double latitude, double longitude, string articleTitle)
        => $"{AddLanguage(BaseLonLatUrl)}?lat={latitude}&lon={longitude}&title={WebUtility.UrlEncode(articleTitle)}";

    /// <summary>
    /// Builds the proxy URL for a piece of media.
    /// </summary>
    /// <param name="url">URL of the original media; passed URL-encoded as the query string.</param>
    public static string MediaProxyUrl(string url)
    {
        //we need to have an extension on the filename of the media proxy URL, so clients
        //will render it as an inline image. Try and figure out what to use, but fall back
        //to a dummy "jpg" if nothing works
        string ext = ".jpg";
        try
        {
            var uri = new Uri(url);
            ext = Path.GetExtension(uri.AbsolutePath);
            ext = String.IsNullOrEmpty(ext) ? ".jpg" : ext;
        }
        catch (Exception)
        {
            //deliberate best effort: any unparsable URL just gets the default extension
            ext = ".jpg";
        }
        return $"{BaseMediaProxyUrl}{ext}?{WebUtility.UrlEncode(url)}";
    }

    public static string OtherLanguagesUrl(string title)
        => $"{AddLanguage(BaseOtherLanguagesUrl)}?{WebUtility.UrlEncode(title)}";

    public static string PdfUrl(string escapedTitle)
        => $"https://{UserOptions.WikipediaVersion}.wikipedia.org/api/rest_v1/page/pdf/{WebUtility.UrlEncode(escapedTitle)}";

    public static string RandomArticleUrl()
        => $"{AddLanguage(BaseRandomArticleUrl)}";

    public static string ReferencesUrl(string title)
        => $"{AddLanguage(BaseReferencesUrl)}?name={WebUtility.UrlEncode(title)}";

    //references for a single section; "section" is a second query parameter after "name"
    public static string ReferencesUrl(string title, int sectionNum)
        => $"{AddLanguage(BaseReferencesUrl)}?name={WebUtility.UrlEncode(title)}&section={sectionNum}";

    public static string SearchUrl()
        => $"{AddLanguage(BaseSearchUrl)}";

    public static string SearchUrl(string query)
        => $"{AddLanguage(BaseSearchUrl)}?{WebUtility.UrlEncode(query)}";

    public static string SelectLanguageUrl()
        => $"{AddLanguage(BaseLanguageUrl)}";

    public static string SetLanguageUrl()
        => BaseSetLanguageUrl;

    public static string WelcomeUrl()
        => $"{AddLanguage(BaseWelcomeUrl)}";

    //always includes the language segment, even for "en"
    public static string WelcomeUrl(string forceLang)
        => $"{BaseWelcomeUrl}/{forceLang}";

    public static string WikipediaSourceUrl(string escapedTitle)
        => $"https://{UserOptions.WikipediaVersion}.wikipedia.org/wiki/{WebUtility.UrlEncode(escapedTitle)}";

    //if we can help it, avoid adding a language, since it increases the size of the URL
    //which can cause problems if we have to proxy something long
    private static string AddLanguage(string url)
        => (UserOptions.WikipediaVersion == "en") ? url : url + '/' + UserOptions.WikipediaVersion;
}
--------------------------------------------------------------------------------
/Gemipedia/API/ResponseParser.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Linq;
4 | using System.Net;
5 | using System.Text.RegularExpressions;
6 | using Gemipedia.API.Models;
7 | using Newtonsoft.Json.Linq;
8 |
9 | namespace Gemipedia.API;
10 |
/// <summary>
/// Parses the JSON responses of the Wikipedia API into model objects
/// </summary>
public static class ResponseParser
{
    /// <summary>
    /// Parses an "action=parse" response.
    /// </summary>
    /// <returns>The article, or null when the response reports an error or has no "parse" node.</returns>
    public static Article ParseArticleResponse(string json)
    {
        var response = ParseJson(json);

        if (response["error"] != null)
        {
            //error loading page!
            return null;
        }

        var parse = response["parse"];
        if (parse == null)
        {
            //nothing to build an article from; treat it like an error response
            return null;
        }

        return new Article
        {
            Title = Cleanse(parse["title"]),
            PageId = Convert.ToInt64(Cleanse(parse["pageid"])),
            HtmlText = Cleanse(parse["text"]["*"]),
        };
    }

    /// <summary>
    /// Parses a "geosearch" response. Returns an empty list when the response has no results.
    /// </summary>
    public static List<ArticleSummary> ParseGeoSearch(string json)
    {
        var response = ParseJson(json);
        List<ArticleSummary> ret = new List<ArticleSummary>();

        if (response["query"] != null && response["query"]["geosearch"] is JArray nearby)
        {
            //skip the first since that's the article we are on
            foreach (JObject result in nearby.Skip(1))
            {
                ret.Add(new ArticleSummary
                {
                    Title = Cleanse(result["title"]),
                    Distance = (int)Math.Round(Convert.ToDouble(result["dist"]?.ToString() ?? "0"))
                });
            }
        }

        return ret;
    }

    /// <summary>
    /// Parses a search response. Returns an empty list when the response has no "pages" array.
    /// </summary>
    public static List<ArticleSummary> ParseSearchResponse(string json)
    {
        var response = ParseJson(json);
        List<ArticleSummary> ret = new List<ArticleSummary>();

        if (response["pages"] is JArray pages)
        {
            foreach (JObject result in pages)
            {
                ret.Add(new ArticleSummary
                {
                    Title = StripNewlines(Cleanse(result["title"])),
                    Excerpt = StripNewlines(StripHtml(Cleanse(result["excerpt"]))),
                    Description = StripNewlines(Cleanse(result["description"])),
                    ThumbnailUrl = GetThumbnailUrl(result["thumbnail"] as JObject)
                });
            }
        }
        return ret;
    }

    /// <summary>
    /// Parses the "links/language" response, which is a JSON array rather than an object.
    /// </summary>
    public static List<ArticleSummary> ParseOtherLanguagesResponse(string json)
    {
        var response = JArray.Parse(json);
        List<ArticleSummary> ret = new List<ArticleSummary>();
        foreach (JObject result in response)
        {
            ret.Add(new ArticleSummary
            {
                Title = Cleanse(result["title"]),
                LanguageCode = Cleanse(result["code"])
            });
        }
        return ret;
    }

    /// <summary>
    /// Parses the featured feed: "tfa" becomes the featured article (null when absent),
    /// "mostread" becomes the popular articles (empty when absent).
    /// </summary>
    public static FeaturedContent ParseFeaturedContentResponse(string json)
    {
        var response = ParseJson(json);
        return new FeaturedContent
        {
            FeaturedArticle = ParseArticleSummary(response["tfa"] as JObject),
            PopularArticles = ParsePopularArticles(response["mostread"] as JObject)
        };
    }

    /// <summary>
    /// Returns the title of the first entry of a "list=random" response.
    /// </summary>
    public static string ParseRandomArticle(string json)
    {
        var response = ParseJson(json);
        return response["query"]["random"][0]["title"].Value<string>();
    }

    //at most the first 25 entries of "articles"; empty when the node or the array is missing
    private static List<ArticleSummary> ParsePopularArticles(JObject articles)
    {
        List<ArticleSummary> ret = new List<ArticleSummary>();

        if (articles?["articles"] is JArray popular)
        {
            foreach (JObject article in popular.Take(25))
            {
                ret.Add(ParseArticleSummary(article));
            }
        }
        return ret;
    }

    private static ArticleSummary ParseArticleSummary(JObject summary)
        => (summary != null) ?
            new ArticleSummary
            {
                Title = StripNewlines(Cleanse(summary["normalizedtitle"])),
                Description = StripNewlines(Cleanse(summary["description"])),
                //already text formatted!
                Excerpt = StripNewlines(Cleanse(summary["extract"])),
                ThumbnailUrl = GetThumbnailUrl(summary["thumbnail"] as JObject)
            } : null;

    //reads "url", falling back to "source"; returns "" when there is no thumbnail
    private static string GetThumbnailUrl(JObject thumb)
    {
        if (thumb != null)
        {
            var url = thumb["url"]?.ToString() ??
                thumb["source"]?.ToString() ?? "";
            if (url.Length > 0)
            {
                return CommonUtils.EnsureHttps(url);
            }
        }

        return "";
    }

    private static string StripNewlines(string s)
        => s.Replace("\r\n", " ").Replace("\r", " ").Replace("\n", " ").Trim();

    //null tokens become ""
    private static string Cleanse(JToken token)
        => token?.ToString() ?? "";

    private static JObject ParseJson(string json)
        => JObject.Parse(json);

    //removes tags, decodes entities, and appends "..." to the result
    private static string StripHtml(string s)
        => WebUtility.HtmlDecode(Regex.Replace(s, @"<[^>]*>", "")) + "...";
}
157 |
158 |
--------------------------------------------------------------------------------
/Gemipedia/Renderer/ArticleRenderer.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.IO;
3 | using System.Linq;
4 | using Gemipedia.Models;
5 |
6 | namespace Gemipedia.Renderer;
7 |
/// <summary>
/// Renders a parsed page as text: a header, every section (recursively), and a
/// footer with links to related resources.
/// </summary>
public class ArticleRenderer
{
    TextWriter Writer;
    ParsedPage Page;
    //running number of the section being rendered; used in per-section reference links
    int sectionID = 0;
    //grab and cache it
    string[] articleLinkSections = UserOptions.ArticleLinkSections;

    /// <summary>
    /// Renders the whole article to <paramref name="writer"/>.
    /// </summary>
    /// <param name="parsedPage">Page to render.</param>
    /// <param name="writer">Destination for the rendered text.</param>
    public void RenderArticle(ParsedPage parsedPage, TextWriter writer)
    {
        Writer = writer;
        Page = parsedPage;
        //restart numbering so a reused renderer does not carry section IDs over from a previous article
        sectionID = 0;

        RenderArticleHeader();
        foreach (var section in parsedPage.Sections)
        {
            Writer.Write(RenderSection(section));
        }
        RenderArticleFooter(parsedPage);
    }

    private void RenderArticleHeader()
    {
        Writer.WriteLine($"# {Page.Title}");
        int count = Page.GetAllImages().Count;
        if (count > 0)
        {
            Writer.WriteLine($"=> {RouteOptions.ImageGalleryUrl(Page.Title)} Gallery: {count} images");
        }
        //TODO: Geo here!
        Writer.WriteLine($"=> {RouteOptions.SearchUrl(Page.Title)} Other articles that mention '{Page.Title}'");
        Writer.WriteLine();
    }

    private void RenderArticleFooter(ParsedPage parsedPage)
    {
        Writer.WriteLine();
        Writer.WriteLine("## Article Resources");
        Writer.WriteLine($"=> {RouteOptions.ReferencesUrl(Page.Title)} List of all {parsedPage.GetReferenceCount()} referenced articles");
        Writer.WriteLine($"=> {RouteOptions.SearchUrl(Page.Title)} Search for articles that mention '{Page.Title}'");
        Writer.WriteLine($"=> {RouteOptions.OtherLanguagesUrl(Page.Title)} Read this article in another language");
        Writer.WriteLine($"=> {RouteOptions.PdfUrl(Page.EscapedTitle)} Download article PDF for offline access");
        Writer.WriteLine($"=> {RouteOptions.WikipediaSourceUrl(Page.EscapedTitle)} Source on Wikipedia website");
    }

    /// <summary>
    /// Appends an infobox to <paramref name="buffer"/>: a "Quick Facts" heading, then its
    /// navigation suggestions, geo items, media and content, in that order.
    /// </summary>
    public void RenderInfobox(SimpleBuffer buffer, InfoboxItem infobox)
    {
        var title = string.IsNullOrEmpty(infobox.CustomTitle)
            ? "Quick Facts" :
            $"Quick Facts: {infobox.CustomTitle}";

        buffer.EnsureAtLineStart();
        buffer.AppendLine($"## {title}");

        var navSuggestions = infobox.NavSuggestions;
        if (navSuggestions.Any())
        {
            //render navigation items at top
            foreach (var nav in navSuggestions)
            {
                ContentRenderer.RenderNavSuggestion(buffer, nav);
            }
            //add a blank line, since nav suggestion can be long
            buffer.AppendLine();
        }

        foreach (var geo in infobox.GeoItems)
        {
            ContentRenderer.RenderGeo(buffer, geo);
        }

        foreach (var media in infobox.MediaItems)
        {
            ContentRenderer.RenderMedia(buffer, media as MediaItem);
        }

        buffer.EnsureAtLineStart();
        foreach (var item in infobox.ContentItems)
        {
            buffer.Append(item.Content);
        }
    }

    /// <summary>
    /// Renders a section and, recursively, its sub-sections.
    /// </summary>
    /// <returns>The rendered text, or an empty string when the section produced no content.</returns>
    public string RenderSection(Section section)
    {
        //every section consumes an ID, even one that ends up rendering nothing
        sectionID++;

        SimpleBuffer buffer = new SimpleBuffer();
        if (section.HasNavSuggestions)
        {
            //render navigation items at top
            foreach (var nav in section.NavSuggestions)
            {
                ContentRenderer.RenderNavSuggestion(buffer, nav);
            }
            //add a blank line, since nav suggestion can be long
            buffer.AppendLine();
        }

        foreach (var geo in section.GeoItems)
        {
            ContentRenderer.RenderGeo(buffer, geo);
        }

        //other content below, in order
        foreach (SectionItem item in section.GeneralContent)
        {
            switch (item)
            {
                case MediaItem media:
                    ContentRenderer.RenderMedia(buffer, media);
                    break;

                case ContentItem content:
                    buffer.Append(content.Content);
                    break;
            }
        }
        foreach (var infoBox in section.Infoboxes)
        {
            RenderInfobox(buffer, infoBox);
        }

        if (section.Links.HasLinks && !ShouldExcludeSectionIndex(section))
        {
            buffer.EnsureAtLineStart();
            buffer.AppendLine($"=> {RouteOptions.ReferencesUrl(Page.Title, sectionID)} Section links: ({section.Links.Count} Articles)");
        }

        foreach (var subSection in section.SubSections)
        {
            buffer.Append(RenderSection(subSection));
        }

        //if a section has no content, don't write anything
        if (!buffer.HasContent)
        {
            return "";
        }

        if (!section.IsSpecial)
        {
            if (section.SectionDepth == 2)
            {
                buffer.PrependLine($"## {section.Title}");
            }
            else
            {
                //all other sections are at a level 3
                buffer.PrependLine($"### {section.Title}");
            }
        }
        return buffer.Content;
    }

    //true when the section's lowercased title is one of the configured article link sections
    private bool ShouldExcludeSectionIndex(Section section)
        => articleLinkSections.Contains(section.Title?.ToLower());
}
164 |
--------------------------------------------------------------------------------
/Gemipedia/Converter/Special/Tables/TableParser.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Linq;
4 | using System.Text.RegularExpressions;
5 | using AngleSharp.Html.Dom;
6 | using Gemipedia.Models;
7 |
8 | namespace Gemipedia.Converter.Special.Tables;
9 |
10 | public class TableParser : IArticleLinks
11 | {
12 |
13 | Row currRow;
14 | Table table;
15 | TextExtractor textExtractor;
16 | //used when adding row/colspans to fix mismatched tables
17 | int currRowWidth;
18 |
19 | public ArticleLinkCollection Links { get; private set; }
20 |
/// <summary>
/// Creates a parser with an empty table, an empty link collection, and a text
/// extractor that converts images and collapses newlines.
/// </summary>
public TableParser()
{
    Links = new ArticleLinkCollection();
    table = new Table();

    textExtractor = new TextExtractor();
    textExtractor.ShouldConvertImages = true;
    textExtractor.ShouldCollapseNewlines = true;
}
31 |
/// <summary>
/// Parses a table element into the parser's table model.
/// </summary>
/// <param name="element">Element whose children are walked.</param>
/// <returns>The table built by this parser.</returns>
public Table ParseTable(HtmlElement element)
{
    //walk every direct child
    foreach (var child in element.Children.ToList())
    {
        ParseTag((HtmlElement)child);
    }

    //the last row is still pending at this point
    AppendRow();

    //go back and place any rowspan placeholder cells
    UpdateForRowSpans();

    return table;
}
40 |
41 | private void ParseChildren(HtmlElement element)
42 | => element.Children.ToList().ForEach(x => ParseTag((HtmlElement)x));
43 |
44 | private void ParseTag(HtmlElement current)
45 | {
46 |
47 | switch (current.NodeName.ToLower())
48 | {
49 | case "caption":
50 | textExtractor.Extract(current);
51 | table.Caption = textExtractor.Content;
52 | Links.Add(textExtractor);
53 | break;
54 |
55 | case "tr":
56 | {
57 | AppendRow();
58 | currRow = new Row();
59 | ParseChildren(current);
60 | break;
61 | }
62 |
63 | case "td":
64 | case "th":
65 | AddCell(current);
66 | break;
67 |
68 | //pass through
69 | case "tbody":
70 | case "tfoot":
71 | case "thead":
72 | ParseChildren(current);
73 | break;
74 | }
75 | }
76 |
77 | private void AppendRow()
78 | {
79 | if (currRow != null && !currRow.IsEmpty)
80 | {
81 | table.Rows.Add(currRow);
82 | }
83 | }
84 |
85 | private void AddCell(HtmlElement cell)
86 | {
87 | if (currRow != null)
88 | {
89 | textExtractor.Extract(cell);
90 | string contents = textExtractor.Content;
91 | Links.Add(textExtractor);
92 |
93 | currRow.Cells.Add(new Cell
94 | {
95 | IsHeader = (cell.NodeName == "TH"),
96 | Contents = contents,
97 | ColSpan = ParseSpan(cell.GetAttribute("colspan")),
98 | RowSpan = ParseSpan(cell.GetAttribute("rowspan")),
99 | IsRowSpanHolder = false
100 | });
101 | }
102 | }
103 |
104 | //parse the value of a row or column span. Browsers are support liberal on this
105 | // "3;" works. Defaults to 1 if you can't parse anything
106 | private int ParseSpan(string attribValue)
107 | {
108 | try
109 | {
110 | if (attribValue != null)
111 | {
112 | var match = Regex.Match(attribValue, @"^(\d+)");
113 | var value = match.Success ? Convert.ToInt32(match.Groups[1].Value) : 1;
114 | //colspan and rowspan must be >= 1
115 | return (value > 0) ? value : 1;
116 | }
117 | }
118 | catch (Exception)
119 | { }
120 | return 1;
121 | }
122 |
123 | private int RowWidthThrottle(int colSpan)
124 | {
125 | if (currRowWidth + colSpan <= table.MaxColumns)
126 | {
127 | currRowWidth += colSpan;
128 | return colSpan;
129 | }
130 | var newColspan = Math.Max((table.MaxColumns - currRowWidth), 1);
131 | currRowWidth += newColspan;
132 | return newColspan;
133 | }
134 |
135 |
136 | private void UpdateForRowSpans()
137 | {
138 | for (int rowIndex = 1; rowIndex < table.Rows.Count; rowIndex++)
139 | {
140 | List newRow = new List();
141 | Queue oldRow = new Queue(table.Rows[rowIndex].Cells);
142 | Queue prevRow = new Queue| (table.Rows[rowIndex - 1].Cells);
143 | currRowWidth = 0;
144 | while (prevRow.Count > 0)
145 | {
146 | var prevRowCell = prevRow.Dequeue();
147 |
148 | if (prevRowCell.RowSpan > 1)
149 | {
150 | //push on a placeholder
151 | newRow.Add(new Cell
152 | {
153 | IsRowSpanHolder = true,
154 | RowSpan = prevRowCell.RowSpan - 1,
155 | ColSpan = RowWidthThrottle(prevRowCell.ColSpan),
156 | IsHeader = prevRowCell.IsHeader,
157 | });
158 | }
159 | else
160 | {
161 | for (int i = 0; i < prevRowCell.ColSpan; i++)
162 | {
163 | //pull cell from current row == the colspan of
164 | if (oldRow.Count > 0)
165 | {
166 | var cell = oldRow.Dequeue();
167 | cell.ColSpan = RowWidthThrottle(cell.ColSpan);
168 | newRow.Add(cell);
169 | i += cell.ColSpan - 1;
170 | }
171 | }
172 | }
173 | }
174 | //There should not be anything left in oldRow. If so, the
175 | //number of cells in the source table were mismatched, so try
176 | //and handle that
177 | while (oldRow.Count > 0)
178 | {
179 | newRow.Add(oldRow.Dequeue());
180 | }
181 | table.Rows[rowIndex].Cells = newRow;
182 | }
183 | }
184 |
185 | }
--------------------------------------------------------------------------------
/Gemipedia/Converter/Special/GeohackParser.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Specialized;
3 | using System.Text.RegularExpressions;
4 | using System.Web;
5 |
6 | namespace Gemipedia.Converter.Special;
7 |
/// <summary>
/// Parses the URLs used by Geohack.toolforge.org
/// </summary>
public class GeohackParser
{
    //Degrees with optional minutes and seconds, then a hemisphere letter, for both axes
    private static readonly Regex DegreeMinuteSecondDirection = new Regex(@"([\d\.]+)_+(?:([\d\.]+)_)?(?:([\d\.]+)_+)?([NS])_([\d\.]+)_+(?:([\d\.]+)_+)?(?:([\d\.]+)_)?([EW])", RegexOptions.Compiled | RegexOptions.IgnoreCase);
    //Decimal degrees (a leading minus is allowed) followed by a hemisphere letter
    private static readonly Regex DegreeDirection = new Regex(@"([\-\.\d]+)_([NS])_([\-\.\d]+)_([EW])", RegexOptions.Compiled | RegexOptions.IgnoreCase);

    public string ArticleName { get; private set; }

    public bool IsEarth
        => string.Equals(Globe, "earth", StringComparison.OrdinalIgnoreCase);

    /// <summary>
    /// True when a latitude/longitude pair was found in the "params" value.
    /// </summary>
    public bool IsValid { get; private set; }

    public string Globe { get; private set; }

    public string GeohackUrl { get; private set; }

    public string Language { get; private set; }

    public double Latitude { get; private set; }

    public double Longitude { get; private set; }

    public string Title { get; private set; }

    public string Type { get; private set; }

    /// <summary>
    /// Display form of the parsed coordinates; stays null when <see cref="IsValid"/> is false.
    /// </summary>
    public string Coordinates { get; private set; }

    /// <summary>
    /// Returns the "title" query value when one was given, otherwise the article name.
    /// </summary>
    public string GetPrettyName()
        => Title.Length > 0 ? Title : ArticleName;

    public bool HasTypeDescription
        => GetTypeDescription().Length > 0;

    /// <summary>
    /// Maps the "type:" value to a display label.
    /// </summary>
    /// <returns>The label, or an empty string for a missing or unrecognized type.</returns>
    public string GetTypeDescription()
    {
        switch (Type)
        {
            case "airport":
            case "city":
            case "country":
            case "event":
            case "forest":
            case "glacier":
            case "landmark":
            case "mountain":
            case "river":
            case "satellite":
            case "state":
                //these read fine as-is, just capitalize the first letter
                return char.ToUpperInvariant(Type[0]) + Type.Substring(1);

            case "edu":
                return "Educational Institute";

            case "railwaystation":
                return "Railway Station";

            case "adm1st":
            case "adm2nd":
            case "adm3rd":
                return "Municipality";

            case "waterbody":
                return "Body of water";

            default:
                return "";
        }
    }

    NameValueCollection QueryString;

    //empty rather than null when the URL has no "params", so the regex calls below
    //simply fail to match instead of throwing
    string ParamString => QueryString["params"] ?? "";

    /// <summary>
    /// Parses the query string of a Geohack URL.
    /// </summary>
    /// <param name="geohackUrl">URL to parse.</param>
    /// <exception cref="ArgumentException">GeoParser.IsGeohackUrl rejects the URL.</exception>
    public GeohackParser(string geohackUrl)
    {
        if (!GeoParser.IsGeohackUrl(geohackUrl))
        {
            throw new ArgumentException("Not a Geohack url");
        }

        Uri url = new Uri(geohackUrl);
        GeohackUrl = url.AbsoluteUri;

        QueryString = HttpUtility.ParseQueryString(url.Query);

        IsValid = ParseLatLon();
        ArticleName = ParseArticleName();
        Globe = ExtractParam("globe") ?? "earth";
        Language = QueryString["language"] ?? "en";
        Title = QueryString["title"] ?? "";
        Type = ExtractParam("type");
    }

    //Tries the degree/minute/second form first, then plain decimal degrees
    private bool ParseLatLon()
    {
        var dmsMatch = DegreeMinuteSecondDirection.Match(ParamString);
        if (dmsMatch.Success)
        {
            ParseDMSD(dmsMatch);
            return true;
        }
        var ddMatch = DegreeDirection.Match(ParamString);
        if (ddMatch.Success)
        {
            ParseDD(ddMatch);
            return true;
        }
        return false;
    }

    private string ParseArticleName()
        => QueryString["pagename"]?.Replace("_", " ") ?? "";

    //The URL always uses '.' as the decimal separator, so parse culture-independently.
    //Throws FormatException for text that is not a number (e.g. "1.2.3")
    private static double ParseInvariant(string value)
        => Convert.ToDouble(value, System.Globalization.CultureInfo.InvariantCulture);

    //An optional group that did not take part in the match counts as 0
    private double NormalizeDMS(Group g)
    {
        var val = g.ToString();
        return val.Length > 0 ? ParseInvariant(val) : 0d;
    }

    private void ParseDMSD(Match match)
    {
        //DD = d + (min/60) + (sec/3600)
        Latitude = NormalizeDMS(match.Groups[1]) +
            NormalizeDMS(match.Groups[2]) / 60d +
            NormalizeDMS(match.Groups[3]) / 3600d;

        if (match.Groups[4].ToString().Equals("s", StringComparison.OrdinalIgnoreCase))
        {
            Latitude *= -1;
        }

        Longitude = NormalizeDMS(match.Groups[5]) +
            NormalizeDMS(match.Groups[6]) / 60d +
            NormalizeDMS(match.Groups[7]) / 3600d;

        if (match.Groups[8].ToString().Equals("w", StringComparison.OrdinalIgnoreCase))
        {
            Longitude *= -1;
        }

        Coordinates = string.Format("{0}{1}{2}{3} {4}{5}{6}{7}",
            FormatGroup(match.Groups[1], "°"),
            FormatGroup(match.Groups[2], "′"),
            FormatGroup(match.Groups[3], "″"),
            FormatGroup(match.Groups[4]),
            FormatGroup(match.Groups[5], "°"),
            FormatGroup(match.Groups[6], "′"),
            FormatGroup(match.Groups[7], "″"),
            FormatGroup(match.Groups[8]));
    }

    //Appends the unit symbol only when the group actually captured something
    private string FormatGroup(Group g, string symbol = "")
    {
        var val = g.ToString();
        return val.Length > 0 ? $"{val}{symbol}" : "";
    }

    private void ParseDD(Match match)
    {
        //already decimal degrees; only the hemisphere letter can flip the sign
        Latitude = ParseInvariant(match.Groups[1].ToString());

        if (match.Groups[2].ToString().Equals("s", StringComparison.OrdinalIgnoreCase))
        {
            Latitude *= -1;
        }

        Longitude = ParseInvariant(match.Groups[3].ToString());
        if (match.Groups[4].ToString().Equals("w", StringComparison.OrdinalIgnoreCase))
        {
            Longitude *= -1;
        }
        Coordinates = string.Format("{0}°{1} {2}°{3}",
            match.Groups[1], match.Groups[2],
            match.Groups[3], match.Groups[4]);
    }

    //Pulls a "name:value" pair out of the params string; null when it is absent
    private string ExtractParam(string paramName)
    {
        var match = Regex.Match(ParamString, @$"_?{paramName}\:([a-zA-Z0-9]+)_?");
        if (match.Success && match.Groups.Count > 1)
        {
            return match.Groups[1].ToString();
        }
        return null;
    }
}
--------------------------------------------------------------------------------
/Gemipedia/Converter/Special/Tables/TableRenderer.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Text;
4 |
5 | namespace Gemipedia.Converter.Special.Tables;
6 |
/// <summary>
/// Renders a <see cref="Table"/> as a fixed-width text grid inside a preformatted block.
/// Use <see cref="RenderTable"/>; the instance members are private.
/// </summary>
public class TableRenderer
{
    int ColumnWidth = 0;

    Table Table;
    StringBuilder buffer;

    private TableRenderer(Table table)
    {
        Table = table;
        buffer = new StringBuilder();
    }

    //Writes the optional caption line, then the grid. Expects FormatContents to have
    //run and the table to have at least one row (RenderTable checks IsEmpty first)
    private string Render()
    {
        if (Table.HasCaption)
        {
            buffer.AppendLine($"### Table: {Table.Caption}");
        }
        buffer.AppendLine("```Table");
        buffer.AppendLine(GenerateDividerLine(Table.Rows[0], true));

        for (int i = 0; i < Table.Rows.Count; i++)
        {
            var row = Table.Rows[i];
            RenderRow(row);
            //are we on the last row?
            buffer.AppendLine(GenerateDividerLine(row, (i + 1) == Table.Rows.Count));
        }
        buffer.AppendLine("```");
        return buffer.ToString();
    }

    //Emits one output line per formatted text line of the row
    private void RenderRow(Row row)
    {
        for (int lineNum = 0, max = row.LineHeight; lineNum < max; lineNum++)
        {
            StringBuilder lineBuffer = new StringBuilder();
            for (int cellIndex = 0; cellIndex < row.Cells.Count; cellIndex++)
            {
                //leading edge
                if (cellIndex == 0)
                {
                    lineBuffer.Append("|");
                }
                lineBuffer.Append(row.Cells[cellIndex].FormattedLines[lineNum]);
                lineBuffer.Append("|");
            }
            buffer.AppendLine(lineBuffer.ToString());
        }
    }

    //Builds the horizontal rule drawn under a row (or above the first one)
    private string GenerateDividerLine(Row row, bool IsEdge = false)
    {
        StringBuilder sb = new StringBuilder();
        sb.Append('+');
        for (int i = 0; i < row.Cells.Count; i++)
        {
            //do we need to leave it open or draw a horizontal line?
            //for the top/bottom edges, we always draw the line
            var cell = row.Cells[i];
            char fill = (!IsEdge && cell.RowSpan > 1) ? ' ' : '-';
            sb.Append(new string(fill, cell.FormattedWidth));
            sb.Append('+');
        }
        return sb.ToString();
    }

    //Word-wraps every cell and pads the cells of each row to a common height
    private void FormatContents()
    {
        ColumnWidth = Math.Max((60 / Table.MaxColumns), 15);

        foreach (var row in Table.Rows)
        {
            foreach (var cell in row.Cells)
            {
                cell.FormattedLines = FormatCell(cell, ColumnWidth);
            }

            //read only after every cell of the row has been formatted
            int maxHeight = row.LineHeight;

            foreach (var cell in row.Cells)
            {
                VerticalPad(cell, maxHeight, ColumnWidth);
            }
        }
    }

    //Wraps the cell's text into lines that are each padded to the cell's width
    private List<string> FormatCell(Cell cell, int columnWidth)
    {
        //is this a rowspan placeholder?
        if (cell.IsRowSpanHolder)
        {
            return FormatPlaceholder(cell, columnWidth);
        }

        var input = cell.IsHeader ? cell.Contents.ToUpper() : cell.Contents;
        //a spanning cell also takes over the separators between the columns it covers
        int maxWidth = (columnWidth * cell.ColSpan) + (cell.ColSpan - 1);

        List<string> lines = new List<string>();

        string[] words = input.Split(' ');

        string line = "";
        int lineLength = 0;
        foreach (string word in words)
        {
            int wordLength = UnicodeString.GetWidth(word);
            //do we have extra-wide characters?
            bool hasWideCharacters = (wordLength != word.Length);
            //can the word fit?
            if (wordLength > maxWidth)
            {
                //nope, we are going to need to hard slice this word to fit to the width
                //this is complex if we have wide characters

                //Step 1: flush anything still in the buffer
                if (lineLength > 0)
                {
                    lines.Add(PadCell(line.Trim(), maxWidth, cell.IsHeader));
                    line = "";
                    lineLength = 0;
                }

                //step 2: determine the amount of characters to use in each hard slice
                int substringLength = maxWidth;
                if (hasWideCharacters && word.Length < maxWidth)
                {
                    //if we have wide characters, we need to do a smaller slice.
                    //Never let it reach 0, or the loop below would not advance
                    substringLength = Math.Max(word.Length / 2, 1);
                }

                int start = 0;
                while (start < word.Length)
                {
                    lines.Add(PadCell(word.Substring(start, Math.Min(substringLength, word.Length - start)), maxWidth, cell.IsHeader));
                    start += substringLength;
                }
                continue;
            }
            //will the buffer be too big? if so, flush it
            if ((lineLength + wordLength) > maxWidth)
            {
                lines.Add(PadCell(line.Trim(), maxWidth, cell.IsHeader));
                line = "";
                lineLength = 0;
            }
            line += word;
            lineLength += wordLength;
            //only add the separating space when the word does not already fill the line
            if (wordLength + 1 <= maxWidth)
            {
                line += " ";
                lineLength += 1;
            }
        }
        //flush any remaining in buffer
        if (lineLength > 0)
        {
            lines.Add(PadCell(line.Trim(), maxWidth, cell.IsHeader));
        }
        return lines;
    }

    //A rowspan placeholder is a single blank line of the cell's width
    private List<string> FormatPlaceholder(Cell cell, int columWidth)
    {
        int maxWidth = (columWidth * cell.ColSpan) + (cell.ColSpan - 1);
        return new List<string> { new string(' ', maxWidth) };
    }

    //Pads s with spaces up to the given display width. When centering, the padding is
    //split between both sides and the left side gets the extra space of an odd amount
    private string PadCell(string s, int length, bool center)
    {
        int padding = length - UnicodeString.GetWidth(s);
        if (padding <= 0)
        {
            return s;
        }
        int left = center ? (padding + 1) / 2 : 0;
        return new string(' ', left) + s + new string(' ', padding - left);
    }

    //Appends blank lines until the cell has the requested number of lines
    private void VerticalPad(Cell cell, int lines, int width)
    {
        int maxWidth = (width * cell.ColSpan) + (cell.ColSpan - 1);
        while (cell.FormattedLines.Count < lines)
        {
            cell.FormattedLines.Add(new string(' ', maxWidth));
        }
    }

    /// <summary>
    /// Renders the table as text.
    /// </summary>
    /// <param name="Table">Table to render.</param>
    /// <returns>The rendered text, or an empty string when the table is empty.</returns>
    public static string RenderTable(Table Table)
    {
        if (Table.IsEmpty)
        {
            return "";
        }
        var renderer = new TableRenderer(Table);
        renderer.FormatContents();
        return renderer.Render();
    }
}
--------------------------------------------------------------------------------
/Gemipedia/Converter/Sectionizer.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Linq;
4 | using System.Xml.Linq;
5 | using AngleSharp.Dom;
6 | using AngleSharp.Html.Dom;
7 | using Gemipedia.Models;
8 |
9 | namespace Gemipedia.Converter;
10 |
/// <summary>
/// Constructs a tree of sections and subsections for the Wiki content
/// </summary>
public class Sectionizer
{
    //sections that are still open; the top is the one currently receiving nodes
    Stack<Section> SectionStack;

    ParsedPage ParsedPage;
    //grab once and cache
    string[] excludedSections = UserOptions.ExcludedSections;

    /// <summary>
    /// Splits the direct children of <paramref name="contentRoot"/> into sections,
    /// using heading nodes as the boundaries.
    /// </summary>
    /// <param name="title">Title stored on the returned page.</param>
    /// <param name="contentRoot">Node whose child nodes are walked.</param>
    /// <returns>The page with its sections populated.</returns>
    public ParsedPage ParseContent(string title, INode contentRoot)
    {
        ParsedPage = new ParsedPage
        {
            Title = title
        };

        SectionStack = new Stack<Section>();

        //untitled section that collects whatever comes before the first heading
        SectionStack.Push(new Section
        {
            IsSpecial = true,
            SectionDepth = 2,
        });

        var nodeList = contentRoot.ChildNodes.ToArray();

        for (int currIndex = 0, len = nodeList.Length; currIndex < len; currIndex++)
        {
            INode currNode = nodeList[currIndex];

            HeadingInfo? headingInfo = GetIfHeading(currNode);

            //not a heading? then it is content for the section on top of the stack
            if (headingInfo == null)
            {
                if (ShouldAddNode(currNode))
                {
                    SectionStack.Peek().Nodes.Add(currNode);
                }
                continue;
            }

            //we are supposed to skip this?
            if (ShouldSkipSection(headingInfo))
            {
                currIndex = FastForward(headingInfo, nodeList, currIndex);
                continue;
            }

            int depthOnStack = SectionStack.Peek().SectionDepth;
            //normalize to H2
            if (headingInfo.Level < 2)
            {
                headingInfo.Level = 2;
            }

            if (headingInfo.Level > depthOnStack)
            {
                //deeper than the open section: it becomes a child, nothing closes
                PushNewSection(headingInfo);
            }
            else if (headingInfo.Level == depthOnStack)
            {
                //sibling: close the current section, then open the new one
                AddCompletedSection(SectionStack.Pop());
                PushNewSection(headingInfo);
            }
            else
            {
                //shallower: fold every deeper open section into its parent first
                while (SectionStack.Peek().SectionDepth > headingInfo.Level)
                {
                    var tmpSection = SectionStack.Pop();
                    //add that as a subsection for the section of the top
                    SectionStack.Peek().SubSections.Add(tmpSection);
                }
                //pop the current section off
                AddCompletedSection(SectionStack.Pop());
                //push the new section
                PushNewSection(headingInfo);
            }
        }
        //combine remain stack
        while (SectionStack.Count > 0)
        {
            AddCompletedSection(SectionStack.Pop());
        }
        return ParsedPage;
    }

    private void AddCompletedSection(Section section)
    {
        //if there is still something on the stack, add it as a subsection
        if (SectionStack.Count > 0)
        {
            SectionStack.Peek().SubSections.Add(section);
        }
        else
        {
            ParsedPage.Sections.Add(section);
        }
    }

    //True for H1, H2, ... (an 'H' followed by a single digit)
    private static bool IsHeadingTag(INode node)
        => node.NodeName.Length == 2 &&
            node.NodeName[0] == 'H' &&
            char.IsDigit(node.NodeName[1]);

    //Returns the heading described by the node, or null when the node is not a heading
    private HeadingInfo? GetIfHeading(INode node)
    {
        if (node is not HtmlElement htmlElement)
        {
            return null;
        }

        if (IsHeadingTag(htmlElement))
        {
            //traditional HTML used for a heading: id and text live on a nested
            //span.mw-headline. Fall back to the heading element itself when the span
            //is missing, instead of failing on a null reference
            IElement headline = htmlElement.QuerySelector("span.mw-headline") ?? htmlElement;
            return new HeadingInfo
            {
                ID = headline.GetAttribute("id")?.ToLower() ?? "",
                Level = htmlElement.NodeName[1] - '0',
                Title = headline.TextContent.Trim().Replace("\n", "")
            };
        }

        //2024-07-21 : Sometime recently MediaWiki started output HTML with the header tags
        //wrapped in DIVs
        //TODO: I really should junk all this and operate on the WikiText directly...
        if (htmlElement.NodeName == "DIV" &&
            htmlElement.ClassName != null &&
            htmlElement.ClassName.Contains("mw-heading") &&
            htmlElement.FirstElementChild != null &&
            IsHeadingTag(htmlElement.FirstElementChild))
        {
            //modern header
            var inner = htmlElement.FirstElementChild;
            return new HeadingInfo
            {
                ID = inner.GetAttribute("id")?.ToLower() ?? "",
                Level = inner.NodeName[1] - '0',
                Title = inner.TextContent.Trim().Replace("\n", "")
            };
        }
        return null;
    }

    private void PushNewSection(HeadingInfo headingInfo)
        => SectionStack.Push(new Section
        {
            Title = headingInfo.Title,
            SectionDepth = headingInfo.Level
        });

    //Elements are always kept; text nodes only when they are not pure whitespace;
    //every other node type is dropped
    private bool ShouldAddNode(INode node)
    {
        switch (node.NodeType)
        {
            case NodeType.Text:
                return node.TextContent.Trim().Length > 0;

            case NodeType.Element:
                return true;

            default:
                return false;
        }
    }

    private bool ShouldSkipSection(HeadingInfo headingInfo)
        => excludedSections.Contains(headingInfo.ID);

    /// <summary>
    /// Fast forwards past a skipped section: up to the next heading that is at the
    /// same level as the skipped one, or shallower. Deeper headings belong to the
    /// skipped section and are passed over with it.
    /// </summary>
    /// <param name="skippedHeading">Heading of the section being skipped.</param>
    /// <param name="nodeList">All nodes being walked.</param>
    /// <param name="currentIndex">Index of the skipped heading in nodeList.</param>
    /// <returns>Index of the last skipped node, so the caller's loop increment lands
    /// on the next heading (or past the end of the list).</returns>
    private int FastForward(HeadingInfo skippedHeading, INode[] nodeList, int currentIndex)
    {
        int skipIndex = currentIndex + 1;
        //compare parsed headings rather than node names: a DIV-wrapped heading would
        //otherwise stop at the first DIV of any kind
        for (; skipIndex < nodeList.Length; skipIndex++)
        {
            var nextHeading = GetIfHeading(nodeList[skipIndex]);
            if (nextHeading != null && nextHeading.Level <= skippedHeading.Level)
            {
                break;
            }
        }
        return skipIndex - 1;
    }
}
213 |
/// <summary>
/// Plain data holder describing one heading: its ID, its level and its title.
/// </summary>
internal class HeadingInfo
{
    /// <summary>ID value associated with the heading.</summary>
    public string ID { get; set; }

    /// <summary>Numeric level of the heading.</summary>
    public int Level { get; set; }

    /// <summary>Text of the heading.</summary>
    public string Title { get; set; }
}
--------------------------------------------------------------------------------
/Gemipedia/Converter/Special/MediaParser.cs:
--------------------------------------------------------------------------------
1 | using System.Collections.Generic;
2 | using System.Linq;
3 | using AngleSharp.Dom;
4 | using Gemipedia.Models;
5 |
6 | namespace Gemipedia.Converter.Special;
7 |
8 | ///
9 | /// Converts the various image widgets
10 | ///
11 | public static class MediaParser
12 | {
13 | static int montageNumber = 1;
14 | static int galleryNumber = 1;
15 |
16 | static TextExtractor textExtractor = new TextExtractor
17 | {
18 | ShouldCollapseNewlines = true
19 | };
20 |
21 | public static MediaItem ConvertMedia(IElement imageContainer, IElement captionContainer)
22 | => IsVideo(imageContainer) ?
23 | ConvertVideo(imageContainer, captionContainer) :
24 | ConvertImage(imageContainer, captionContainer);
25 |
26 | public static MediaItem ConvertTimelineInTable(IElement element)
27 | {
28 | var timeline = element.QuerySelector("div.timeline-wrapper");
29 | if (timeline != null)
30 | {
31 | //attempt to get a meaningful title for the timeline from the first cell
32 | textExtractor.Extract(element.QuerySelector("th"), element.QuerySelector("td"));
33 |
34 | return ConvertTimeline(timeline, textExtractor);
35 | }
36 | return null;
37 | }
38 |
39 | public static MediaItem ConvertTimeline(IElement timelineWrapper, ITextContent textContent = null)
40 | {
41 | var img = timelineWrapper.QuerySelector("img[usemap]");
42 | var title = (textContent != null) ? $"Timeline Image: {textContent.Content}" : "Timeline Image";
43 |
44 | if (img != null)
45 | {
46 | var media = new MediaItem
47 | {
48 | Url = RouteOptions.MediaProxyUrl(CommonUtils.GetImageUrl(img)),
49 | Caption = title
50 | };
51 | //add anything from
52 | if (textContent != null)
53 | {
54 | media.Links.Add(textContent.Links);
55 | }
56 | //try and add links from any areas to it
57 | timelineWrapper.QuerySelectorAll("map area")
58 | .ToList().ForEach(x => media.Links.Add(x));
59 |
60 | return media;
61 |
62 | }
63 | return null;
64 | }
65 |
66 | public static IEnumerable ConvertGallery(IElement gallery)
67 | {
68 | List ret = new List();
69 | int imageNumber = 0;
70 | foreach (var galleryItem in gallery.QuerySelectorAll("li.gallerybox"))
71 | {
72 | imageNumber++;
73 | var media = ConvertImage(galleryItem, galleryItem.QuerySelector(".gallerytext"));
74 | if (media != null)
75 | {
76 | //prefix it
77 | media.Caption = $"Gallery {galleryNumber}, Image {imageNumber}: {media.Caption}";
78 | ret.Add(media);
79 | }
80 | }
81 | galleryNumber++;
82 | return ret;
83 | }
84 |
85 | private static MediaItem ConvertImage(IElement imageContainer, IElement? captionContainer, string defaultText = "Article Image")
86 | {
87 | //some image holders can contain | | | | | | | |