using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;
using DiscordChatExporter.Core.Markdown.Internal;
using DiscordChatExporter.Core.Markdown.Nodes;
using Tyrrrz.Extensions;

namespace DiscordChatExporter.Core.Markdown
{
    /// <summary>
    /// Parses Discord-flavored markdown text into a list of nodes.
    /// </summary>
    /// <remarks>
    /// The parsing logic below is meant to replicate Discord's markdown grammar as closely as possible.
    /// Formatting matchers call <see cref="Parse(string)"/> on their captured content,
    /// so nested formatting is parsed recursively.
    /// </remarks>
    public static class MarkdownParser
    {
        private const RegexOptions DefaultRegexOptions = RegexOptions.Compiled | RegexOptions.CultureInvariant;

        /* Formatting */

        // Capture any character until the earliest double asterisk not followed by an asterisk
        private static readonly IMatcher<Node> BoldFormattedNodeMatcher = new RegexMatcher<Node>(
            new Regex("\\*\\*(.+?)\\*\\*(?!\\*)", DefaultRegexOptions | RegexOptions.Singleline),
            m => new FormattedNode(m.Value, "**", TextFormatting.Bold, Parse(m.Groups[1].Value)));

        // Capture any character until the earliest single asterisk not preceded or followed by an asterisk
        // Opening asterisk must not be followed by whitespace
        // Closing asterisk must not be preceded by whitespace
        private static readonly IMatcher<Node> ItalicFormattedNodeMatcher = new RegexMatcher<Node>(
            new Regex("\\*(?!\\s)(.+?)(?<!\\s|\\*)\\*(?!\\*)", DefaultRegexOptions | RegexOptions.Singleline),
            m => new FormattedNode(m.Value, "*", TextFormatting.Italic, Parse(m.Groups[1].Value)));

        // Capture any character until the earliest triple asterisk not followed by an asterisk
        // The inner content (including its double asterisks) is parsed with the bold matcher only
        private static readonly IMatcher<Node> ItalicBoldFormattedNodeMatcher = new RegexMatcher<Node>(
            new Regex("\\*(\\*\\*.+?\\*\\*)\\*(?!\\*)", DefaultRegexOptions | RegexOptions.Singleline),
            m => new FormattedNode(m.Value, "*", TextFormatting.Italic, Parse(m.Groups[1].Value, BoldFormattedNodeMatcher)));

        // Capture any character except underscore until an underscore
        // Closing underscore must not be followed by a word character
        private static readonly IMatcher<Node> ItalicAltFormattedNodeMatcher = new RegexMatcher<Node>(
            new Regex("_([^_]+)_(?!\\w)", DefaultRegexOptions | RegexOptions.Singleline),
            m => new FormattedNode(m.Value, "_", TextFormatting.Italic, Parse(m.Groups[1].Value)));

        // Capture any character until the earliest double underscore not followed by an underscore
        private static readonly IMatcher<Node> UnderlineFormattedNodeMatcher = new RegexMatcher<Node>(
            new Regex("__(.+?)__(?!_)", DefaultRegexOptions | RegexOptions.Singleline),
            m => new FormattedNode(m.Value, "__", TextFormatting.Underline, Parse(m.Groups[1].Value)));

        // Capture any character until the earliest triple underscore not followed by an underscore
        // The inner content (including its double underscores) is parsed with the underline matcher only
        private static readonly IMatcher<Node> ItalicUnderlineFormattedNodeMatcher = new RegexMatcher<Node>(
            new Regex("_(__.+?__)_(?!_)", DefaultRegexOptions | RegexOptions.Singleline),
            m => new FormattedNode(m.Value, "_", TextFormatting.Italic, Parse(m.Groups[1].Value, UnderlineFormattedNodeMatcher)));

        // Capture any character until the earliest double tilde
        private static readonly IMatcher<Node> StrikethroughFormattedNodeMatcher = new RegexMatcher<Node>(
            new Regex("~~(.+?)~~", DefaultRegexOptions | RegexOptions.Singleline),
            m => new FormattedNode(m.Value, "~~", TextFormatting.Strikethrough, Parse(m.Groups[1].Value)));

        // Capture any character until the earliest double pipe
        private static readonly IMatcher<Node> SpoilerFormattedNodeMatcher = new RegexMatcher<Node>(
            new Regex("\\|\\|(.+?)\\|\\|", DefaultRegexOptions | RegexOptions.Singleline),
            m => new FormattedNode(m.Value, "||", TextFormatting.Spoiler, Parse(m.Groups[1].Value)));

        /* Code blocks */

        // Capture any character except backtick until a backtick
        // Whitespace surrounding content inside backticks is trimmed
        private static readonly IMatcher<Node> InlineCodeBlockNodeMatcher = new RegexMatcher<Node>(
            new Regex("`([^`]+)`", DefaultRegexOptions | RegexOptions.Singleline),
            m => new InlineCodeBlockNode(m.Value, m.Groups[1].Value.Trim()));

        // Capture language identifier and then any character until the earliest triple backtick
        // Language identifier is one word immediately after opening backticks, followed immediately by newline
        // Whitespace surrounding content inside backticks is trimmed
        private static readonly IMatcher<Node> MultilineCodeBlockNodeMatcher = new RegexMatcher<Node>(
            new Regex("```(?:(\\w*)\\n)?(.+?)```", DefaultRegexOptions | RegexOptions.Singleline),
            m => new MultilineCodeBlockNode(m.Value, m.Groups[1].Value, m.Groups[2].Value.Trim()));

        /* Mentions */

        // Capture @everyone
        private static readonly IMatcher<Node> EveryoneMentionNodeMatcher = new StringMatcher<Node>(
            "@everyone",
            s => new MentionNode(s, "everyone", MentionType.Meta));

        // Capture @here
        private static readonly IMatcher<Node> HereMentionNodeMatcher = new StringMatcher<Node>(
            "@here",
            s => new MentionNode(s, "here", MentionType.Meta));

        // Capture <@123456> or <@!123456>
        private static readonly IMatcher<Node> UserMentionNodeMatcher = new RegexMatcher<Node>(
            new Regex("<@!?(\\d+)>", DefaultRegexOptions),
            m => new MentionNode(m.Value, m.Groups[1].Value, MentionType.User));

        // Capture <#123456>
        private static readonly IMatcher<Node> ChannelMentionNodeMatcher = new RegexMatcher<Node>(
            new Regex("<#(\\d+)>", DefaultRegexOptions),
            m => new MentionNode(m.Value, m.Groups[1].Value, MentionType.Channel));

        // Capture <@&123456>
        private static readonly IMatcher<Node> RoleMentionNodeMatcher = new RegexMatcher<Node>(
            new Regex("<@&(\\d+)>", DefaultRegexOptions),
            m => new MentionNode(m.Value, m.Groups[1].Value, MentionType.Role));

        /* Emojis */

        // Capture any country flag emoji (two regional indicator surrogate pairs)
        // ... or "symbol/other" character
        // ... or surrogate pair
        // ... or digit followed by enclosing mark
        // (this does not match all emojis in Discord but it's reasonably accurate enough)
        private static readonly IMatcher<Node> StandardEmojiNodeMatcher = new RegexMatcher<Node>(
            new Regex("((?:[\\uD83C][\\uDDE6-\\uDDFF]){2}|\\p{So}|\\p{Cs}{2}|\\d\\p{Me})", DefaultRegexOptions),
            m => new EmojiNode(m.Value, m.Groups[1].Value));

        // Capture <:lul:123456> or <a:lul:123456>
        // The optional leading "a" (group 1) is passed on as a flag when it is present
        private static readonly IMatcher<Node> CustomEmojiNodeMatcher = new RegexMatcher<Node>(
            new Regex("<(a)?:(.+?):(\\d+?)>", DefaultRegexOptions),
            m => new EmojiNode(m.Value, m.Groups[3].Value, m.Groups[2].Value, !m.Groups[1].Value.IsEmpty()));

        /* Links */

        // Capture [title](link)
        private static readonly IMatcher<Node> TitledLinkNodeMatcher = new RegexMatcher<Node>(
            new Regex("\\[(.+?)\\]\\((.+?)\\)", DefaultRegexOptions),
            m => new LinkNode(m.Value, m.Groups[2].Value, m.Groups[1].Value));

        // Capture any non-whitespace character after http:// or https:// until the last punctuation character or whitespace
        private static readonly IMatcher<Node> AutoLinkNodeMatcher = new RegexMatcher<Node>(
            new Regex("(https?://\\S*[^\\.,:;\"\'\\s])", DefaultRegexOptions),
            m => new LinkNode(m.Value, m.Groups[1].Value));

        // Same as auto link but also surrounded by angular brackets
        private static readonly IMatcher<Node> HiddenLinkNodeMatcher = new RegexMatcher<Node>(
            new Regex("<(https?://\\S*[^\\.,:;\"\'\\s])>", DefaultRegexOptions),
            m => new LinkNode(m.Value, m.Groups[1].Value));

        /* Text */

        // Capture the shrug emoticon
        // This escapes it from matching for formatting
        private static readonly IMatcher<Node> ShrugTextNodeMatcher = new StringMatcher<Node>(
            @"¯\_(ツ)_/¯",
            s => new TextNode(s));

        // Capture any "symbol/other" character or surrogate pair preceded by a backslash
        // This escapes it from matching for emoji
        private static readonly IMatcher<Node> EscapedSymbolTextNodeMatcher = new RegexMatcher<Node>(
            new Regex("\\\\(\\p{So}|\\p{Cs}{2})", DefaultRegexOptions),
            m => new TextNode(m.Value, m.Groups[1].Value));

        // Capture any non-whitespace, non latin alphanumeric character preceded by a backslash
        // This escapes it from matching for formatting or other tokens
        private static readonly IMatcher<Node> EscapedCharacterTextNodeMatcher = new RegexMatcher<Node>(
            new Regex("\\\\([^a-zA-Z0-9\\s])", DefaultRegexOptions),
            m => new TextNode(m.Value, m.Groups[1].Value));

        // Combine all matchers into one
        // Matchers that have similar patterns are ordered from most specific to least specific
        private static readonly IMatcher<Node> AggregateNodeMatcher = new AggregateMatcher<Node>(
            ItalicBoldFormattedNodeMatcher,
            ItalicUnderlineFormattedNodeMatcher,
            BoldFormattedNodeMatcher,
            ItalicFormattedNodeMatcher,
            UnderlineFormattedNodeMatcher,
            ItalicAltFormattedNodeMatcher,
            StrikethroughFormattedNodeMatcher,
            SpoilerFormattedNodeMatcher,
            MultilineCodeBlockNodeMatcher,
            InlineCodeBlockNodeMatcher,
            EveryoneMentionNodeMatcher,
            HereMentionNodeMatcher,
            UserMentionNodeMatcher,
            ChannelMentionNodeMatcher,
            RoleMentionNodeMatcher,
            StandardEmojiNodeMatcher,
            CustomEmojiNodeMatcher,
            TitledLinkNodeMatcher,
            AutoLinkNodeMatcher,
            HiddenLinkNodeMatcher,
            ShrugTextNodeMatcher,
            EscapedSymbolTextNodeMatcher,
            EscapedCharacterTextNodeMatcher);

        /// <summary>
        /// Runs <paramref name="matcher"/> over <paramref name="input"/> and returns the values
        /// of the resulting matches as an array.
        /// </summary>
        /// <param name="input">Markdown text to parse.</param>
        /// <param name="matcher">Matcher used to split the input into nodes.</param>
        /// <returns>The nodes produced by the matcher, in the order the matcher returned them.</returns>
        // NOTE(review): the TextNode factory passed to MatchAll presumably wraps the segments
        // that the matcher did not claim; MatchAll is defined elsewhere, so confirm there.
        private static IReadOnlyList<Node> Parse(string input, IMatcher<Node> matcher) =>
            matcher.MatchAll(input, s => new TextNode(s)).Select(r => r.Value).ToArray();

        /// <summary>
        /// Parses <paramref name="input"/> using the aggregate of all known matchers.
        /// </summary>
        /// <param name="input">Markdown text to parse.</param>
        /// <returns>The nodes that the input was split into.</returns>
        public static IReadOnlyList<Node> Parse(string input) => Parse(input, AggregateNodeMatcher);
    }
}