From a55ed30e3d4f9519662bd3d43fe56e821ee1f6ab Mon Sep 17 00:00:00 2001 From: Niklas Fasching Date: Tue, 11 Dec 2018 22:05:57 +0100 Subject: [PATCH] Add support for org-entities (e.g. ndash, mdash, \Aacute) --- README.org | 2 - org/html.go | 7 + org/html_entity.go | 437 ++++++++++++++++++++++++++++++++++++ org/testdata/blocks.html | 2 +- org/testdata/headlines.html | 2 +- org/testdata/misc.html | 32 ++- org/testdata/misc.org | 9 + 7 files changed, 485 insertions(+), 6 deletions(-) create mode 100644 org/html_entity.go diff --git a/README.org b/README.org index 8d24ee4..d5f6d43 100644 --- a/README.org +++ b/README.org @@ -4,8 +4,6 @@ A basic org-mode parser in go - have a org-mode AST to play around with building an org-mode language server - hopefully add reasonable org-mode support to hugo - sadly [[https://github.com/chaseadamsio/goorgeous][goorgeous]] is broken & abandoned * next -*** TODO [[https://github.com/chaseadamsio/goorgeous/issues/46][#46]]: Support for symbols like ndash and mdash -- see org-entities replacement: see org-entities-help *** TODO [[https://github.com/chaseadamsio/goorgeous/issues/10][#10]]: Support noexport *** TODO [[https://github.com/chaseadamsio/goorgeous/issues/31][#31]]: Support #+INCLUDE - see https://orgmode.org/manual/Include-files.html diff --git a/org/html.go b/org/html.go index 8b212ca..e58e39e 100644 --- a/org/html.go +++ b/org/html.go @@ -58,6 +58,13 @@ func (w *HTMLWriter) before(d *Document) {} func (w *HTMLWriter) after(d *Document) { w.writeFootnotes(d) + w.replaceHTMLEntities(d) +} + +func (w *HTMLWriter) replaceHTMLEntities(d *Document) { + s := w.stringBuilder.String() + w.stringBuilder.Reset() + w.stringBuilder.WriteString(htmlEntityReplacer.Replace(s)) } func (w *HTMLWriter) writeNodes(ns ...Node) { diff --git a/org/html_entity.go b/org/html_entity.go new file mode 100644 index 0000000..484059b --- /dev/null +++ b/org/html_entity.go @@ -0,0 +1,437 @@ +package org + +import "strings" + +var htmlEntityReplacer *strings.Replacer + +func init() { + htmlEntities = append(htmlEntities, + "---", "—", + "--", "–", + "...", "…", + ) + htmlEntityReplacer = strings.NewReplacer(htmlEntities...) +} + +/* +Generated & copied over using the following elisp +(Setting up go generate seems like a waste for now - I call YAGNI on that one) + +(insert (mapconcat + (lambda (entity) (concat "`\\" (car entity) "`, `" (nth 6 entity) "`")) ; entity -> utf8 + (remove-if-not 'listp org-entities) + ",\n")) +*/ +var htmlEntities = []string{ + `\Agrave`, `À`, + `\agrave`, `à`, + `\Aacute`, `Á`, + `\aacute`, `á`, + `\Acirc`, `Â`, + `\acirc`, `â`, + `\Amacr`, `Ã`, + `\amacr`, `ã`, + `\Atilde`, `Ã`, + `\atilde`, `ã`, + `\Auml`, `Ä`, + `\auml`, `ä`, + `\Aring`, `Å`, + `\AA`, `Å`, + `\aring`, `å`, + `\AElig`, `Æ`, + `\aelig`, `æ`, + `\Ccedil`, `Ç`, + `\ccedil`, `ç`, + `\Egrave`, `È`, + `\egrave`, `è`, + `\Eacute`, `É`, + `\eacute`, `é`, + `\Ecirc`, `Ê`, + `\ecirc`, `ê`, + `\Euml`, `Ë`, + `\euml`, `ë`, + `\Igrave`, `Ì`, + `\igrave`, `ì`, + `\Iacute`, `Í`, + `\iacute`, `í`, + `\Icirc`, `Î`, + `\icirc`, `î`, + `\Iuml`, `Ï`, + `\iuml`, `ï`, + `\Ntilde`, `Ñ`, + `\ntilde`, `ñ`, + `\Ograve`, `Ò`, + `\ograve`, `ò`, + `\Oacute`, `Ó`, + `\oacute`, `ó`, + `\Ocirc`, `Ô`, + `\ocirc`, `ô`, + `\Otilde`, `Õ`, + `\otilde`, `õ`, + `\Ouml`, `Ö`, + `\ouml`, `ö`, + `\Oslash`, `Ø`, + `\oslash`, `ø`, + `\OElig`, `Œ`, + `\oelig`, `œ`, + `\Scaron`, `Š`, + `\scaron`, `š`, + `\szlig`, `ß`, + `\Ugrave`, `Ù`, + `\ugrave`, `ù`, + `\Uacute`, `Ú`, + `\uacute`, `ú`, + `\Ucirc`, `Û`, + `\ucirc`, `û`, + `\Uuml`, `Ü`, + `\uuml`, `ü`, + `\Yacute`, `Ý`, + `\yacute`, `ý`, + `\Yuml`, `Ÿ`, + `\yuml`, `ÿ`, + `\fnof`, `ƒ`, + `\real`, `ℜ`, + `\image`, `ℑ`, + `\weierp`, `℘`, + `\ell`, `ℓ`, + `\imath`, `ı`, + `\jmath`, `ȷ`, + `\Alpha`, `Α`, + `\alpha`, `α`, + `\Beta`, `Β`, + `\beta`, `β`, + `\Gamma`, `Γ`, + `\gamma`, `γ`, + `\Delta`, `Δ`, + `\delta`, `δ`, + `\Epsilon`, `Ε`, + `\epsilon`, `ε`, + `\varepsilon`, `ε`, + `\Zeta`, `Ζ`, + `\zeta`, `ζ`, + `\Eta`, `Η`, + `\eta`, `η`, + `\Theta`, `Θ`, + `\theta`, `θ`, + `\thetasym`, `ϑ`, + `\vartheta`, `ϑ`, + `\Iota`, `Ι`, + `\iota`, `ι`, + `\Kappa`, `Κ`, + `\kappa`, `κ`, + `\Lambda`, `Λ`, + `\lambda`, `λ`, + `\Mu`, `Μ`, + `\mu`, `μ`, + `\nu`, `ν`, + `\Nu`, `Ν`, + `\Xi`, `Ξ`, + `\xi`, `ξ`, + `\Omicron`, `Ο`, + `\omicron`, `ο`, + `\Pi`, `Π`, + `\pi`, `π`, + `\Rho`, `Ρ`, + `\rho`, `ρ`, + `\Sigma`, `Σ`, + `\sigma`, `σ`, + `\sigmaf`, `ς`, + `\varsigma`, `ς`, + `\Tau`, `Τ`, + `\Upsilon`, `Υ`, + `\upsih`, `ϒ`, + `\upsilon`, `υ`, + `\Phi`, `Φ`, + `\phi`, `ɸ`, + `\varphi`, `φ`, + `\Chi`, `Χ`, + `\chi`, `χ`, + `\acutex`, `𝑥́`, + `\Psi`, `Ψ`, + `\psi`, `ψ`, + `\tau`, `τ`, + `\Omega`, `Ω`, + `\omega`, `ω`, + `\piv`, `ϖ`, + `\varpi`, `ϖ`, + `\partial`, `∂`, + `\alefsym`, `ℵ`, + `\aleph`, `ℵ`, + `\gimel`, `ℷ`, + `\beth`, `ב`, + `\dalet`, `ד`, + `\ETH`, `Ð`, + `\eth`, `ð`, + `\THORN`, `Þ`, + `\thorn`, `þ`, + `\dots`, `…`, + `\cdots`, `⋯`, + `\hellip`, `…`, + `\middot`, `·`, + `\iexcl`, `¡`, + `\iquest`, `¿`, + `\shy`, ``, + `\ndash`, `–`, + `\mdash`, `—`, + `\quot`, `"`, + `\acute`, `´`, + `\ldquo`, `“`, + `\rdquo`, `”`, + `\bdquo`, `„`, + `\lsquo`, `‘`, + `\rsquo`, `’`, + `\sbquo`, `‚`, + `\laquo`, `«`, + `\raquo`, `»`, + `\lsaquo`, `‹`, + `\rsaquo`, `›`, + `\circ`, `∘`, + `\vert`, `|`, + `\vbar`, `|`, + `\brvbar`, `¦`, + `\S`, `§`, + `\sect`, `§`, + `\amp`, `&`, + `\lt`, `<`, + `\gt`, `>`, + `\tilde`, `~`, + `\slash`, `/`, + `\plus`, `+`, + `\under`, `_`, + `\equal`, `=`, + `\asciicirc`, `^`, + `\dagger`, `†`, + `\dag`, `†`, + `\Dagger`, `‡`, + `\ddag`, `‡`, + `\nbsp`, ` `, + `\ensp`, ` `, + `\emsp`, ` `, + `\thinsp`, ` `, + `\curren`, `¤`, + `\cent`, `¢`, + `\pound`, `£`, + `\yen`, `¥`, + `\euro`, `€`, + `\EUR`, `€`, + `\dollar`, `$`, + `\USD`, `$`, + `\copy`, `©`, + `\reg`, `®`, + `\trade`, `™`, + `\minus`, `−`, + `\pm`, `±`, + `\plusmn`, `±`, + `\times`, `×`, + `\frasl`, `⁄`, + `\colon`, `:`, + `\div`, `÷`, + `\frac12`, `½`, + `\frac14`, `¼`, + `\frac34`, `¾`, + `\permil`, `‰`, + `\sup1`, `¹`, + `\sup2`, `²`, + `\sup3`, `³`, + `\radic`, `√`, + `\sum`, `∑`, + `\prod`, `∏`, + `\micro`, `µ`, + `\macr`, `¯`, + `\deg`, `°`, + `\prime`, `′`, + `\Prime`, `″`, + `\infin`, `∞`, + `\infty`, `∞`, + `\prop`, `∝`, + `\propto`, `∝`, + `\not`, `¬`, + `\neg`, `¬`, + `\land`, `∧`, + `\wedge`, `∧`, + `\lor`, `∨`, + `\vee`, `∨`, + `\cap`, `∩`, + `\cup`, `∪`, + `\smile`, `⌣`, + `\frown`, `⌢`, + `\int`, `∫`, + `\therefore`, `∴`, + `\there4`, `∴`, + `\because`, `∵`, + `\sim`, `∼`, + `\cong`, `≅`, + `\simeq`, `≅`, + `\asymp`, `≈`, + `\approx`, `≈`, + `\ne`, `≠`, + `\neq`, `≠`, + `\equiv`, `≡`, + `\triangleq`, `≜`, + `\le`, `≤`, + `\leq`, `≤`, + `\ge`, `≥`, + `\geq`, `≥`, + `\lessgtr`, `≶`, + `\lesseqgtr`, `⋚`, + `\ll`, `≪`, + `\Ll`, `⋘`, + `\lll`, `⋘`, + `\gg`, `≫`, + `\Gg`, `⋙`, + `\ggg`, `⋙`, + `\prec`, `≺`, + `\preceq`, `≼`, + `\preccurlyeq`, `≼`, + `\succ`, `≻`, + `\succeq`, `≽`, + `\succcurlyeq`, `≽`, + `\sub`, `⊂`, + `\subset`, `⊂`, + `\sup`, `⊃`, + `\supset`, `⊃`, + `\nsub`, `⊄`, + `\sube`, `⊆`, + `\nsup`, `⊅`, + `\supe`, `⊇`, + `\setminus`, `⧵`, + `\forall`, `∀`, + `\exist`, `∃`, + `\exists`, `∃`, + `\nexist`, `∄`, + `\nexists`, `∄`, + `\empty`, `∅`, + `\emptyset`, `∅`, + `\isin`, `∈`, + `\in`, `∈`, + `\notin`, `∉`, + `\ni`, `∋`, + `\nabla`, `∇`, + `\ang`, `∠`, + `\angle`, `∠`, + `\perp`, `⊥`, + `\parallel`, `∥`, + `\sdot`, `⋅`, + `\cdot`, `⋅`, + `\lceil`, `⌈`, + `\rceil`, `⌉`, + `\lfloor`, `⌊`, + `\rfloor`, `⌋`, + `\lang`, `⟨`, + `\rang`, `⟩`, + `\langle`, `⟨`, + `\rangle`, `⟩`, + `\hbar`, `ℏ`, + `\mho`, `℧`, + `\larr`, `←`, + `\leftarrow`, `←`, + `\gets`, `←`, + `\lArr`, `⇐`, + `\Leftarrow`, `⇐`, + `\uarr`, `↑`, + `\uparrow`, `↑`, + `\uArr`, `⇑`, + `\Uparrow`, `⇑`, + `\rarr`, `→`, + `\to`, `→`, + `\rightarrow`, `→`, + `\rArr`, `⇒`, + `\Rightarrow`, `⇒`, + `\darr`, `↓`, + `\downarrow`, `↓`, + `\dArr`, `⇓`, + `\Downarrow`, `⇓`, + `\harr`, `↔`, + `\leftrightarrow`, `↔`, + `\hArr`, `⇔`, + `\Leftrightarrow`, `⇔`, + `\crarr`, `↵`, + `\hookleftarrow`, `↵`, + `\arccos`, `arccos`, + `\arcsin`, `arcsin`, + `\arctan`, `arctan`, + `\arg`, `arg`, + `\cos`, `cos`, + `\cosh`, `cosh`, + `\cot`, `cot`, + `\coth`, `coth`, + `\csc`, `csc`, + `\deg`, `deg`, + `\det`, `det`, + `\dim`, `dim`, + `\exp`, `exp`, + `\gcd`, `gcd`, + `\hom`, `hom`, + `\inf`, `inf`, + `\ker`, `ker`, + `\lg`, `lg`, + `\lim`, `lim`, + `\liminf`, `liminf`, + `\limsup`, `limsup`, + `\ln`, `ln`, + `\log`, `log`, + `\max`, `max`, + `\min`, `min`, + `\Pr`, `Pr`, + `\sec`, `sec`, + `\sin`, `sin`, + `\sinh`, `sinh`, + `\sup`, `sup`, + `\tan`, `tan`, + `\tanh`, `tanh`, + `\bull`, `•`, + `\bullet`, `•`, + `\star`, `⋆`, + `\lowast`, `∗`, + `\ast`, `*`, + `\odot`, `ʘ`, + `\oplus`, `⊕`, + `\otimes`, `⊗`, + `\check`, `✓`, + `\checkmark`, `✓`, + `\para`, `¶`, + `\ordf`, `ª`, + `\ordm`, `º`, + `\cedil`, `¸`, + `\oline`, `‾`, + `\uml`, `¨`, + `\zwnj`, `‌`, + `\zwj`, `‍`, + `\lrm`, `‎`, + `\rlm`, `‏`, + `\smiley`, `☺`, + `\blacksmile`, `☻`, + `\sad`, `☹`, + `\frowny`, `☹`, + `\clubs`, `♣`, + `\clubsuit`, `♣`, + `\spades`, `♠`, + `\spadesuit`, `♠`, + `\hearts`, `♥`, + `\heartsuit`, `♥`, + `\diams`, `◆`, + `\diamondsuit`, `◆`, + `\diamond`, `◆`, + `\Diamond`, `◆`, + `\loz`, `⧫`, + `\_ `, ` `, + `\_ `, `  `, + `\_ `, `   `, + `\_ `, `    `, + `\_ `, `     `, + `\_ `, `      `, + `\_ `, `       `, + `\_ `, `        `, + `\_ `, `         `, + `\_ `, `          `, + `\_ `, `           `, + `\_ `, `            `, + `\_ `, `             `, + `\_ `, `              `, + `\_ `, `               `, + `\_ `, `                `, + `\_ `, `                 `, + `\_ `, `                  `, + `\_ `, `                   `, + `\_ `, `                    `, +} diff --git a/org/testdata/blocks.html b/org/testdata/blocks.html index 1e26208..b90c498 100644 --- a/org/testdata/blocks.html +++ b/org/testdata/blocks.html @@ -71,7 +71,7 @@ paragraphs
  • -... +…

  • diff --git a/org/testdata/headlines.html b/org/testdata/headlines.html index 0f4b575..d79b9f5 100644 --- a/org/testdata/headlines.html +++ b/org/testdata/headlines.html @@ -3,5 +3,5 @@

    Headline with TODO status

    Headline with tags & priority

    -this one is cheating a little as tags are ALWAYS printed right aligned to a given column number... +this one is cheating a little as tags are ALWAYS printed right aligned to a given column number…

    diff --git a/org/testdata/misc.html b/org/testdata/misc.html index f707a61..7542dd0 100644 --- a/org/testdata/misc.html +++ b/org/testdata/misc.html @@ -28,6 +28,34 @@ crazy ain't it? +

    #46: Support for symbols like ndash and mdash

    +

    #47: Consecutive code wrapped text gets joined

    either this or that foo. @@ -89,7 +117,7 @@ sub bullet -

    #77: Recognize code--- as code plus dash

    +

    #77: Recognize code— as code plus dash

    #78: Emphasis at beginning of line

    italics @@ -101,7 +129,7 @@ Text

    #82: Crash on empty headline

    -just a space as title... +just a space as title…

    #84: Paragraphs that are not followed by an empty line are not parsed correctly

    Foo

    diff --git a/org/testdata/misc.org b/org/testdata/misc.org index 95a248a..40c5349 100644 --- a/org/testdata/misc.org +++ b/org/testdata/misc.org @@ -17,6 +17,15 @@ crazy ain't it? | *foo* | foo | | *bar* | bar | #+HTML: +*** DONE [[https://github.com/chaseadamsio/goorgeous/issues/46][#46]]: Support for symbols like ndash and mdash +- ndash -- +- mdash --- +- ellipsis ... +- acute \Aacute and so on +- note that ------ is replaced with 2 mdashes and .... becomes ellipsis+. and so on - that's how org also does it + + + *** DONE [[https://github.com/chaseadamsio/goorgeous/issues/47][#47:]] Consecutive ~code~ wrapped text gets joined either ~this~ or ~that~ foo. either ~this~