Rebol [
	Title:   "Codec: HTML entities"
	Name:    html-entities
	Type:    module
	Version: 1.1.0
	Date:    21-Oct-2025
	Options: [delay]
	Purpose: "To decode HTML entities in a text"
	File:    https://raw.githubusercontent.com/Oldes/Rebol3/master/src/mezz/codec-html-entities.reb
	Author:  "Oldes"
	Rights:  http://opensource.org/licenses/Apache-2.0
	Usage: [
		"Test: ♠ & ¢ <a> and Δδ ¾" = decode 'html-entities {Test: &spades; &amp; &#162; &lt;a&gt; and &Delta;&delta; &frac34;}
	]
	TODO: {
		* Encoder
	}
]

;- Lookup table: entity name (case-sensitive, without the leading "&" and
;- the trailing ";") -> the character it stands for.
html-entities: #[
	;@@ https://eastmanreference.com/list-of-html-entity-names-and-numbers
	;-- Punctuation, programming, and other common symbols
	"lt"       #"^(3C)"   ; 60   Open tag
	"gt"       #"^(3E)"   ; 62   Close tag
	"quot"     #"^(22)"   ; 34   Double quote
	"apos"     #"^(27)"   ; 39   Apostrophe / single quote
	"amp"      #"^(26)"   ; 38   Ampersand
	"nbsp"     #"^(A0)"   ; 160  Space (non-breaking)
	"brvbar"   #"^(A6)"   ; 166  Broken bar
	"iexcl"    #"^(A1)"   ; 161  Upside down exclamation mark
	"iquest"   #"^(BF)"   ; 191  Upside down question mark
	"sect"     #"^(A7)"   ; 167  Section symbol
	"uml"      #"^(A8)"   ; 168  Umlaut
	"ordf"     #"^(AA)"   ; 170  Feminine ordinal indicator
	"ordm"     #"^(BA)"   ; 186  Masculine ordinal indicator
	"laquo"    #"^(AB)"   ; 171  Open double angles
	"raquo"    #"^(BB)"   ; 187  Close double angles
	"not"      #"^(AC)"   ; 172  Not sign
	"shy"      #"^(AD)"   ; 173  Soft hyphen
	"macr"     #"^(AF)"   ; 175  Overline
	"acute"    #"^(B4)"   ; 180  Acute accent
	"para"     #"^(B6)"   ; 182  Pilcrow (paragraph)
	"middot"   #"^(B7)"   ; 183  Middle dot (Georgian comma)
	"cedil"    #"^(B8)"   ; 184  Cedilla

	;-- Math symbols
	"minus"    #"^(2212)" ; 8722 Minus sign (subtraction)
	"times"    #"^(D7)"   ; 215  Multiplication sign
	"divide"   #"^(F7)"   ; 247  Division sign
	"plusmn"   #"^(B1)"   ; 177  Plus / minus
	"le"       #"^(2264)" ; 8804 Less or equal
	"ge"       #"^(2265)" ; 8805 Greater or equal
	"sup1"     #"^(B9)"   ; 185  Superscript 1
	"sup2"     #"^(B2)"   ; 178  Superscript 2
	"sup3"     #"^(B3)"   ; 179  Superscript 3
	"frac14"   #"^(BC)"   ; 188  1/4
	"frac12"   #"^(BD)"   ; 189  1/2
	"frac34"   #"^(BE)"   ; 190  3/4
	"forall"   #"^(2200)" ; 8704 For all
	"part"     #"^(2202)" ; 8706 Part
	"exist"    #"^(2203)" ; 8707 Exist
	"empty"    #"^(2205)" ; 8709 Empty
	"nabla"    #"^(2207)" ; 8711 Nabla
	"isin"     #"^(2208)" ; 8712 Is in
	"notin"    #"^(2209)" ; 8713 Not in
	"ni"       #"^(220B)" ; 8715 Ni
	"prod"     #"^(220F)" ; 8719 Product
	"sum"      #"^(2211)" ; 8721 Sum
	"lowast"   #"^(2217)" ; 8727 Asterisk (Lowast)
	"radic"    #"^(221A)" ; 8730 Square root
	"prop"     #"^(221D)" ; 8733 Proportional to
	"infin"    #"^(221E)" ; 8734 Infinity
	"ang"      #"^(2220)" ; 8736 Angle
	"and"      #"^(2227)" ; 8743 And
	"or"       #"^(2228)" ; 8744 Or
	"cap"      #"^(2229)" ; 8745 Cap
	"cup"      #"^(222A)" ; 8746 Cup
	"int"      #"^(222B)" ; 8747 Integral
	"there4"   #"^(2234)" ; 8756 Therefore
	"sim"      #"^(223C)" ; 8764 Similar to
	"cong"     #"^(2245)" ; 8773 Congruent to
	"asymp"    #"^(2248)" ; 8776 Almost equal
	"ne"       #"^(2260)" ; 8800 Not equal
	"equiv"    #"^(2261)" ; 8801 Equivalent
	"sub"      #"^(2282)" ; 8834 Subset of
	"sup"      #"^(2283)" ; 8835 Superset of
	"nsub"     #"^(2284)" ; 8836 Not subset of
	"sube"     #"^(2286)" ; 8838 Subset or equal
	"supe"     #"^(2287)" ; 8839 Superset or equal
	"oplus"    #"^(2295)" ; 8853 Circled plus
	"otimes"   #"^(2297)" ; 8855 Circled times

	;-- Unit of measure symbols
	"deg"      #"^(B0)"   ; 176  Degrees
	"micro"    #"^(B5)"   ; 181  Micro

	;-- Copyright, registered, trademark
	"copy"     #"^(A9)"   ; 169  Copyright
	"reg"      #"^(AE)"   ; 174  Registered trademark
	"trade"    #"^(2122)" ; 8482 Trademark

	;-- Currency symbols
	"curren"   #"^(A4)"   ; 164  Currency sign
	"cent"     #"^(A2)"   ; 162  Cents
	"pound"    #"^(A3)"   ; 163  British pounds
	"euro"     #"^(20AC)" ; 8364 Euro
	"yen"      #"^(A5)"   ; 165  Yen

	;-- Greek alphabet
	"Alpha"    #"^(391)"  ; 913  UPPERCASE ALPHA
	"alpha"    #"^(3B1)"  ; 945  lowercase alpha
	"Beta"     #"^(392)"  ; 914  UPPERCASE BETA
	"beta"     #"^(3B2)"  ; 946  lowercase beta
	"Gamma"    #"^(393)"  ; 915  UPPERCASE GAMMA
	"gamma"    #"^(3B3)"  ; 947  lowercase gamma
	"Delta"    #"^(394)"  ; 916  UPPERCASE DELTA
	"delta"    #"^(3B4)"  ; 948  lowercase delta
	"Epsilon"  #"^(395)"  ; 917  UPPERCASE EPSILON
	"epsilon"  #"^(3B5)"  ; 949  lowercase epsilon
	"Zeta"     #"^(396)"  ; 918  UPPERCASE ZETA
	"zeta"     #"^(3B6)"  ; 950  lowercase zeta
	"Eta"      #"^(397)"  ; 919  UPPERCASE ETA
	"eta"      #"^(3B7)"  ; 951  lowercase eta
	"Theta"    #"^(398)"  ; 920  UPPERCASE THETA
	"theta"    #"^(3B8)"  ; 952  lowercase theta
	"thetasym" #"^(3D1)"  ; 977  alternate lowercase theta
	"Iota"     #"^(399)"  ; 921  UPPERCASE IOTA
	"iota"     #"^(3B9)"  ; 953  lowercase iota
	"Kappa"    #"^(39A)"  ; 922  UPPERCASE KAPPA
	"kappa"    #"^(3BA)"  ; 954  lowercase kappa
	"Lambda"   #"^(39B)"  ; 923  UPPERCASE LAMBDA
	"lambda"   #"^(3BB)"  ; 955  lowercase lambda
	"Mu"       #"^(39C)"  ; 924  UPPERCASE MU
	"mu"       #"^(3BC)"  ; 956  lowercase mu
	"Nu"       #"^(39D)"  ; 925  UPPERCASE NU
	"nu"       #"^(3BD)"  ; 957  lowercase nu
	"Xi"       #"^(39E)"  ; 926  UPPERCASE XI
	"xi"       #"^(3BE)"  ; 958  lowercase xi
	"Omicron"  #"^(39F)"  ; 927  UPPERCASE OMICRON
	"omicron"  #"^(3BF)"  ; 959  lowercase omicron
	"Pi"       #"^(3A0)"  ; 928  UPPERCASE PI
	"pi"       #"^(3C0)"  ; 960  lowercase pi
	"piv"      #"^(3D6)"  ; 982  alternative lowercase pi
	"Rho"      #"^(3A1)"  ; 929  UPPERCASE RHO
	"rho"      #"^(3C1)"  ; 961  lowercase rho
	"Sigma"    #"^(3A3)"  ; 931  UPPERCASE SIGMA
	"sigma"    #"^(3C3)"  ; 963  lowercase sigma
	"sigmaf"   #"^(3C2)"  ; 962  final form lowercase sigma
	"Tau"      #"^(3A4)"  ; 932  UPPERCASE TAU
	"tau"      #"^(3C4)"  ; 964  lowercase tau
	"Upsilon"  #"^(3A5)"  ; 933  UPPERCASE UPSILON
	"upsilon"  #"^(3C5)"  ; 965  lowercase upsilon
	"upsih"    #"^(3D2)"  ; 978  alternative lowercase upsilon
	"Phi"      #"^(3A6)"  ; 934  UPPERCASE PHI
	"phi"      #"^(3C6)"  ; 966  lowercase phi
	"Chi"      #"^(3A7)"  ; 935  UPPERCASE CHI
	"chi"      #"^(3C7)"  ; 967  lowercase chi
	"Psi"      #"^(3A8)"  ; 936  UPPERCASE PSI
	"psi"      #"^(3C8)"  ; 968  lowercase psi
	"Omega"    #"^(3A9)"  ; 937  UPPERCASE OMEGA
	"omega"    #"^(3C9)"  ; 969  lowercase omega

	;-- Arrows
	"larr"     #"^(2190)" ; 8592 Left arrow
	"uarr"     #"^(2191)" ; 8593 Up arrow
	"rarr"     #"^(2192)" ; 8594 Right arrow
	"darr"     #"^(2193)" ; 8595 Down arrow
	"harr"     #"^(2194)" ; 8596 Left & right arrow
	"crarr"    #"^(21B5)" ; 8629 Carriage return arrow

	;-- Spade, club, heart, diamond
	"spades"   #"^(2660)" ; 9824 Spade
	"clubs"    #"^(2663)" ; 9827 Club
	"hearts"   #"^(2665)" ; 9829 Heart
	"diams"    #"^(2666)" ; 9830 Diamond

	;-- Accented letters
	"Agrave"   #"^(C0)"   ; 192  CAPITAL A GRAVE ACCENT
	"agrave"   #"^(E0)"   ; 224  lowercase a grave accent
	"Aacute"   #"^(C1)"   ; 193  CAPITAL A ACUTE ACCENT
	"aacute"   #"^(E1)"   ; 225  lowercase a acute accent
	"Acirc"    #"^(C2)"   ; 194  CAPITAL A CIRCUMFLEX ACCENT
	"acirc"    #"^(E2)"   ; 226  lowercase a circumflex accent
	"Atilde"   #"^(C3)"   ; 195  CAPITAL A TILDE ACCENT
	"atilde"   #"^(E3)"   ; 227  lowercase a tilde accent
	"Auml"     #"^(C4)"   ; 196  CAPITAL A UMLAUT ACCENT
	"auml"     #"^(E4)"   ; 228  lowercase a umlaut accent
	"Aring"    #"^(C5)"   ; 197  CAPITAL A RING ABOVE ACCENT
	"aring"    #"^(E5)"   ; 229  lowercase a ring accent
	"AElig"    #"^(C6)"   ; 198  CAPITAL AE
	"aelig"    #"^(E6)"   ; 230  lowercase ae
	"Ccedil"   #"^(C7)"   ; 199  CAPITAL C CEDILLA ACCENT
	"ccedil"   #"^(E7)"   ; 231  lowercase c cedilla accent
	"Egrave"   #"^(C8)"   ; 200  CAPITAL E GRAVE ACCENT
	"egrave"   #"^(E8)"   ; 232  lowercase e grave accent
	"Eacute"   #"^(C9)"   ; 201  CAPITAL E ACUTE ACCENT
	"eacute"   #"^(E9)"   ; 233  lowercase e acute accent
	"Ecirc"    #"^(CA)"   ; 202  CAPITAL E CIRCUMFLEX ACCENT
	"ecirc"    #"^(EA)"   ; 234  lowercase e circumflex accent
	"Euml"     #"^(CB)"   ; 203  CAPITAL E UMLAUT ACCENT
	"euml"     #"^(EB)"   ; 235  lowercase e umlaut accent
	"Igrave"   #"^(CC)"   ; 204  CAPITAL I GRAVE ACCENT
	"igrave"   #"^(EC)"   ; 236  lowercase i grave accent
	"Iacute"   #"^(CD)"   ; 205  CAPITAL I ACUTE ACCENT
	"iacute"   #"^(ED)"   ; 237  lowercase i acute accent
	"Icirc"    #"^(CE)"   ; 206  CAPITAL I CIRCUMFLEX ACCENT
	"icirc"    #"^(EE)"   ; 238  lowercase i circumflex accent
	"Iuml"     #"^(CF)"   ; 207  CAPITAL I UMLAUT ACCENT
	"iuml"     #"^(EF)"   ; 239  lowercase i umlaut accent
	"ETH"      #"^(D0)"   ; 208  CAPITAL ICELANDIC ETH
	"eth"      #"^(F0)"   ; 240  lowercase Icelandic eth
	"Ntilde"   #"^(D1)"   ; 209  CAPITAL N TILDE ACCENT
	"ntilde"   #"^(F1)"   ; 241  lowercase n tilde accent
	"Ograve"   #"^(D2)"   ; 210  CAPITAL O GRAVE ACCENT
	"ograve"   #"^(F2)"   ; 242  lowercase o grave accent
	"Oacute"   #"^(D3)"   ; 211  CAPITAL O ACUTE ACCENT
	"oacute"   #"^(F3)"   ; 243  lowercase o acute accent
	"Ocirc"    #"^(D4)"   ; 212  CAPITAL O CIRCUMFLEX ACCENT
	"ocirc"    #"^(F4)"   ; 244  lowercase o circumflex accent
	"Otilde"   #"^(D5)"   ; 213  CAPITAL O TILDE ACCENT
	"otilde"   #"^(F5)"   ; 245  lowercase o tilde accent
	"Ouml"     #"^(D6)"   ; 214  CAPITAL O UMLAUT ACCENT
	"ouml"     #"^(F6)"   ; 246  lowercase o umlaut accent
	"Oslash"   #"^(D8)"   ; 216  CAPITAL O SLASH ACCENT
	"oslash"   #"^(F8)"   ; 248  lowercase o slash
	"Ugrave"   #"^(D9)"   ; 217  CAPITAL U GRAVE ACCENT
	"ugrave"   #"^(F9)"   ; 249  lowercase u grave accent
	"Uacute"   #"^(DA)"   ; 218  CAPITAL U ACUTE ACCENT
	"uacute"   #"^(FA)"   ; 250  lowercase u acute accent
	"Ucirc"    #"^(DB)"   ; 219  CAPITAL U CIRCUMFLEX ACCENT
	"ucirc"    #"^(FB)"   ; 251  lowercase u circumflex accent
	"Uuml"     #"^(DC)"   ; 220  CAPITAL U UMLAUT
	"uuml"     #"^(FC)"   ; 252  lowercase u umlaut accent
	"Yacute"   #"^(DD)"   ; 221  CAPITAL Y ACUTE ACCENT
	"yacute"   #"^(FD)"   ; 253  lowercase y acute accent
	"yuml"     #"^(FF)"   ; 255  lowercase y umlaut accent
	"THORN"    #"^(DE)"   ; 222  CAPITAL ICELANDIC THORN
	"thorn"    #"^(FE)"   ; 254  lowercase Icelandic thorn
	"szlig"    #"^(DF)"   ; 223  lowercase German sharp s

	;-- Miscellaneous
	"bull"     #"^(2022)" ; 8226 Bullet
	"hellip"   #"^(2026)" ; 8230 Horizontal ellipsis
	"fnof"     #"^(192)"  ; 402  lowercase Latin f with hook
	"perp"     #"^(22A5)" ; 8869 Perpendicular
	"sdot"     #"^(22C5)" ; 8901 Dot operator
	"OElig"    #"^(152)"  ; 338  UPPERCASE LATIN OE LIGATURE
	"oelig"    #"^(153)"  ; 339  lowercase Latin oe ligature
	"Scaron"   #"^(160)"  ; 352  UPPERCASE S WITH CARON
	"scaron"   #"^(161)"  ; 353  lowercase s with caron
	"Yuml"     #"^(178)"  ; 376  CAPITAL Y WITH DIAERESIS
	"circ"     #"^(2C6)"  ; 710  Circumflex accent
	"tilde"    #"^(2DC)"  ; 732  Small tilde (not the ASCII tilde, which is 126)
	"ndash"    #"^(2013)" ; 8211 En dash
	"mdash"    #"^(2014)" ; 8212 Em dash
	"lsquo"    #"^(2018)" ; 8216 Left single quotation mark
	"rsquo"    #"^(2019)" ; 8217 Right single quotation mark
	"sbquo"    #"^(201A)" ; 8218 Single low-9 quotation mark
	"ldquo"    #"^(201C)" ; 8220 Left double quotation mark
	"rdquo"    #"^(201D)" ; 8221 Right double quotation mark
	"bdquo"    #"^(201E)" ; 8222 Double low-9 quotation mark
	"dagger"   #"^(2020)" ; 8224 Dagger
	"Dagger"   #"^(2021)" ; 8225 Double dagger
	"permil"   #"^(2030)" ; 8240 Per mille
	"prime"    #"^(2032)" ; 8242 Minutes (Degrees)
	"Prime"    #"^(2033)" ; 8243 Seconds (Degrees)
	"lsaquo"   #"^(2039)" ; 8249 Single left angle quotation
	"rsaquo"   #"^(203A)" ; 8250 Single right angle quotation
	"oline"    #"^(203E)" ; 8254 Overline
	"lceil"    #"^(2308)" ; 8968 Left ceiling
	"rceil"    #"^(2309)" ; 8969 Right ceiling
	"lfloor"   #"^(230A)" ; 8970 Left floor
	"rfloor"   #"^(230B)" ; 8971 Right floor
	"loz"      #"^(25CA)" ; 9674 Lozenge
	"ensp"     #"^(2002)" ; 8194 En space
	"emsp"     #"^(2003)" ; 8195 Em space
	"thinsp"   #"^(2009)" ; 8201 Thin space
	"zwnj"     #"^(200C)" ; 8204 Zero width non-joiner
	"zwj"      #"^(200D)" ; 8205 Zero width joiner
	"lrm"      #"^(200E)" ; 8206 Left-to-right mark
	"rlm"      #"^(200F)" ; 8207 Right-to-left mark
]

;- Character classes used by the decoder's parse rules
any-except-&: complement charset "&"
alphanum:     system/catalog/bitsets/alpha-numeric
digits:       system/catalog/bitsets/numeric
hex-digits:   system/catalog/bitsets/hex-digits

register-codec [
	name:  'html-entities
	type:  'text
	title: "Reserved characters in HTML"

	decode: func [
		{Creates a new string with possible HTML entities converted to chars}
		text [string! binary! file!]
		/local out s e char
	][
		;; Normalize the input to a string; a file is read, a binary converted.
		case [
			file?   text [text: read/string text]
			binary? text [text: to string! text]
		]
		out: make string! length? text
		parse text [
			any [
				;; Plain text up to the next ampersand is copied verbatim.
				s: some any-except-& e: ( append/part out s e )
				|
				;; S marks the position right after the ampersand, so the
				;; scan can resume there when the reference is not valid.
				#"&" s: [
					#"#" [
						;; Decimal numeric reference: &#NNN;
						copy char 1 7 digits #";" (
							char: attempt [to char! to integer! char]
						)
						|
						;; Hexadecimal numeric reference: &#xHHH;
						#"x" copy char some hex-digits #";" (
							;; DEBASE needs whole bytes, so an odd number of
							;; hex digits (like in &#xA;) is left-padded.
							if odd? length? char [insert char #"0"]
							char: attempt [to char! to integer! debase char 16]
						)
					]
					|
					;; Named reference; names are case-sensitive (Alpha vs alpha).
					copy char 1 10 alphanum #";" (
						char: select/case html-entities char
					)
					|
					;; Not an entity at all; also clears any partial COPY above.
					(char: none)
				] e: (
					;; Unknown or invalid reference: emit the ampersand as is
					;; and continue scanning just behind it.
					unless char [char: #"&" e: :s]
					append out char
				) :e
			]
		]
		out
	]
]