diff options
Diffstat (limited to 'maintenance/entities2literals.pl')
-rw-r--r-- | maintenance/entities2literals.pl | 276 |
1 files changed, 276 insertions, 0 deletions
#!/usr/bin/env perl
# Takes STDIN and converts hexadecimal, decimal and named HTML entities to
# their respective literals.
#
# Usage: perl entities2literals.pl < file_to_convert [> outfile]
# Reference: http://www.w3.org/TR/REC-html40/sgml/entities.html
# Copyright 2005 Ævar Arnfjörð Bjarmason <avarab@gmail.com> No rights reserved
#
# Provenance: added as maintenance/entities2literals.pl (new file, mode
# 100644, blob dd47f6bb).
#
# Decoding is done in ONE left-to-right pass, so text produced by decoding a
# reference is never decoded again: "&amp;lt;" becomes "&lt;", not "<".
# References that are unknown, or that name a code point which is not a
# Unicode scalar value, are copied to the output unchanged.

use strict;
use warnings;
use utf8;    # this source file contains UTF-8 (see the copyright line)

# Named entities, one "<decimal code point> <entity name>" pair per line.
my $ENTITY_TABLE = <<'END_OF_ENTITY_TABLE';
34 quot
38 amp
60 lt
62 gt
160 nbsp
161 iexcl
162 cent
163 pound
164 curren
165 yen
166 brvbar
167 sect
168 uml
169 copy
170 ordf
171 laquo
172 not
173 shy
174 reg
175 macr
176 deg
177 plusmn
178 sup2
179 sup3
180 acute
181 micro
182 para
183 middot
184 cedil
185 sup1
186 ordm
187 raquo
188 frac14
189 frac12
190 frac34
191 iquest
192 Agrave
193 Aacute
194 Acirc
195 Atilde
196 Auml
197 Aring
198 AElig
199 Ccedil
200 Egrave
201 Eacute
202 Ecirc
203 Euml
204 Igrave
205 Iacute
206 Icirc
207 Iuml
208 ETH
209 Ntilde
210 Ograve
211 Oacute
212 Ocirc
213 Otilde
214 Ouml
215 times
216 Oslash
217 Ugrave
218 Uacute
219 Ucirc
220 Uuml
221 Yacute
222 THORN
223 szlig
224 agrave
225 aacute
226 acirc
227 atilde
228 auml
229 aring
230 aelig
231 ccedil
232 egrave
233 eacute
234 ecirc
235 euml
236 igrave
237 iacute
238 icirc
239 iuml
240 eth
241 ntilde
242 ograve
243 oacute
244 ocirc
245 otilde
246 ouml
247 divide
248 oslash
249 ugrave
250 uacute
251 ucirc
252 uuml
253 yacute
254 thorn
255 yuml
338 OElig
339 oelig
352 Scaron
353 scaron
376 Yuml
402 fnof
710 circ
732 tilde
913 Alpha
914 Beta
915 Gamma
916 Delta
917 Epsilon
918 Zeta
919 Eta
920 Theta
921 Iota
922 Kappa
923 Lambda
924 Mu
925 Nu
926 Xi
927 Omicron
928 Pi
929 Rho
931 Sigma
932 Tau
933 Upsilon
934 Phi
935 Chi
936 Psi
937 Omega
945 alpha
946 beta
947 gamma
948 delta
949 epsilon
950 zeta
951 eta
952 theta
953 iota
954 kappa
955 lambda
956 mu
957 nu
958 xi
959 omicron
960 pi
961 rho
962 sigmaf
963 sigma
964 tau
965 upsilon
966 phi
967 chi
968 psi
969 omega
977 thetasym
978 upsih
982 piv
8194 ensp
8195 emsp
8201 thinsp
8204 zwnj
8205 zwj
8206 lrm
8207 rlm
8211 ndash
8212 mdash
8216 lsquo
8217 rsquo
8218 sbquo
8220 ldquo
8221 rdquo
8222 bdquo
8224 dagger
8225 Dagger
8226 bull
8230 hellip
8240 permil
8242 prime
8243 Prime
8249 lsaquo
8250 rsaquo
8254 oline
8260 frasl
8364 euro
8465 image
8472 weierp
8476 real
8482 trade
8501 alefsym
8592 larr
8593 uarr
8594 rarr
8595 darr
8596 harr
8629 crarr
8656 lArr
8657 uArr
8658 rArr
8659 dArr
8660 hArr
8704 forall
8706 part
8707 exist
8709 empty
8711 nabla
8712 isin
8713 notin
8715 ni
8719 prod
8721 sum
8722 minus
8727 lowast
8730 radic
8733 prop
8734 infin
8736 ang
8743 and
8744 or
8745 cap
8746 cup
8747 int
8756 there4
8764 sim
8773 cong
8776 asymp
8800 ne
8801 equiv
8804 le
8805 ge
8834 sub
8835 sup
8836 nsub
8838 sube
8839 supe
8853 oplus
8855 otimes
8869 perp
8901 sdot
8968 lceil
8969 rceil
8970 lfloor
8971 rfloor
9001 lang
9002 rang
9674 loz
9824 spades
9827 clubs
9829 hearts
9830 diams
END_OF_ENTITY_TABLE

# Entity name => decimal code point, built once from the table above.
# Names are case-sensitive ("dagger" and "Dagger" are different entities).
my %CODEPOINT_FOR;
for my $row (split /\n/, $ENTITY_TABLE) {
    my ($codepoint, $name) = split ' ', $row;
    next unless defined $name;    # tolerate blank lines in the table
    $CODEPOINT_FOR{$name} = $codepoint;
}

# Returns the character with code point $codepoint, or undef (in scalar
# context) when the value is not a Unicode scalar value, i.e. it lies above
# U+10FFFF or inside the UTF-16 surrogate range U+D800..U+DFFF.
sub _char_for_codepoint {
    my ($codepoint) = @_;

    return if $codepoint > 0x10FFFF;
    return if $codepoint >= 0xD800 && $codepoint <= 0xDFFF;
    return chr $codepoint;
}

# Returns the literal for one parsed reference, or undef (in scalar context)
# when the reference cannot be decoded. Exactly one argument is defined:
#   $decimal      digits of a "&#NNN;" reference
#   $hexadecimal  digits of a "&#xHHH;" reference
#   $name         name of a "&name;" reference
sub _literal_for_reference {
    my ($decimal, $hexadecimal, $name) = @_;

    if (defined $decimal) {
        # Drop leading zeros, then reject over-long digit strings up front:
        # 0x10FFFF is 1114111, so anything beyond 7 digits is out of range.
        (my $digits = $decimal) =~ s/\A0+(?=[0-9])//;
        return if length $digits > 7;
        return _char_for_codepoint($digits);
    }

    if (defined $hexadecimal) {
        # Same idea; it also keeps hex() away from values that would
        # trigger its overflow/non-portable warnings.
        (my $digits = $hexadecimal) =~ s/\A0+(?=[0-9a-fA-F])//;
        return if length $digits > 6;
        return _char_for_codepoint(hex $digits);
    }

    return unless exists $CODEPOINT_FOR{$name};
    return chr $CODEPOINT_FOR{$name};
}

# decode_entities($text)
#
# Returns a copy of $text in which every decimal ("&#233;"), hexadecimal
# ("&#xe9;", lower-case "x" only) and known named ("&eacute;") reference has
# been replaced by its literal character. Anything that cannot be decoded is
# left exactly as written. An undefined $text is returned as undef.
sub decode_entities {
    my ($text) = @_;
    return $text unless defined $text;

    $text =~ s{
        (                               # $1: the whole reference
            &
            (?: \#   ([0-9]+)           # $2: decimal digits
              | \# x ([0-9a-fA-F]+)     # $3: hexadecimal digits
              | ([A-Za-z][A-Za-z0-9]*)  # $4: entity name
            )
            ;
        )
    }{
        # Copy the captures first: the helper runs regexes of its own.
        my ($reference, $decimal, $hexadecimal, $name) = ($1, $2, $3, $4);
        my $literal = _literal_for_reference($decimal, $hexadecimal, $name);
        defined $literal ? $literal : $reference;
    }gex;

    return $text;
}

# Run as a filter only when executed directly, so the file can also be
# loaded with require/do to reuse decode_entities().
unless (caller) {
    # Explicit UTF-8 layers replace what the obsolete "use encoding 'utf8'"
    # pragma used to set up on the standard handles.
    binmode STDIN,  ':encoding(UTF-8)';
    binmode STDOUT, ':encoding(UTF-8)';

    # <> yields lines with their newlines intact, so join on the empty
    # string. (<> also reads files named in @ARGV; only STDIN carries the
    # UTF-8 layer set above.)
    print decode_entities(join '', <>);
}