User:Isochrone/AutoEd/unicodify.js

//<syntaxhighlight lang=javascript>
/**
 * Replace HTML character references and stray C1 control characters in a
 * piece of text with the literal Unicode characters they stand for.
 *
 * Steps, applied in this order:
 *   1. Named entities (&mdash;, &alpha;, ...) listed in the tables below.
 *   2. ASCII arrows: "<==" and "<--" become a left arrow, "==>" a right arrow.
 *   3. Numeric references (&#123; and &#x7B;), except protected characters.
 *   4. C1 control characters U+0080..U+009F, read as Windows-1252 bytes.
 *
 * Entities that are not in the tables (for example &lt;, &gt; and &amp;)
 * are left untouched.
 *
 * @param {string} str Text to clean up.
 * @returns {string} The text with the replacements applied.
 */
function autoEdUnicodify(str) { //MAIN FUNCTION describes list of fixes

 // Own-property test, so that names such as "constructor" or "toString"
 // are never looked up on Object.prototype.
 var hasOwn = Object.prototype.hasOwnProperty;

 // Task 1: Replace named html entities with unicode

 // Entities recognised regardless of letter case (keys are lower case).
 var anyCase = {
  // Most common replacements
  'mdash': '—', 'ndash': '–',
  // XML and HTML symbols
  'hellip': '...', 'plus': '+', 'plusmn': '±', 'minus': '−', 'times': '×', 'divide': '÷',
  'ne': '≠', 'asymp': '≈', 'le': '≤', 'ge': '≥', 'quot': '"', 'apos': "'",
  'iexcl': '¡', 'cent': '¢', 'pound': '£', 'curren': '¤', 'yen': '¥', 'brvbar': '¦',
  'sect': '§', 'uml': '¨', 'copy': '©', 'ordf': 'ª', 'laquo': '«', 'not': '¬',
  'reg': '®', 'macr': '¯', 'deg': '°', 'sup2': '²', 'sup3': '³', 'acute': '´',
  'micro': 'µ', 'para': '¶', 'middot': '·', 'cedil': '¸', 'sup1': '¹', 'ordm': 'º',
  'raquo': '»', 'frac14': '¼', 'frac12': '½', 'frac34': '¾', 'iquest': '¿',
  'circ': 'ˆ', 'tilde': '˜', 'lsquo': '‘', 'rsquo': '’', 'sbquo': '‚',
  'ldquo': '“', 'rdquo': '”', 'bdquo': '„', 'bull': '•', 'permil': '‰',
  'lsaquo': '‹', 'rsaquo': '›', 'oline': '‾', 'frasl': '⁄', 'euro': '€',
  'image': 'ℑ', 'weierp': '℘', 'real': 'ℜ', 'trade': '™', 'alefsym': 'ℵ', 'crarr': '↵',
  'forall': '∀', 'part': '∂', 'exist': '∃', 'empty': '∅', 'nabla': '∇',
  'isin': '∈', 'notin': '∉', 'ni': '∋', 'prod': '∏', 'sum': '∑', 'lowast': '∗',
  'radic': '√', 'prop': '∝', 'infin': '∞', 'ang': '∠', 'and': '∧', 'or': '∨',
  'cap': '∩', 'cup': '∪', 'int': '∫', 'there4': '∴', 'sim': '∼', 'cong': '≅',
  'sub': '⊂', 'sup': '⊃', 'nsub': '⊄', 'sube': '⊆', 'supe': '⊇',
  'oplus': '⊕', 'otimes': '⊗', 'perp': '⊥', 'sdot': '⋅',
  'lceil': '⌈', 'rceil': '⌉', 'lfloor': '⌊', 'rfloor': '⌋', 'lang': '〈', 'rang': '〉',
  'loz': '◊', 'spades': '♠', 'clubs': '♣', 'hearts': '♥', 'diams': '♦'
 };

 // Entities whose letter case selects the character (&Alpha; vs &alpha;,
 // &larr; vs &lArr;), so they are only recognised with this exact spelling.
 var exactCase = {
  // Uppercase Greek symbols
  'Alpha': 'Α', 'Beta': 'Β', 'Gamma': 'Γ', 'Delta': 'Δ', 'Epsilon': 'Ε', 'Zeta': 'Ζ',
  'Eta': 'Η', 'Theta': 'Θ', 'Iota': 'Ι', 'Kappa': 'Κ', 'Lambda': 'Λ', 'Mu': 'Μ',
  'Nu': 'Ν', 'Xi': 'Ξ', 'Omicron': 'Ο', 'Pi': 'Π', 'Rho': 'Ρ', 'Sigma': 'Σ',
  'Tau': 'Τ', 'Upsilon': 'Υ', 'Phi': 'Φ', 'Chi': 'Χ', 'Psi': 'Ψ', 'Omega': 'Ω',
  // Uppercase Latin symbols
  'Agrave': 'À', 'Aacute': 'Á', 'Acirc': 'Â', 'Atilde': 'Ã', 'Auml': 'Ä', 'Aring': 'Å',
  'AElig': 'Æ', 'Ccedil': 'Ç', 'Egrave': 'È', 'Eacute': 'É', 'Ecirc': 'Ê', 'Euml': 'Ë',
  'Igrave': 'Ì', 'Iacute': 'Í', 'Icirc': 'Î', 'Iuml': 'Ï', 'Ntilde': 'Ñ',
  'Ograve': 'Ò', 'Oacute': 'Ó', 'Ocirc': 'Ô', 'Otilde': 'Õ', 'Ouml': 'Ö', 'Oslash': 'Ø',
  'Ugrave': 'Ù', 'Uacute': 'Ú', 'Ucirc': 'Û', 'Uuml': 'Ü', 'Yacute': 'Ý',
  'Scaron': 'Š', 'Yuml': 'Ÿ', 'ETH': 'Ð', 'THORN': 'Þ', 'OElig': 'Œ',
  // Uppercase XML and HTML symbols
  'Dagger': '‡', 'Prime': '″',
  // Lowercase Greek symbols
  'alpha': 'α', 'beta': 'β', 'gamma': 'γ', 'delta': 'δ', 'epsilon': 'ε', 'zeta': 'ζ',
  'eta': 'η', 'theta': 'θ', 'iota': 'ι', 'kappa': 'κ', 'lambda': 'λ', 'mu': 'μ',
  'nu': 'ν', 'xi': 'ξ', 'omicron': 'ο', 'pi': 'π', 'rho': 'ρ', 'sigmaf': 'ς',
  'sigma': 'σ', 'tau': 'τ', 'upsilon': 'υ', 'phi': 'φ', 'chi': 'χ', 'psi': 'ψ',
  'omega': 'ω', 'thetasym': 'ϑ', 'upsih': 'ϒ', 'piv': 'ϖ',
  // Lowercase Latin symbols
  'szlig': 'ß', 'agrave': 'à', 'aacute': 'á', 'acirc': 'â', 'atilde': 'ã', 'auml': 'ä',
  'aring': 'å', 'aelig': 'æ', 'ccedil': 'ç', 'egrave': 'è', 'eacute': 'é', 'ecirc': 'ê',
  'euml': 'ë', 'igrave': 'ì', 'iacute': 'í', 'icirc': 'î', 'iuml': 'ï', 'eth': 'ð',
  'ntilde': 'ñ', 'ograve': 'ò', 'oacute': 'ó', 'ocirc': 'ô', 'otilde': 'õ', 'ouml': 'ö',
  'oslash': 'ø', 'ugrave': 'ù', 'uacute': 'ú', 'ucirc': 'û', 'uuml': 'ü', 'yacute': 'ý',
  'thorn': 'þ', 'yuml': 'ÿ', 'oelig': 'œ', 'scaron': 'š', 'fnof': 'ƒ',
  // Lowercase XML and HTML symbols
  'dagger': '†', 'prime': '′',
  // Arrows
  'larr': '←', 'rarr': '→', 'uarr': '↑', 'darr': '↓',
  'lArr': '⇐', 'rArr': '⇒', 'uArr': '⇑', 'dArr': '⇓',
  'harr': '↔', 'hArr': '⇔'
 };

 // False positives, deliberately absent from the tables above:
 //  &lt; and &gt; - converting them breaks large amounts of text which
 //                  discuss programming/scripting.
 //  &amp;         - converting it breaks a large number of URLs and
 //                  discussion of programming/scripting.

 // One pass over every "&name;" token. An exact-case hit wins; otherwise the
 // lower-cased name is tried against the case-insensitive table. Unknown
 // names are returned unchanged.
 str = str.replace(/&([A-Za-z][A-Za-z0-9]*);/g, function (entity, name) {
  if (hasOwn.call(exactCase, name)) {
   return exactCase[name];
  }
  var lowered = name.toLowerCase();
  if (hasOwn.call(anyCase, lowered)) {
   return anyCase[lowered];
  }
  return entity;
 });

 // ASCII arrows. "-->" is not converted; only "<==", "<--" and "==>" are.
 str = str.replace(/<==|<--/gi, '←');
 str = str.replace(/==>/gi, '→');


 // Task 2: Replace numeric html entities with unicode ( User:CharlotteWebb )

 // Symbols for which there may be a good reason to obfuscate/escape.
 // "&" is protected for the same reason &amp; is not converted above; a
 // bare "&" could also combine with the following text into a new entity.
 var protectedChars = "|!{}[]=<>&";

 // Decimal and hexadecimal references are handled in a single pass so that
 // the output of one replacement is never scanned again. The "x" marker is
 // matched in either case, and the digits come from the capture groups.
 str = str.replace(/&#(?:(\d+)|x([\da-f]+));/gi, function (entity, dec, hex) {
  var num = (hex !== undefined) ? parseInt(hex, 16) : parseInt(dec, 10);

  // Leave references that do not name a usable character untouched:
  // NUL, anything beyond U+10FFFF, and lone surrogate code points.
  if (!(num > 0 && num <= 0x10FFFF) || (num >= 0xD800 && num <= 0xDFFF)) {
   return entity;
  }

  var chr;
  if (num > 0xFFFF) {
   // see [[UTF-16]] for chars outside the BMP: encode as a surrogate pair
   num -= 0x10000;
   chr = String.fromCharCode(0xD800 + (num >> 10), 0xDC00 + (num & 0x3FF));
  } else {
   chr = String.fromCharCode(num);
  }

  return (protectedChars.indexOf(chr) === -1) ? chr : entity;
 });

 // Task 3: Unprintable control characters [[Windows-1252]] from User:CharlotteWebb
 var failstr = "<!-- AutoEd: rm unicode ctrl char w/no win-1252 mapping, intent unknown -->";

 // Replacement for U+0080..U+009F, indexed by (code unit - 0x80).
 // null marks the positions that have no Windows-1252 mapping.
 var win1252 = [
  '€', null, '‚', 'ƒ', '„', '…', '†', '‡',
  'ˆ', '‰', 'Š', '‹', 'Œ', null, 'Ž', null,
  null, '‘', '’', '“', '”', '•', '–', '—',
  '˜', '™', 'š', '›', 'œ', null, 'ž', 'Ÿ'
 ];

 str = str.replace(/[\u0080-\u009f]/g, function (ctrl) {
  var mapped = win1252[ctrl.charCodeAt(0) - 0x80];
  return (mapped === null) ? failstr : mapped;
 });

 return str;
}
//</syntaxhighlight>

Content Disclaimer

Informasi ini disarikan dari Wikipedia dan disajikan kembali untuk tujuan edukasi. Konten tersedia di bawah lisensi CC BY-SA 3.0. Kami tidak bertanggung jawab atas ketidakakuratan data yang bersumber dari kontribusi publik tersebut.

  1. The information displayed on this website is sourced in whole or in part from Wikipedia and has been adapted for republication. We strive to provide accurate and relevant information; however, please note the following points.
  2. There is no guarantee of absolute accuracy. Wikipedia is an open, collaborative project that can be edited by anyone, so information is subject to change.
  3. It is not intended to constitute professional advice. The content displayed is for informational and educational purposes only. For important decisions (e.g., medical, legal, or financial), please consult a professional.
  4. Content copyright. Wikipedia is licensed under the Creative Commons Attribution-ShareAlike License (CC BY-SA). This means that content may be reused with appropriate attribution and shared under a similar license.
  5. Responsible use. Any risk arising from the use of information from this website is entirely the responsibility of the user.