User:Isochrone/AutoEd/unicodify.js
//<syntaxhighlight lang=javascript>
/**
 * Replaces HTML character references and stray C1 control characters in a
 * piece of wikitext with the literal Unicode characters they stand for.
 *
 * Three passes are made, in this order:
 *   1. named entities (&mdash;, &alpha;, ...) and ASCII arrows (==>, <==, <--);
 *   2. numeric references, decimal (&#8212;) and hexadecimal (&#x2014;);
 *   3. C1 control characters U+0080..U+009F, which are read as Windows-1252
 *      bytes and mapped to the character that byte encodes.
 *
 * @param {string} str Text to clean up.
 * @returns {string} The text with the replacements applied.
 */
function autoEdUnicodify(str) { //MAIN FUNCTION describes list of fixes
    var hasOwn = Object.prototype.hasOwnProperty;

    // Task 1: Replace named html entities with unicode

    // Entities matched regardless of letter case; keys are lowercase.
    // &lt; and &gt; are deliberately absent: replacing them breaks large
    // amounts of code which discuss programming/scripting. &amp; is absent
    // too: replacing it breaks a large number of URLs.
    var anyCaseEntities = {
        // Most common replacements
        mdash: '—', ndash: '–',
        // XML and HTML symbols
        hellip: '...', plus: '+', plusmn: '±', minus: '−', times: '×',
        divide: '÷', ne: '≠', asymp: '≈', le: '≤', ge: '≥',
        quot: '"', apos: "'",
        iexcl: '¡', cent: '¢', pound: '£', curren: '¤', yen: '¥',
        brvbar: '¦', sect: '§', uml: '¨', copy: '©', ordf: 'ª',
        laquo: '«', not: '¬', reg: '®', macr: '¯', deg: '°',
        sup2: '²', sup3: '³', acute: '´', micro: 'µ', para: '¶',
        middot: '·', cedil: '¸', sup1: '¹', ordm: 'º', raquo: '»',
        frac14: '¼', frac12: '½', frac34: '¾', iquest: '¿',
        circ: 'ˆ', tilde: '˜',
        lsquo: '‘', rsquo: '’', sbquo: '‚', ldquo: '“', rdquo: '”',
        bdquo: '„', bull: '•', permil: '‰', lsaquo: '‹', rsaquo: '›',
        oline: '‾', frasl: '⁄', euro: '€', image: 'ℑ', weierp: '℘',
        real: 'ℜ', trade: '™', alefsym: 'ℵ', crarr: '↵',
        forall: '∀', part: '∂', exist: '∃', empty: '∅', nabla: '∇',
        isin: '∈', notin: '∉', ni: '∋', prod: '∏', sum: '∑',
        lowast: '∗', radic: '√', prop: '∝', infin: '∞', ang: '∠',
        and: '∧', or: '∨', cap: '∩', cup: '∪', int: '∫',
        there4: '∴', sim: '∼', cong: '≅',
        sub: '⊂', sup: '⊃', nsub: '⊄', sube: '⊆', supe: '⊇',
        oplus: '⊕', otimes: '⊗', perp: '⊥', sdot: '⋅',
        lceil: '⌈', rceil: '⌉', lfloor: '⌊', rfloor: '⌋',
        lang: '〈', rang: '〉', loz: '◊',
        spades: '♠', clubs: '♣', hearts: '♥', diams: '♦'
    };

    // Entities whose letter case is significant (&Alpha; vs &alpha;,
    // &larr; vs &lArr;, &Dagger; vs &dagger;); keys are matched exactly.
    var exactCaseEntities = {
        // Uppercase Greek
        Alpha: 'Α', Beta: 'Β', Gamma: 'Γ', Delta: 'Δ', Epsilon: 'Ε',
        Zeta: 'Ζ', Eta: 'Η', Theta: 'Θ', Iota: 'Ι', Kappa: 'Κ',
        Lambda: 'Λ', Mu: 'Μ', Nu: 'Ν', Xi: 'Ξ', Omicron: 'Ο',
        Pi: 'Π', Rho: 'Ρ', Sigma: 'Σ', Tau: 'Τ', Upsilon: 'Υ',
        Phi: 'Φ', Chi: 'Χ', Psi: 'Ψ', Omega: 'Ω',
        // Uppercase Latin
        Agrave: 'À', Aacute: 'Á', Acirc: 'Â', Atilde: 'Ã', Auml: 'Ä',
        Aring: 'Å', AElig: 'Æ', Ccedil: 'Ç',
        Egrave: 'È', Eacute: 'É', Ecirc: 'Ê', Euml: 'Ë',
        Igrave: 'Ì', Iacute: 'Í', Icirc: 'Î', Iuml: 'Ï',
        Ntilde: 'Ñ',
        Ograve: 'Ò', Oacute: 'Ó', Ocirc: 'Ô', Otilde: 'Õ', Ouml: 'Ö',
        Oslash: 'Ø',
        Ugrave: 'Ù', Uacute: 'Ú', Ucirc: 'Û', Uuml: 'Ü',
        Yacute: 'Ý', Scaron: 'Š', Yuml: 'Ÿ',
        ETH: 'Ð', THORN: 'Þ', OElig: 'Œ',
        // Uppercase XML and HTML symbols
        Dagger: '‡', Prime: '″',
        // Lowercase Greek
        alpha: 'α', beta: 'β', gamma: 'γ', delta: 'δ', epsilon: 'ε',
        zeta: 'ζ', eta: 'η', theta: 'θ', iota: 'ι', kappa: 'κ',
        lambda: 'λ', mu: 'μ', nu: 'ν', xi: 'ξ', omicron: 'ο',
        pi: 'π', rho: 'ρ', sigmaf: 'ς', sigma: 'σ', tau: 'τ',
        upsilon: 'υ', phi: 'φ', chi: 'χ', psi: 'ψ', omega: 'ω',
        thetasym: 'ϑ', upsih: 'ϒ', piv: 'ϖ',
        // Lowercase Latin
        szlig: 'ß',
        agrave: 'à', aacute: 'á', acirc: 'â', atilde: 'ã', auml: 'ä',
        aring: 'å', aelig: 'æ', ccedil: 'ç',
        egrave: 'è', eacute: 'é', ecirc: 'ê', euml: 'ë',
        igrave: 'ì', iacute: 'í', icirc: 'î', iuml: 'ï',
        eth: 'ð', ntilde: 'ñ',
        ograve: 'ò', oacute: 'ó', ocirc: 'ô', otilde: 'õ', ouml: 'ö',
        oslash: 'ø',
        ugrave: 'ù', uacute: 'ú', ucirc: 'û', uuml: 'ü',
        yacute: 'ý', thorn: 'þ', yuml: 'ÿ',
        oelig: 'œ', scaron: 'š', fnof: 'ƒ',
        // Lowercase XML and HTML symbols
        dagger: '†', prime: '′',
        // Arrows
        larr: '←', rarr: '→', uarr: '↑', darr: '↓',
        lArr: '⇐', rArr: '⇒', uArr: '⇑', dArr: '⇓',
        harr: '↔', hArr: '⇔'
    };

    // One pass over every "&name;" candidate. No replacement value contains
    // "&", so a single pass gives the same result as replacing the entities
    // one after another. hasOwnProperty keeps inherited keys such as
    // "constructor" from being mistaken for entity names.
    str = str.replace(/&([a-z]+[0-9]*);/gi, function (entity, name) {
        var lowered;
        if (hasOwn.call(exactCaseEntities, name)) {
            return exactCaseEntities[name];
        }
        lowered = name.toLowerCase();
        if (hasOwn.call(anyCaseEntities, lowered)) {
            return anyCaseEntities[lowered];
        }
        return entity; // unknown, or deliberately left escaped
    });

    // ASCII arrows
    str = str.replace(/<==|<--/gi, '←');
    str = str.replace(/==>/gi, '→');

    // Task 2: Replace numeric html entities with unicode ( User:CharlotteWebb )
    // Symbols for which there may be a good reason to obfuscate/escape
    var dont_replace = "|!{}[]=<>";

    // Decimal and hexadecimal references are handled in ONE pass so that the
    // output of one form can never be re-read as the other form
    // (e.g. "&#38;#x41;" must become "&#x41;", not "A").
    str = str.replace(/&#(?:(\d+)|x([\da-f]+));/gi, function (entity, dec, hex) {
        var num = dec ? parseInt(dec, 10) : parseInt(hex, 16);
        var chr;
        // Leave references that do not name a usable character untouched:
        // NUL, lone UTF-16 surrogates, and anything past U+10FFFF.
        if (!(num >= 1 && num <= 0x10FFFF) || (num >= 0xD800 && num <= 0xDFFF)) {
            return entity;
        }
        // see [[UTF-16]] for chars outside the BMP
        if (num > 0xFFFF) {
            num -= 0x10000;
            chr = String.fromCharCode(0xD800 + (num >> 10), 0xDC00 + (num & 0x3FF));
        } else {
            chr = String.fromCharCode(num);
        }
        return dont_replace.indexOf(chr) === -1 ? chr : entity;
    });

    // Task 3: Unprintable control characters [[Windows-1252]] from User:CharlotteWebb
    // This runs after Task 2 on purpose: a reference such as "&#150;" first
    // becomes U+0096 and is then mapped to the en dash here.
    var failstr = "<!-- AutoEd: rm unicode ctrl char w/no win-1252 mapping, intent unknown -->";
    var win1252 = {
        '\u0080': '€', '\u0081': failstr, '\u0082': '‚', '\u0083': 'ƒ',
        '\u0084': '„', '\u0085': '…', '\u0086': '†', '\u0087': '‡',
        '\u0088': 'ˆ', '\u0089': '‰', '\u008a': 'Š', '\u008b': '‹',
        '\u008c': 'Œ', '\u008d': failstr, '\u008e': 'Ž', '\u008f': failstr,
        '\u0090': failstr, '\u0091': '‘', '\u0092': '’', '\u0093': '“',
        '\u0094': '”', '\u0095': '•', '\u0096': '–', '\u0097': '—',
        '\u0098': '˜', '\u0099': '™', '\u009a': 'š', '\u009b': '›',
        '\u009c': 'œ', '\u009d': failstr, '\u009e': 'ž', '\u009f': 'Ÿ'
    };
    str = str.replace(/[\u0080-\u009f]/g, function (ctrl) {
        return win1252[ctrl];
    });

    return str;
}
//</syntaxhighlight>
Content Disclaimer
Informasi ini disarikan dari Wikipedia dan disajikan kembali untuk tujuan edukasi. Konten tersedia di bawah lisensi CC BY-SA 3.0. Kami tidak bertanggung jawab atas ketidakakuratan data yang bersumber dari kontribusi publik tersebut.
- The information displayed on this website is sourced in part or in whole from Wikipedia and has been adapted for re-presentation here. We strive to provide accurate and relevant information; however:
- There is no guarantee of absolute accuracy. Wikipedia is an open, collaborative project that can be edited by anyone, so information is subject to change.
- It is not intended to constitute professional advice. The content displayed is for informational and educational purposes only. For important decisions (e.g., medical, legal, or financial), please consult a professional.
- Content copyright. Wikipedia is licensed under the Creative Commons Attribution-ShareAlike License (CC BY-SA). This means that content may be reused with appropriate attribution and shared under a similar license.
- Responsible use. Any risk arising from the use of information from this website is entirely the responsibility of the user.