snippet: view plain - save this
1 #!/usr/bin/python
2 # -*- coding: utf-8 -*-
3
4 # This source code is distributed under GNU GPL v2 license
5 # written by Victor Stinner <victor.stinner AT haypocalc.com>
6 # http://www.haypocalc.com/
7 # created: 2006-08-14 -- last change: 2007-08-17
8
9 # Convert any unicode string to ASCII string:
10 # - Remove diacriticals
11 # - Replace special letter with similar ASCII character (similar glyph)
12 #
13 # Support greek, cyrillic, some latin letters and some signs.
14
from unicodedata import combining, normalize
16
# Maps one non-ASCII character to a visually similar ASCII replacement
# (similar glyph, not transliteration).
#
# Keys are written as \u escapes rather than literal characters so that the
# table survives copies through tools that cannot represent Greek or Cyrillic
# text; a lossy copy turns every such key into the same literal "?".
UNICODE_TO_ASCII = {
    # Latin letters
    u"\u00c6": u"AE",   # U+00C6 (latin capital ligature ae)
    u"\u00d8": u"O",    # U+00D8 (latin capital letter o with stroke)
    u"\u00df": u"ss",   # U+00DF (latin small letter sharp s)
    u"\u00e6": u"ae",   # U+00E6 (latin small ligature ae)
    u"\u00f8": u"o",    # U+00F8 (latin small letter o with stroke)
    u"\u0142": u"l",    # U+0142 (latin small letter l with stroke)
    u"\u0152": u"OE",   # U+0152 (latin capital ligature oe)
    u"\u0153": u"oe",   # U+0153 (latin small ligature oe)

    # Various signs
    u"\u00a1": u"!",    # U+00A1 (inverted exclamation mark)
    u"\u00a9": u"(c)",  # U+00A9 (copyright sign)
    u"\u00ab": u'"',    # U+00AB (left-pointing double angle quotation mark)
    u"\u00ae": u"(r)",  # U+00AE (registered sign)
    u"\u00b2": u"2",    # U+00B2 (superscript two)
    u"\u00bb": u'"',    # U+00BB (right-pointing double angle quotation mark)
    u"\u2044": u"/",    # U+2044 (fraction slash)

    # Greek
    u"\u0391": u"A",    # U+0391 (capital alpha)
    u"\u0392": u"B",    # U+0392 (capital beta)
    u"\u0395": u"E",    # U+0395 (capital epsilon)
    u"\u0396": u"Z",    # U+0396 (capital zeta)
    u"\u0397": u"H",    # U+0397 (capital eta)
    u"\u0398": u"O",    # U+0398 (capital theta)
    u"\u0399": u"I",    # U+0399 (capital iota)
    u"\u039a": u"K",    # U+039A (capital kappa)
    u"\u039c": u"M",    # U+039C (capital mu)
    u"\u039d": u"N",    # U+039D (capital nu)
    u"\u039f": u"O",    # U+039F (capital omicron)
    u"\u03a1": u"P",    # U+03A1 (capital rho)
    u"\u03a4": u"T",    # U+03A4 (capital tau)
    u"\u03a5": u"Y",    # U+03A5 (capital upsilon)
    u"\u03a7": u"X",    # U+03A7 (capital chi)
    u"\u03b1": u"a",    # U+03B1 (small alpha)
    u"\u03b2": u"b",    # U+03B2 (small beta)
    u"\u03b3": u"y",    # U+03B3 (small gamma)
    u"\u03b5": u"e",    # U+03B5 (small epsilon)
    u"\u03b7": u"n",    # U+03B7 (small eta)
    u"\u03bf": u"o",    # U+03BF (small omicron)
    u"\u03c1": u"p",    # U+03C1 (small rho)
    u"\u03c5": u"v",    # U+03C5 (small upsilon)

    # Cyrillic
    u"\u0406": u"I",    # U+0406 (capital byelorussian-ukrainian i)
    u"\u0408": u"J",    # U+0408 (capital je)
    u"\u0412": u"B",    # U+0412 (capital ve)
    u"\u0415": u"E",    # U+0415 (capital ie)
    u"\u0417": u"3",    # U+0417 (capital ze)
    u"\u0418": u"N",    # U+0418 (capital i)
    u"\u041a": u"K",    # U+041A (capital ka)
    u"\u041c": u"M",    # U+041C (capital em)
    u"\u041d": u"H",    # U+041D (capital en)
    u"\u041e": u"O",    # U+041E (capital o)
    u"\u0420": u"P",    # U+0420 (capital er)
    u"\u0421": u"C",    # U+0421 (capital es)
    u"\u0422": u"T",    # U+0422 (capital te)
    u"\u0423": u"Y",    # U+0423 (capital u)
    u"\u0425": u"X",    # U+0425 (capital ha)
    u"\u042f": u"R",    # U+042F (capital ya)
    u"\u0430": u"a",    # U+0430 (small a)
    u"\u0432": u"b",    # U+0432 (small ve)
    u"\u0435": u"e",    # U+0435 (small ie)
    u"\u0437": u"3",    # U+0437 (small ze)
    u"\u043a": u"k",    # U+043A (small ka)
    u"\u043c": u"m",    # U+043C (small em)
    u"\u043d": u"h",    # U+043D (small en)
    u"\u043e": u"o",    # U+043E (small o)
    u"\u0440": u"p",    # U+0440 (small er)
    u"\u0441": u"c",    # U+0441 (small es)
    u"\u0442": u"T",    # U+0442 (small te)
    u"\u0443": u"y",    # U+0443 (small u)
    u"\u0445": u"x",    # U+0445 (small ha)
    u"\u044f": u"R",    # U+044F (small ya)
    u"\u0456": u"i",    # U+0456 (small byelorussian-ukrainian i)
    u"\u0458": u"j",    # U+0458 (small je)
}
96
def unicode2ascii(text, replace=False):
    r"""
    Convert a unicode string (type 'unicode') to an ASCII string (type 'str').
    Try to keep the same visual result.

    The text is decomposed with NFKD normalization, then each resulting
    character is handled as follows:
    - characters listed in UNICODE_TO_ASCII are replaced by their mapping;
    - ASCII characters are kept as they are;
    - combining marks (the diacriticals split off by the decomposition) are
      dropped;
    - any other character is replaced by 'replace' if it is set, and
      dropped otherwise.

    The whole decomposition of a character is kept, so a ligature or a
    fraction expands to all of its parts instead of only the first one.

    Arguments:
    - text: the unicode string to convert;
    - replace: optional single printable ASCII character (code 32..126,
      type 'str' or 'unicode') used in place of characters which cannot be
      converted (eg. replace='?'). Any false value disables replacement.

    Return the converted text as a byte string (type 'str').

    Raise TypeError if text is not a unicode string, and ValueError if
    replace is not exactly one printable ASCII character.

    >>> unicode2ascii(u"\xa1 H\xe9 h\xf8 \xab espa\xf1ol \xbb ! Pyth\xf8n")
    '! He ho " espanol " ! Python'
    >>> unicode2ascii(u"L'\u0153uf de l\xe6ticia")
    "L'oeuf de laeticia"
    >>> unicode2ascii(u"\u0391\u0393\u0394\u03b1\u03b3", u'?')
    'A??ay'
    >>> unicode2ascii(u"\xbd \ufb01n")
    '1/2 fin'
    """
    # Explicit check instead of an assert: asserts are removed by "python -O".
    if not isinstance(text, unicode):
        raise TypeError(
            "text must be a unicode string, not %s" % type(text).__name__)

    if replace:
        if isinstance(replace, str):
            replace = unicode(replace, "latin-1")
        # The upper bound is 126: code 127 (DEL) is a control character,
        # not a printable one.
        if not isinstance(replace, unicode) \
        or len(replace) != 1 \
        or not (32 <= ord(replace) <= 126):
            raise ValueError(
                "invalid replace character (%r): "
                "need one ascii printable character" % replace)

    chunks = []
    # Normalize the whole text once. Iterating over the decomposed text keeps
    # every part of a compatibility decomposition (U+00BD gives "1", U+2044
    # and "2") instead of only its first character.
    for char in normalize("NFKD", text):
        if char in UNICODE_TO_ASCII:
            # Known values
            chunks.append(UNICODE_TO_ASCII[char])
        elif ord(char) <= 127:
            # Valid ASCII
            chunks.append(char)
        elif combining(char):
            # Diacritical mark: remove it, keep only its base character
            continue
        elif replace:
            # Non-ASCII character without any known equivalent
            chunks.append(replace)
        # else: ignore it

    return u"".join(chunks).encode("ascii", "strict")
145
# Self-test entry point: run the doctests of this module and report the
# result on stdout. The exit status is 1 when at least one doctest fails.
if __name__ == "__main__":
    import doctest
    import sys

    failed, attempted = doctest.testmod()
    if not failed:
        print("All tests are OK (count=%s)" % attempted)
    else:
        print("%s failure on %s tests" % (failed, attempted))
        sys.exit(1)

2 comments
Smart for characters like "copyright", which you convert into "(c)", but poor for regular characters because it doesn't work for accented Latin letters like é, è, î... It would be better to use a collation algorithm to convert é, ê -> e, à, ä -> a, etc.
In Bash, I prefer using konwert, for example like this:
This code translates each UTF-8 character into the corresponding HTML entity, or into its hexadecimal code if no entity name is available.