1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 """String processing utilities for extracting strings with various kinds
23 of delimiters"""
24
25 import logging
26 import htmlentitydefs
27
28 from translate.misc.typecheck import accepts, returns
32 """Returns a list of locations where substr occurs in searchin.
33 Locations are not allowed to overlap."""
34 location = 0
35 locations = []
36 while location != -1:
37 location = searchin.find(substr, location)
38 if location != -1:
39 locations.append(location)
40 location += len(substr)
41 return locations
42
46 """Extracts a doublequote-delimited string from a string, allowing for
47 backslash-escaping. Returns a tuple of (quoted string with quotes, still in
48 string at end).
49 """
50
51 instring = startinstring
52 enteredonce = False
53 lenstart = len(startdelim)
54 lenend = len(enddelim)
55 startdelim_places = find_all(source, startdelim)
56 if startdelim == enddelim:
57 enddelim_places = startdelim_places[:]
58 else:
59 enddelim_places = find_all(source, enddelim)
60 if escape is not None:
61 lenescape = len(escape)
62 escape_places = find_all(source, escape)
63
64 true_escape = False
65 true_escape_places = []
66 for escape_pos in escape_places:
67 if escape_pos - lenescape in escape_places:
68 true_escape = not true_escape
69 else:
70 true_escape = True
71 if true_escape:
72 true_escape_places.append(escape_pos)
73 startdelim_places = [pos for pos in startdelim_places if pos - lenescape not in true_escape_places]
74 enddelim_places = [pos + lenend for pos in enddelim_places if pos - lenescape not in true_escape_places]
75 else:
76 enddelim_places = [pos + lenend for pos in enddelim_places]
77
78 significant_places = [0] + startdelim_places + enddelim_places + [len(source)-1]
79 significant_places.sort()
80 extracted = ""
81 lastpos = None
82 for pos in significant_places:
83 if instring and pos in enddelim_places:
84
85
86 if lastpos == pos - lenstart and lastpos in startdelim_places:
87 continue
88 extracted += source[lastpos:pos]
89 instring = False
90 lastpos = pos
91 if (not instring) and pos in startdelim_places and not (enteredonce and not allowreentry):
92 instring = True
93 enteredonce = True
94 lastpos = pos
95 if instring:
96 extracted += source[lastpos:]
97 return (extracted, instring)
98
103 """Extracts a doublequote-delimited string from a string, allowing for
104 backslash-escaping. includeescapes can also be a function that takes the
105 whole escaped string and returns the replaced version.
106 """
107 instring = startinstring
108 enteredonce = False
109 lenstart = len(startdelim)
110 lenend = len(enddelim)
111 startdelim_places = find_all(source, startdelim)
112 if startdelim == enddelim:
113 enddelim_places = startdelim_places[:]
114 else:
115 enddelim_places = find_all(source, enddelim)
116
117 if escape is not None:
118 lenescape = len(escape)
119 escape_places = find_all(source, escape)
120
121 true_escape = False
122 true_escape_places = []
123 for escape_pos in escape_places:
124 if escape_pos - lenescape in escape_places:
125 true_escape = not true_escape
126 else:
127 true_escape = True
128 if true_escape:
129 true_escape_places.append(escape_pos)
130 startdelim_places = [pos for pos in startdelim_places if pos - lenescape not in true_escape_places]
131 enddelim_places = [pos + lenend for pos in enddelim_places if pos - lenescape not in true_escape_places]
132 else:
133 enddelim_places = [pos + lenend for pos in enddelim_places]
134
135 significant_places = [0] + startdelim_places + enddelim_places + [len(source)-1]
136 significant_places.sort()
137 extracted = ""
138 lastpos = 0
139 callable_includeescapes = callable(includeescapes)
140 checkescapes = callable_includeescapes or not includeescapes
141 for pos in significant_places:
142 if instring and pos in enddelim_places and lastpos != pos - lenstart:
143 section_start, section_end = lastpos + len(startdelim), pos - len(enddelim)
144 section = source[section_start:section_end]
145 if escape is not None and checkescapes:
146 escape_list = [epos - section_start for epos in true_escape_places if section_start <= epos <= section_end]
147 new_section = ""
148 last_epos = 0
149 for epos in escape_list:
150 new_section += section[last_epos:epos]
151 if callable_includeescapes:
152 replace_escape = includeescapes(section[epos:epos + lenescape + 1])
153
154
155 if not isinstance(replace_escape, basestring):
156 if replace_escape:
157 replace_escape = section[epos:epos + lenescape + 1]
158 else:
159 replace_escape = section[epos + lenescape:epos + lenescape + 1]
160 new_section += replace_escape
161 last_epos = epos + lenescape + 1
162 else:
163 last_epos = epos + lenescape
164 section = new_section + section[last_epos:]
165 extracted += section
166 instring = False
167 lastpos = pos
168 if (not instring) and pos in startdelim_places and not (enteredonce and not allowreentry):
169 instring = True
170 enteredonce = True
171 lastpos = pos
172 if instring:
173 section_start = lastpos + len(startdelim)
174 section = source[section_start:]
175 if escape is not None and not includeescapes:
176 escape_list = [epos - section_start for epos in true_escape_places if section_start <= epos]
177 new_section = ""
178 last_epos = 0
179 for epos in escape_list:
180 new_section += section[last_epos:epos]
181 if callable_includeescapes and includeescapes(section[epos:epos + lenescape + 1]):
182 last_epos = epos
183 else:
184 last_epos = epos + lenescape
185 section = new_section + section[last_epos:]
186 extracted += section
187 return (extracted, instring)
188
191 "Returns the same string, with double quotes escaped with backslash"
192 if escapeescapes:
193 return source.replace('\\', '\\\\').replace('"', '\\"')
194 else:
195 return source.replace('"', '\\"')
196
199 "Returns the same string, with single quotes doubled"
200 return source.replace("'", "''")
201
206 """Encodes source using HTML entities, e.g. © -> &copy;"""
207 output = u""
208 for char in source:
209 charnum = ord(char)
210 if charnum in htmlentitydefs.codepoint2name:
211 output += u"&%s;" % htmlentitydefs.codepoint2name[charnum]
212 else:
213 output += str(char)
214 return output
215
220 """Decodes source using HTML entities, e.g. &copy; -> ©"""
221 output = u""
222 inentity = False
223 for char in source:
224 if char == "&":
225 inentity = True
226 possibleentity = ""
227 continue
228 if inentity:
229 if char == ";":
230 if len(possibleentity) > 0 and possibleentity in htmlentitydefs.name2codepoint:
231 output += unichr(htmlentitydefs.name2codepoint[possibleentity])
232 inentity = False
233 else:
234 output += "&" + possibleentity + ";"
235 inentity = False
236 elif char == " ":
237 output += "&" + possibleentity + char
238 inentity = False
239 else:
240 possibleentity += char
241 else:
242 output += char
243 return output
244
249 """Encodes source in the escaped-unicode encoding used by Java
250 .properties files
251 """
252 output = u""
253 if source and source[0] == u" ":
254 output = u"\\"
255 for char in source:
256 charnum = ord(char)
257 if char in controlchars:
258 output += controlchars[char]
259 elif 0 <= charnum < 128:
260 output += str(char)
261 else:
262 output += u"\\u%04X" % charnum
263 return output
264
269 """Encodes source in the escaped-unicode encoding used by Mozilla
270 .properties files.
271 """
272 output = u""
273 for char in source:
274 if char in controlchars:
275 output += controlchars[char]
276 else:
277 output += char
278 return output
279
# Maps the character that follows a backslash in a .properties escape
# sequence to the character that the sequence stands for.
propertyescapes = {
    # Characters that simply stand for themselves once the backslash is removed.
    "\\": "\\",
    "'": "'",
    '"': '"',
    # Letters that stand for a control character.
    "f": "\f",
    "n": "\n",
    "r": "\r",
    "t": "\t",
}
286
# Maps characters that the .properties encoders must not emit literally to
# the backslash escape sequence written in their place.
controlchars = {
    # The escape character itself has to be doubled.
    "\\": "\\\\",
    # Control characters that have a dedicated letter escape.
    "\f": "\\f",
    "\n": "\\n",
    "\r": "\\r",
    "\t": "\\t",
}
299
304 """Decodes source from the escaped-unicode encoding used by .properties
305 files.
306
307 Java uses Latin1 by default, and Mozilla uses UTF-8 by default.
308
309 Since the .decode("unicode-escape") routine decodes everything, and we
310 don't want that, we reimplemented the algorithm from Python's Objects/unicode.c
311 in Python and modified it to retain escaped control characters.
312 """
313 output = u""
314 s = 0
315
316 def unichr2(i):
317 """Returns a Unicode string of one character with ordinal 32 <= i,
318 otherwise an escaped control character.
319 """
320 if 32 <= i:
321 return unichr(i)
322 elif unichr(i) in controlchars:
323
324
325 return unichr(i)
326 else:
327 return "\\u%04x" % i
328
329 while s < len(source):
330 c = source[s]
331 if c != '\\':
332 output += c
333 s += 1
334 continue
335 s += 1
336 if s >= len(source):
337
338
339 output += c
340 continue
341 c = source[s]
342 s += 1
343 if c == '\n':
344 pass
345
346 elif c in propertyescapes:
347 output += propertyescapes[c]
348
349
350 elif c in "uU":
351 digits = 4
352 x = 0
353 for digit in range(digits):
354 x <<= 4
355 if s + digit >= len(source):
356 digits = digit
357 break
358 c = source[s + digit].lower()
359 if c.isdigit():
360 x += ord(c) - ord('0')
361 elif c in "abcdef":
362 x += ord(c) - ord('a') + 10
363 else:
364 break
365 s += digits
366 output += unichr2(x)
367 elif c == "N":
368 if source[s] != "{":
369 logging.warn("Invalid named unicode escape: no { after \\N")
370 output += "\\" + c
371 continue
372 s += 1
373 e = source.find("}", s)
374 if e == -1:
375 logging.warn("Invalid named unicode escape: no } after \\N{")
376 output += "\\" + c
377 continue
378 import unicodedata
379 name = source[s:e]
380 output += unicodedata.lookup(name)
381 s = e + 1
382 else:
383 output += c
384 return output
385
386
def quotestr(source, escapeescapes=0):
    """Returns a doublequote-delimited quoted string, escaping double
    quotes with backslash.

    :param source: a single string, or a list of strings to be quoted
                   one per line
    :param escapeescapes: forwarded unchanged to escapequotes
    :return: the quoted string. For a list, every item is quoted on its own
             and the results are joined with newlines; an empty list gives
             an empty string.
    """
    if isinstance(source, list):
        # Joining the quoted lines avoids the "first line" bookkeeping and
        # also handles an empty list, for which no result variable would
        # otherwise ever be bound.
        return '\n'.join(
            '"' + escapequotes(line, escapeescapes) + '"' for line in source
        )
    return '"' + escapequotes(source, escapeescapes) + '"'
403
406 """Returns a singlequote-delimited quoted string, escaping single quotes
407 with themselves.
408 """
409 return "'" + escapesinglequotes(source) + "'"
410
411
def findend(string, substring):
    """Returns the index just past the first occurrence of substring.

    :param string: the text to search in
    :param substring: the text to search for
    :return: the index of the first character after the first occurrence of
             substring in string, or -1 if substring does not occur
    """
    start = string.find(substring)
    if start == -1:
        # Keep the "not found" marker from str.find unchanged.
        return -1
    return start + len(substring)
417
420 return string.rstrip("\r\n")
421
431
435