1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 """string processing utilities for extracting strings with various kinds of delimiters"""
23
24 import logging
25 import htmlentitydefs
26
# NOTE(review): the def line is not visible in the reviewed text; the signature
# is reconstructed from the call sites find_all(source, delimiter) -- confirm.
def find_all(searchin, substr):
    """Return the start indexes of every occurrence of substr in searchin.

    Occurrences are collected left to right and are not allowed to overlap.
    An empty substr matches at every index from 0 to len(searchin), in line
    with how str.count("") counts empty matches.
    """
    # Always move forward by at least one character: an empty substr has
    # length 0 and would otherwise be found at the same index forever.
    step = len(substr) or 1
    locations = []
    location = searchin.find(substr)
    while location != -1:
        locations.append(location)
        location = searchin.find(substr, location + step)
    return locations
38
# NOTE(review): the def line is not visible in the reviewed text. Parameter
# names come from the body; the call sites show that startinstring and
# allowreentry are optional, but the default values used here are assumptions
# to confirm.
def extract(source, startdelim, enddelim, escape=None, startinstring=False, allowreentry=True):
    """Extract the delimited sections of source, keeping the delimiters.

    source is scanned for sections that open with startdelim and close with
    enddelim.  A delimiter directly preceded by an unescaped occurrence of
    escape does not count as a delimiter.  startinstring says whether the scan
    begins inside a section; when allowreentry is false only the first section
    opened is extracted.

    Returns a tuple (extracted, instring): the concatenated sections including
    their delimiters, and whether the scan was still inside a section when
    source ended.
    """
    instring = startinstring
    enteredonce = False
    lenstart = len(startdelim)
    lenend = len(enddelim)
    startdelim_places = find_all(source, startdelim)
    if startdelim == enddelim:
        enddelim_places = startdelim_places[:]
    else:
        enddelim_places = find_all(source, enddelim)
    if escape is not None:
        lenescape = len(escape)
        escape_places = find_all(source, escape)
        # Sets keep the membership tests below O(1) instead of scanning lists.
        escape_lookup = set(escape_places)
        # Filter out escaped escapes: in a run of adjacent escape sequences
        # every second one is itself escaped and so escapes nothing.
        true_escape = False
        true_escape_lookup = set()
        for escape_pos in escape_places:
            if escape_pos - lenescape in escape_lookup:
                true_escape = not true_escape
            else:
                true_escape = True
            if true_escape:
                true_escape_lookup.add(escape_pos)
        startdelim_places = [pos for pos in startdelim_places
                             if pos - lenescape not in true_escape_lookup]
        # End places are stored as the index just past the end delimiter.
        enddelim_places = [pos + lenend for pos in enddelim_places
                           if pos - lenescape not in true_escape_lookup]
    else:
        enddelim_places = [pos + lenend for pos in enddelim_places]

    # Sorted list of every position where the in-string state may change.
    significant_places = [0] + startdelim_places + enddelim_places + [len(source) - 1]
    significant_places.sort()
    start_lookup = set(startdelim_places)
    end_lookup = set(enddelim_places)

    extracted = ""
    # None makes the first slice start at the beginning of source when the
    # scan begins inside a section.
    lastpos = None
    for pos in significant_places:
        if instring and pos in end_lookup:
            # When startdelim == enddelim the delimiter that just opened the
            # section also shows up as an end place; do not close on it.
            if lastpos == pos - lenstart and lastpos in start_lookup:
                continue
            extracted += source[lastpos:pos]
            instring = False
            lastpos = pos
        if (not instring) and pos in start_lookup and not (enteredonce and not allowreentry):
            instring = True
            enteredonce = True
            lastpos = pos
    if instring:
        # Unterminated section: everything up to the end of source belongs to it.
        extracted += source[lastpos:]
    return (extracted, instring)
90
# NOTE(review): the def line is not visible in the reviewed text; the name and
# signature are reconstructed from the docstring and the body -- confirm.
def extractfromlines(lines, startdelim, enddelim, escape):
    """Run extract over successive lines, carrying the in-string state along.

    Each line's extracted text is appended to the result; processing stops
    after the first line that does not end inside a delimited section.
    """
    pieces = []
    still_open = 0
    for line in lines:
        piece, still_open = extract(line, startdelim, enddelim, escape, still_open)
        pieces.append(piece)
        if not still_open:
            break
    return "".join(pieces)
100
# NOTE(review): the def line is not visible in the reviewed text; the name and
# signature are reconstructed from the docstring and the body -- confirm.
def extractstr(source):
    """Return the double-quoted sections of source, with backslash as the escape.

    The quotes are kept in the result; the in-string flag that extract also
    returns is discarded.
    """
    return extract(source, '"', '"', '\\')[0]
105
109
# NOTE(review): the def line is not visible in the reviewed text. Parameter
# names come from the body and from the includeescapes= keyword call in
# testcase; the default values used here are assumptions to confirm.
def extractwithoutquotes(source, startdelim, enddelim, escape=None, startinstring=False, includeescapes=True, allowreentry=True):
    """Extract the delimited sections of source, without the delimiters.

    source is scanned for sections that open with startdelim and close with
    enddelim.  A delimiter directly preceded by an unescaped occurrence of
    escape does not count as a delimiter.  startinstring says whether the scan
    begins inside a section; when allowreentry is false only the first section
    opened is extracted.

    includeescapes controls what happens to escape sequences inside closed
    sections: a true value keeps them untouched, a false value drops the
    escape and keeps the escaped character, and a callable is handed each
    escape-plus-following-character slice and returns its replacement (a
    non-string return value is treated as a boolean keep/drop flag).

    Returns a tuple (extracted, instring): the concatenated section contents,
    and whether the scan was still inside a section when source ended.
    """
    instring = startinstring
    enteredonce = False
    lenstart = len(startdelim)
    lenend = len(enddelim)
    startdelim_places = find_all(source, startdelim)
    if startdelim == enddelim:
        enddelim_places = startdelim_places[:]
    else:
        enddelim_places = find_all(source, enddelim)

    # Kept as an ordered list because the sections below walk it in order.
    true_escape_places = []
    if escape is not None:
        lenescape = len(escape)
        escape_places = find_all(source, escape)
        # Sets keep the membership tests below O(1) instead of scanning lists.
        escape_lookup = set(escape_places)
        # Filter out escaped escapes: in a run of adjacent escape sequences
        # every second one is itself escaped and so escapes nothing.
        true_escape = False
        for escape_pos in escape_places:
            if escape_pos - lenescape in escape_lookup:
                true_escape = not true_escape
            else:
                true_escape = True
            if true_escape:
                true_escape_places.append(escape_pos)
        true_escape_lookup = set(true_escape_places)
        startdelim_places = [pos for pos in startdelim_places
                             if pos - lenescape not in true_escape_lookup]
        # End places are stored as the index just past the end delimiter.
        enddelim_places = [pos + lenend for pos in enddelim_places
                           if pos - lenescape not in true_escape_lookup]
    else:
        enddelim_places = [pos + lenend for pos in enddelim_places]

    # Sorted list of every position where the in-string state may change.
    significant_places = [0] + startdelim_places + enddelim_places + [len(source) - 1]
    significant_places.sort()
    start_lookup = set(startdelim_places)
    end_lookup = set(enddelim_places)

    extracted = ""
    # NOTE(review): with startinstring=True the first section is sliced from
    # lastpos + lenstart, i.e. the first len(startdelim) characters of source
    # are skipped although no delimiter was seen -- confirm this is intended.
    lastpos = 0
    callable_includeescapes = callable(includeescapes)
    checkescapes = callable_includeescapes or not includeescapes
    for pos in significant_places:
        # lastpos != pos - lenstart: when startdelim == enddelim the delimiter
        # that just opened the section must not also close it.
        if instring and pos in end_lookup and lastpos != pos - lenstart:
            section_start, section_end = lastpos + lenstart, pos - lenend
            section = source[section_start:section_end]
            if escape is not None and checkescapes:
                # Escape positions relative to the start of this section.
                escape_list = [epos - section_start for epos in true_escape_places
                               if section_start <= epos <= section_end]
                new_section = ""
                last_epos = 0
                for epos in escape_list:
                    new_section += section[last_epos:epos]
                    if callable_includeescapes:
                        replace_escape = includeescapes(section[epos:epos+lenescape+1])
                        # Older callbacks return a boolean instead of the
                        # replacement text: true keeps the escape sequence,
                        # false keeps only the escaped character.
                        if not isinstance(replace_escape, basestring):
                            if replace_escape:
                                replace_escape = section[epos:epos+lenescape+1]
                            else:
                                replace_escape = section[epos+lenescape:epos+lenescape+1]
                        new_section += replace_escape
                        last_epos = epos + lenescape + 1
                    else:
                        # Drop the escape; the escaped character is copied by
                        # the next slice.
                        last_epos = epos + lenescape
                section = new_section + section[last_epos:]
            extracted += section
            instring = False
            lastpos = pos
        if (not instring) and pos in start_lookup and not (enteredonce and not allowreentry):
            instring = True
            enteredonce = True
            lastpos = pos
    if instring:
        # Unterminated section: take everything up to the end of source.
        section_start = lastpos + lenstart
        section = source[section_start:]
        # NOTE(review): this guard is `not includeescapes`, not the
        # `checkescapes` flag used in the loop, so a callable includeescapes
        # is normally never consulted for an unterminated section -- confirm
        # whether that asymmetry is intended.
        if escape is not None and not includeescapes:
            escape_list = [epos - section_start for epos in true_escape_places
                           if section_start <= epos]
            new_section = ""
            last_epos = 0
            for epos in escape_list:
                new_section += section[last_epos:epos]
                if callable_includeescapes and includeescapes(section[epos:epos+lenescape+1]):
                    last_epos = epos
                else:
                    last_epos = epos + lenescape
            section = new_section + section[last_epos:]
        extracted += section
    return (extracted, instring)
194
# NOTE(review): the def line is not visible in the reviewed text; the name
# comes from the call sites, the escapeescapes default is an assumption.
def escapequotes(source, escapeescapes=0):
    """Return source with every double quote preceded by a backslash.

    When escapeescapes is true, existing backslashes are doubled first.
    """
    if escapeescapes:
        source = source.replace('\\', '\\\\')
    return source.replace('"', '\\"')
201
# NOTE(review): the def line is not visible in the reviewed text; the
# signature is reconstructed from the call site escapesinglequotes(source).
def escapesinglequotes(source):
    """Return source with every single quote doubled."""
    doubled_quote = "''"
    return doubled_quote.join(source.split("'"))
205
# NOTE(review): the def line is not visible in the reviewed text; the name and
# signature are reconstructed from the docstring and the body -- confirm.
def htmlentityencode(source):
    """Encode source using named HTML entities, e.g. "&" becomes "&amp;".

    Every character whose code point has a name in
    htmlentitydefs.codepoint2name is replaced by "&name;"; all other
    characters are copied through unchanged.
    """
    codepoint2name = htmlentitydefs.codepoint2name
    pieces = []
    for char in source:
        name = codepoint2name.get(ord(char))
        if name is None:
            # Copy the character itself: wrapping it in str() raises
            # UnicodeEncodeError on Python 2 for non-ASCII unicode characters
            # that have no named entity.
            pieces.append(char)
        else:
            pieces.append("&%s;" % name)
    return "".join(pieces)
216
# NOTE(review): the def line is not visible in the reviewed text; the name and
# signature are reconstructed from the docstring and the body -- confirm.
def htmlentitydecode(source):
    """Decode named HTML entities in source, e.g. "&amp;" becomes "&".

    Only names found in htmlentitydefs.name2codepoint are decoded.  Anything
    else that starts with "&" (an unknown name, a candidate cut short by a
    space, by another "&" or by the end of the input) is copied through
    unchanged.  The result is a unicode string.
    """
    name2codepoint = htmlentitydefs.name2codepoint
    output = u""
    inentity = False
    possibleentity = ""
    for char in source:
        if char == "&":
            if inentity:
                # A new "&" abandons the pending candidate; keep its text
                # instead of silently dropping it.
                output += "&" + possibleentity
            inentity = True
            possibleentity = ""
            continue
        if inentity:
            if char == ";":
                if possibleentity in name2codepoint:
                    output += unichr(name2codepoint[possibleentity])
                else:
                    output += "&" + possibleentity + ";"
                inentity = False
            elif char == " ":
                # Entity names cannot contain spaces, so this was plain text.
                output += "&" + possibleentity + char
                inentity = False
            else:
                possibleentity += char
        else:
            output += char
    if inentity:
        # The input ended inside an unterminated candidate; emit it verbatim.
        output += "&" + possibleentity
    return output
242
# NOTE(review): the def line is not visible in the reviewed text; the name and
# signature are reconstructed from the docstring and the body -- confirm.
def javapropertiesencode(source):
    """Encode source in the escaped-unicode form used by Java .properties files.

    Characters listed in controlchars are replaced by their escaped form,
    other characters below code point 128 are copied, and everything else
    becomes a \\uXXXX escape with upper-case hex digits.
    """
    def encode_char(char):
        """Return the encoded form of a single character."""
        if char in controlchars:
            return controlchars[char]
        codepoint = ord(char)
        if 0 <= codepoint < 128:
            return str(char)
        return u"\\u%04X" % codepoint

    return u"".join([encode_char(char) for char in source])
255
# NOTE(review): the def line is not visible in the reviewed text; the name and
# signature are reconstructed from the docstring and the body -- confirm.
def mozillapropertiesencode(source):
    """Encode source in the escaped form used by Mozilla .properties files.

    Characters listed in controlchars are replaced by their escaped form;
    every other character, including non-ASCII ones, is copied unchanged.
    The result is a unicode string.
    """
    # The code point of each character is not needed here (unlike the Java
    # variant), so the per-character ord() call has been dropped.
    return u"".join([controlchars.get(char, char) for char in source])
266
# Characters that may follow a backslash in an escaped string, mapped to the
# character that the two-character escape sequence decodes to.
propertyescapes = {
    # escapes that stand for the escaped character itself
    "\\": "\\", "'": "'", '"': '"',
    # escapes that stand for a control character
    "f": "\f", "n": "\n", "r": "\r", "t": "\t",
    }

# Characters that the .properties encoders replace, mapped to their
# backslash-escaped form.
controlchars = {
    # the backslash itself has to be escaped
    "\\": "\\\\",
    "\f": "\\f", "\n": "\\n", "\r": "\\r", "\t": "\\t"
    }
279
285
# NOTE(review): the def line is not visible in the reviewed text. The body
# reads the names `source` and `encoding`; the function name and the default
# value given to `encoding` here are assumptions -- confirm against callers.
def propertiesdecode(source, encoding="utf-8"):
    """Decode source from the escaped-unicode encoding used by .properties files.

    Java uses Latin1 by default, and Mozilla uses UTF-8 by default.

    A byte string is first decoded with encoding.  Backslash escapes listed in
    propertyescapes, \\uXXXX escapes and \\N{NAME} escapes are then decoded; a
    backslash followed by a newline is dropped together with the newline, and
    a backslash before any other character is simply removed.  Control
    characters produced by a \\u escape stay escaped unless they are listed in
    controlchars.  unicodedata.lookup raises KeyError for an unknown NAME.
    """
    hexdigits = "0123456789abcdef"
    output = u""
    s = 0
    if isinstance(source, str):
        source = source.decode(encoding)

    def unichr2(i):
        """Return the character with ordinal i, or a \\uXXXX escape for a
        control character that is not listed in controlchars."""
        if 32 <= i or unichr(i) in controlchars:
            # Known control characters are returned unescaped as well.
            return unichr(i)
        return "\\u%04x" % i

    while s < len(source):
        c = source[s]
        if c != '\\':
            output += c
            s += 1
            continue
        s += 1
        if s >= len(source):
            # A backslash at the very end implies a line continuation; hand it
            # back so the caller can see it.
            output += c
            continue
        c = source[s]
        s += 1
        if c == '\n':
            # Escaped newline: both characters are dropped.
            pass
        elif c in propertyescapes:
            output += propertyescapes[c]
        elif c in "uU":
            # Read up to four hex digits.  The accumulator is shifted only
            # after a digit has been validated, and only the digits actually
            # read are consumed, so a short or malformed escape neither yields
            # a shifted code point nor swallows the text that follows it.
            x = 0
            digits = 0
            while digits < 4 and s + digits < len(source):
                value = hexdigits.find(source[s + digits].lower())
                if value == -1:
                    break
                x = (x << 4) + value
                digits += 1
            s += digits
            output += unichr2(x)
        elif c == "N":
            # The bounds check keeps a trailing "\N" from raising IndexError.
            if s >= len(source) or source[s] != "{":
                logging.warning("Invalid named unicode escape: no { after \\N")
                output += "\\" + c
                continue
            s += 1
            e = source.find("}", s)
            if e == -1:
                logging.warning("Invalid named unicode escape: no } after \\N{")
                # Put the already consumed "{" back so no input text is lost.
                output += "\\" + c + "{"
                continue
            import unicodedata
            name = source[s:e]
            output += unicodedata.lookup(name)
            s = e + 1
        else:
            output += c
    return output
361
# NOTE(review): the def line is not visible in the reviewed text; the name and
# the escapeescapes default are assumptions -- confirm against callers.
def quotestr(source, escapeescapes=0):
    """Return source as a double-quoted string with embedded quotes escaped.

    Double quotes inside the text are escaped with a backslash (and
    backslashes too when escapeescapes is true).  When source is a list, each
    item is quoted on its own and the quoted items are joined with newlines;
    an empty list gives an empty string.
    """
    if isinstance(source, list):
        # join() also covers the empty list, which previously failed because
        # the result variable was never assigned.
        return "\n".join(['"' + escapequotes(line, escapeescapes) + '"' for line in source])
    return '"' + escapequotes(source, escapeescapes) + '"'
375
# NOTE(review): the def line is not visible in the reviewed text; the name and
# signature are reconstructed from the docstring and the body -- confirm.
def singlequotestr(source):
    """Return source wrapped in single quotes, with embedded single quotes doubled."""
    quote = "'"
    return quote + escapesinglequotes(source) + quote
379
386
# NOTE(review): the def line is not visible in the reviewed text; the name and
# signature are reconstructed from the body -- confirm.
def findend(string, substring):
    """Return the index just past the first occurrence of substring in string.

    Returns -1 when substring does not occur.
    """
    start = string.find(substring)
    if start == -1:
        return -1
    return start + len(substring)
392
# NOTE(review): the def line is not visible in the reviewed text; the name and
# signature are reconstructed from the body -- confirm.
def rstripeol(string):
    """Return string with all trailing carriage returns and line feeds removed."""
    eol_chars = "\r\n"
    return string.rstrip(eol_chars)
395
404
407
# NOTE(review): the def line is not visible in the reviewed text; the name and
# signature are reconstructed from the docstring and the body -- confirm.
def encodewithdict(unencoded, encodedict):
    """Return unencoded with every key of encodedict replaced by its value.

    The replacements are applied one after another, in the iteration order of
    encodedict, each one working on the result of the previous ones.
    """
    result = unencoded
    for plain, replacement in encodedict.items():
        if plain not in result:
            continue
        result = result.replace(plain, replacement)
    return result
415
# NOTE(review): the def line is not visible in the reviewed text; the name and
# signature are reconstructed from the docstring and the body -- confirm.
def makeutf8(d):
    """Replace each int value of d by the UTF-8 encoding of that code point.

    The dictionary is modified in place and also returned; values of any
    other type are left alone.
    """
    for key in list(d):
        value = d[key]
        if type(value) == int:
            d[key] = unichr(value).encode('utf8')
    return d
422
# NOTE(review): the def line is not visible in the reviewed text; the name
# comes from the testcase() call in the script guard.
def testcase():
    """Print the results of extract and extractwithoutquotes on a sample string.

    Both functions are run without an escape and with "!" as the escape, and
    extractwithoutquotes is run once more with includeescapes switched off.
    """
    sample = ' "this" " is " "a" " test!" '
    for escape in (None, '!'):
        print(extract(sample, '"', '"', escape))
    for escape in (None, '!'):
        print(extractwithoutquotes(sample, '"', '"', escape))
    print(extractwithoutquotes(sample, '"', '"', '!', includeescapes=False))
430
# Run the demonstration only when this module is executed as a script.
if __name__ == '__main__':
    testcase()
433