1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 """Some text manipulation utility functions.
19
20
21
22
23
24
25 :group text formatting: normalize_text, normalize_paragraph, pretty_match,\
26 unquote, colorize_ansi
27 :group text manipulation: searchall, splitstrip
28 :sort: text formatting, text manipulation
29
30 :type ANSI_STYLES: dict(str)
31 :var ANSI_STYLES: dictionary mapping style identifier to ANSI terminal code
32
33 :type ANSI_COLORS: dict(str)
34 :var ANSI_COLORS: dictionary mapping color identifier to ANSI terminal code
35
36 :type ANSI_PREFIX: str
37 :var ANSI_PREFIX:
38 ANSI terminal code notifying the start of an ANSI escape sequence
39
40 :type ANSI_END: str
41 :var ANSI_END:
42 ANSI terminal code notifying the end of an ANSI escape sequence
43
44 :type ANSI_RESET: str
45 :var ANSI_RESET:
46 ANSI terminal code resetting format defined by a previous ANSI escape sequence
47 """
48 __docformat__ = "restructuredtext en"
49
50 import sys
51 import re
52 from unicodedata import normalize as _uninormalize
53 try:
54 from os import linesep
55 except ImportError:
56 linesep = '\n'
57
58 from logilab.common.deprecation import deprecated
59
# Hand-written ASCII replacements, looked up before any Unicode
# decomposition is attempted on a character.
MANUAL_UNICODE_MAP = dict([
    (u'\xa1', u'!'),      # inverted exclamation mark
    (u'\u0142', u'l'),    # l with stroke
    (u'\u2044', u'/'),    # fraction slash
    (u'\xc6', u'AE'),     # AE ligature
    (u'\xa9', u'(c)'),    # copyright sign
    (u'\xab', u'"'),      # left guillemet
    (u'\xe6', u'ae'),     # ae ligature
    (u'\xae', u'(r)'),    # registered sign
    (u'\u0153', u'oe'),   # oe ligature
    (u'\u0152', u'OE'),   # OE ligature
    (u'\xd8', u'O'),      # O with stroke
    (u'\xf8', u'o'),      # o with stroke
    (u'\xbb', u'"'),      # right guillemet
    (u'\xdf', u'ss'),     # sharp s
])
76
78 """replace diacritical characters with their corresponding ascii characters

Each character of `ustring` is first looked up in `MANUAL_UNICODE_MAP`.
The `KeyError` handler deals with characters missing from that map: code
points >= 2**8 are dropped or rejected, and the first element of the
character's NFD decomposition is taken as replacement.

:param ustring: the text to convert, processed one character at a time
:param ignorenonascii:
  when true, an unmapped character with a code point >= 2**8 is skipped
  instead of raising

:raise ValueError:
  for an unmapped character with a code point >= 2**8 when
  `ignorenonascii` is false

:return: the replacement strings joined into one unicode string
79 """
80 res = []
# iterate over a copy of the input
81 for letter in ustring[:]:
82 try:
# explicit substitutions are tried first
83 replacement = MANUAL_UNICODE_MAP[letter]
84 except KeyError:
# 2**8 == 256: anything at or above it is outside Latin-1
85 if ord(letter) >= 2**8:
86 if ignorenonascii:
87 continue
88 raise ValueError("can't deal with non-ascii based characters")
# keep only the first element of the decomposed form
89 replacement = _uninormalize('NFD', letter)[0]
90 res.append(replacement)
91 return u''.join(res)
92
def unquote(string):
    """remove optional quotes (simple or double) from the string

    A leading quote and a trailing quote are each removed independently, so
    an unbalanced input such as ``'"abc'`` also loses its single quote.

    :type string: str or unicode
    :param string: an optionally quoted string

    :rtype: str or unicode
    :return: the unquoted string (or the input string if it wasn't quoted)
    """
    if not string:
        return string
    if string[0] in '"\'':
        string = string[1:]
    # The string may now be empty (input was a lone quote character), so it
    # must be re-checked before indexing its last character.
    if string and string[-1] in '"\'':
        string = string[:-1]
    return string
109
110
# Two consecutive line breaks, each optionally preceded by a carriage return.
_BLANKLINES_RGX = re.compile(r'\r?\n\r?\n')
# Any run of whitespace. Raw string: '\s' is not a valid escape in a plain
# string literal and triggers a warning on recent Python versions.
_NORM_SPACES_RGX = re.compile(r'\s+')
113
def normalize_text(text, line_len=80, indent='', rest=False):
    """wrap a text made of several paragraphs

    The text is cut into paragraphs on blank lines; each paragraph is
    formatted separately and the results are joined again with a line
    containing only the indentation string.

    :type text: str or unicode
    :param text: the input text to normalize

    :type line_len: int
    :param line_len: expected maximum line's length, default to 80

    :type indent: str or unicode
    :param indent:
      optional string to use as indentation (a comment (#) or quoting (>)
      mark for instance)

    :type rest: bool
    :param rest:
      when true, paragraphs are formatted with `normalize_rest_paragraph`
      instead of `normalize_paragraph`

    :rtype: str or unicode
    :return: the formatted paragraphs, separated by an indented empty line
    """
    normp = normalize_rest_paragraph if rest else normalize_paragraph
    paragraphs = [normp(paragraph, line_len, indent)
                  for paragraph in _BLANKLINES_RGX.split(text)]
    separator = '%s%s%s' % (linesep, indent, linesep)
    return separator.join(paragraphs)
143
144
def normalize_paragraph(text, line_len=80, indent=''):
    """wrap a single paragraph to a maximum line size

    Every run of whitespace (line jumps included) is first collapsed to one
    space, then the text is cut into lines with `splittext`, each of them
    prefixed with the indentation string.

    :type text: str or unicode
    :param text: the input text to normalize

    :type line_len: int
    :param line_len: expected maximum line's length, default to 80

    :type indent: str or unicode
    :param indent:
      optional string to use as indentation (to insert a comment mark for
      instance); its length is deducted from `line_len`

    :rtype: str or unicode
    :return: the wrapped lines joined with the platform line separator
    """
    remaining = _NORM_SPACES_RGX.sub(' ', text)
    width = line_len - len(indent)
    wrapped = []
    while remaining:
        chunk, remaining = splittext(remaining.strip(), width)
        wrapped.append(indent + chunk)
    return linesep.join(wrapped)
173
175 """normalize a ReST text to display it with a maximum line size and
176 optionally arbitrary indentation. Line jumps are normalized. The
177 indentation string may be used to insert a comment mark for
178 instance.
179
180 :type text: str or unicode
181 :param text: the input text to normalize
182
183 :type line_len: int
184 :param line_len: expected maximum line's length, default to 80
185
186 :type indent: str or unicode
187 :param indent: optional string to use as indentation
188
189 :rtype: str or unicode
190 :return:
191 the input text normalized to fit on lines with a maximized size
192 inferior to `line_len`, and optionally prefixed by an
193 indentation string
194 """
# text carried over from a line that had to be split
195 toreport = ''
196 lines = []
# the indentation string counts against the available width
197 line_len = line_len - len(indent)
198 for line in text.splitlines():
# strip the line, collapse whitespace runs, prepend any carried-over text
199 line = toreport + _NORM_SPACES_RGX.sub(' ', line.strip())
200 toreport = ''
201 while len(line) > line_len:
202
# line too long: emit the first part returned by splittext
203 line, toreport = splittext(line, line_len)
204 lines.append(indent + line)
205 if toreport:
# the remainder becomes the line examined next
206 line = toreport + ' '
207 toreport = ''
208 else:
209 line = ''
210 if line:
211 lines.append(indent + line.strip())
212 return linesep.join(lines)
213
def splittext(text, line_len):
    """split the given text on space according to the given max line size

    The cut is made on the last space found at or before `line_len`; when
    there is none, on the first space found after it (the line is then
    longer than requested). A non-positive `line_len` is handled as 0.

    :type text: str or unicode
    :param text: the text to split

    :type line_len: int
    :param line_len: the wanted maximum size of the first line

    :rtype: tuple
    :return:
      a 2-uple:
      * a line <= line_len if possible
      * the rest of the text which has to be reported on another line
    """
    # A negative width (e.g. an indentation longer than the line length)
    # would give a negative `pos`, which indexes from the end of the string
    # and returns the whole text as "rest": callers looping until the rest
    # is empty would never terminate.
    line_len = max(line_len, 0)
    if len(text) <= line_len:
        return text, ''
    pos = min(len(text) - 1, line_len)
    # look backward for a space to cut on
    while pos > 0 and text[pos] != ' ':
        pos -= 1
    if pos == 0:
        # no space before the limit: cut on the first one after it
        pos = min(len(text), line_len)
        while len(text) > pos and text[pos] != ' ':
            pos += 1
    return text[:pos], text[pos + 1:].strip()
231
232
def splitstrip(string, sep=','):
    """split a string on `sep` and return the non-empty stripped fields

    >>> splitstrip('a, b, c ,  4,,')
    ['a', 'b', 'c', '4']
    >>> splitstrip('a')
    ['a']
    >>>

    :type string: str or unicode
    :param string: a csv line

    :type sep: str or unicode
    :param sep: field separator, default to the comma (',')

    :rtype: list
    :return:
      the fields with surrounding whitespace removed; fields that are empty
      once stripped are discarded
    """
    stripped = (field.strip() for field in string.split(sep))
    return [field for field in stripped if field]
253
# Alias of `splitstrip`, wrapped with the `deprecated` helper imported from
# logilab.common.deprecation (presumably warns on use -- TODO confirm).
254 get_csv = deprecated()(splitstrip)
255
# Runs of whitespace and/or commas.
_BLANK_URE = r'(\s|,)+'
_BLANK_RE = re.compile(_BLANK_URE)
# Optionally negative number: digits with a decimal point, or plain digits
# optionally prefixed with "0" / "0x".
__VALUE_URE = r'-?(([0-9]+\.[0-9]*)|((0x?)?[0-9]+))'
# Unit suffix: ASCII letters only.
__UNITS_URE = r'[a-zA-Z]+'
# A value followed by an optional unit, exposed as the named groups
# "value" and "unit".
_VALUE_RE = re.compile(
    r'(?P<value>' + __VALUE_URE + r')(?P<unit>' + __UNITS_URE + r')?')
261
# Multiplier for each size unit suffix: successive powers of 1024.
BYTE_UNITS = {
    "b": 1,
    "kb": 1 << 10,
    "mb": 1 << 20,
    "gb": 1 << 30,
    "tb": 1 << 40,
}
269
# Multiplier for each time unit suffix, expressed in seconds ("s" is 1).
TIME_UNITS = {
    "ms": 0.001,  # one millisecond is a thousandth of a second
    "s": 1,
    "min": 60,
    "h": 60 * 60,
    "d": 60 * 60 * 24,
}
277
280 """Parse the string applying the units defined in units
281 (e.g.: "1.5m", {'m': 60} -> 90).
282
283 :type string: str or unicode
284 :param string: the string to parse
285
286 :type units: dict (or any object with __getitem__ using basestring key)
287 :param units: a dict mapping a unit string repr to its value
288
289 :type inter: type
290 :param inter: used to parse every intermediate value; the results must
support `*=` with a unit value and `sum`. Defaults to `final` when None.

:param final: callable applied to the sum of all parsed values
291
292 :type blank_reg: regexp
293 :param blank_reg: should match every blank char to ignore.
294
295 :type value_reg: regexp with "value" and optional "unit" group
296 :param value_reg: match a value and its unit into the "value" and
"unit" named groups

:raise KeyError: if a matched unit (lowercased) is not a key of `units`
297 """
298 if inter is None:
299 inter = final
# NOTE(review): this uses the module-level _BLANK_RE, not the `blank_reg`
# described in the docstring -- confirm whether `blank_reg` was intended.
300 string = _BLANK_RE.sub('',string)
301 values = []
302 for match in value_reg.finditer(string):
303 dic = match.groupdict()
304
305
306 lit, unit = dic["value"], dic.get("unit")
307 value = inter(lit)
308 if unit is not None:
309 try:
# units are looked up case-insensitively (lowercased key)
310 value *= units[unit.lower()]
311 except KeyError:
312 raise KeyError('invalid unit %s. valid units are %s' %
313 (unit, units.keys()))
314 values.append(value)
315 return final(sum(values))
316
# Any flavour of line break: CRLF, a run of CR, or a single LF.
_LINE_RGX = re.compile('\r\n|\r+|\n')


def pretty_match(match, string, underline_char='^'):
    """return a string with the match location underlined:

    >>> import re
    >>> print(pretty_match(re.search('mange', 'il mange du bacon'), 'il mange du bacon'))
    il mange du bacon
       ^^^^^
    >>>

    Line breaks of `string` are first replaced with the platform line
    separator; the underline is inserted right after the line on which the
    match ends, and trailing whitespace of the result is stripped.

    :type match: _sre.SRE_match
    :param match: object returned by re.match, re.search or re.finditer

    :type string: str or unicode
    :param string:
      the string on which the regular expression has been applied to
      obtain the `match` object

    :type underline_char: str or unicode
    :param underline_char:
      character to use to underline the matched section, default to the
      caret '^'

    :rtype: str or unicode
    :return:
      the original string with an inserted line to underline the match
      location
    """
    begin, stop = match.start(), match.end()
    text = _LINE_RGX.sub(linesep, string)
    sep_len = len(linesep)
    pieces = []
    # locate the beginning of the line holding the start of the match
    line_start = text.rfind(linesep, 0, begin)
    if line_start != -1:
        # keep everything preceding that line untouched
        pieces.append(text[:line_start])
        line_start += sep_len
    else:
        line_start = 0
    marker = ' ' * (begin - line_start) + underline_char * (stop - begin)
    # locate the end of the line holding the end of the match
    line_end = text.find(linesep, stop)
    if line_end == -1:
        pieces.extend([text[line_start:], marker])
    else:
        pieces.extend([text[line_start:line_end], marker,
                       text[line_end + sep_len:]])
    return linesep.join(pieces).rstrip()
370
371
372
373
# Start of an escape sequence, its terminator, and the complete sequence
# that resets any previously applied formatting.
ANSI_PREFIX = '\033['
ANSI_END = 'm'
ANSI_RESET = ANSI_PREFIX + '0' + ANSI_END

# style identifier -> ANSI terminal code
ANSI_STYLES = dict(
    reset="0",
    bold="1",
    italic="3",
    underline="4",
    blink="5",
    inverse="7",
    strike="9",
)

# color identifier -> ANSI terminal code
ANSI_COLORS = dict(
    reset="0",
    black="30",
    red="31",
    green="32",
    yellow="33",
    blue="34",
    magenta="35",
    cyan="36",
    white="37",
)
397
def _get_ansi_code(color=None, style=None):
    """build the ansi escape code matching a color and a style

    :type color: str or None
    :param color:
      the color name (see `ANSI_COLORS` for available values)
      or the color number when 256 colors are available

    :type style: str or None
    :param style:
      style string (see `ANSI_STYLES` for available values). To get
      several style effects at the same time, use a comma as separator.

    :raise KeyError: if an unknown color or style identifier is given

    :rtype: str
    :return:
      the built escape code, or an empty string when neither a color nor
      a style is given
    """
    codes = []
    if style:
        codes.extend(ANSI_STYLES[effect] for effect in splitstrip(style))
    if color:
        if color.isdigit():
            # numeric color: "38;5;<n>" selector followed by the number
            codes += ['38', '5', color]
        else:
            codes.append(ANSI_COLORS[color])
    if not codes:
        return ''
    return ANSI_PREFIX + ';'.join(codes) + ANSI_END
430
def colorize_ansi(msg, color=None, style=None):
    """wrap a message with ansi escape codes to colorize it

    :type msg: str or unicode
    :param msg: the message string to colorize

    :type color: str or None
    :param color:
      the color identifier (see `ANSI_COLORS` for available values)

    :type style: str or None
    :param style:
      style string (see `ANSI_STYLES` for available values). To get
      several style effects at the same time, use a comma as separator.

    :raise KeyError: if an unknown color or style identifier is given

    :rtype: str or unicode
    :return:
      the message preceded by the escape code and followed by
      `ANSI_RESET`, or the message itself when there is nothing to apply
    """
    # nothing requested: the message is returned as is
    if color is None and style is None:
        return msg
    escape_code = _get_ansi_code(color, style)
    # an empty code (e.g. empty strings given) leaves the message untouched
    if not escape_code:
        return msg
    return '%s%s%s' % (escape_code, msg, ANSI_RESET)
459
# Color identifier for each kind of diff line.
DIFF_STYLE = dict(separator='cyan', remove='red', add='green')
461
476