Package logilab :: Package common :: Module textutils
[frames | no frames]

Source Code for Module logilab.common.textutils

  1  # copyright 2003-2010 LOGILAB S.A. (Paris, FRANCE), all rights reserved. 
  2  # contact http://www.logilab.fr/ -- mailto:contact@logilab.fr 
  3  # 
  4  # This file is part of logilab-common. 
  5  # 
  6  # logilab-common is free software: you can redistribute it and/or modify it under 
  7  # the terms of the GNU Lesser General Public License as published by the Free 
  8  # Software Foundation, either version 2.1 of the License, or (at your option) any 
  9  # later version. 
 10  # 
 11  # logilab-common is distributed in the hope that it will be useful, but WITHOUT 
 12  # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS 
 13  # FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more 
 14  # details. 
 15  # 
 16  # You should have received a copy of the GNU Lesser General Public License along 
 17  # with logilab-common.  If not, see <http://www.gnu.org/licenses/>. 
 18  """Some text manipulation utility functions. 
 19   
 20   
 21   
 22   
 23   
 24   
 25  :group text formatting: normalize_text, normalize_paragraph, pretty_match,\ 
 26  unquote, colorize_ansi 
 27  :group text manipulation: searchall, splitstrip 
 28  :sort: text formatting, text manipulation 
 29   
 30  :type ANSI_STYLES: dict(str) 
 31  :var ANSI_STYLES: dictionary mapping style identifier to ANSI terminal code 
 32   
 33  :type ANSI_COLORS: dict(str) 
 34  :var ANSI_COLORS: dictionary mapping color identifier to ANSI terminal code 
 35   
 36  :type ANSI_PREFIX: str 
 37  :var ANSI_PREFIX: 
 38    ANSI terminal code notifying the start of an ANSI escape sequence 
 39   
 40  :type ANSI_END: str 
 41  :var ANSI_END: 
 42    ANSI terminal code notifying the end of an ANSI escape sequence 
 43   
 44  :type ANSI_RESET: str 
 45  :var ANSI_RESET: 
 46    ANSI terminal code resetting format defined by a previous ANSI escape sequence 
 47  """ 
 48  __docformat__ = "restructuredtext en" 
 49   
 50  import sys 
 51  import re 
 52  from unicodedata import normalize as _uninormalize 
 53  try: 
 54      from os import linesep 
 55  except ImportError: 
 56      linesep = '\n' # gae 
 57   
 58  from logilab.common.deprecation import deprecated 
 59   
 60  MANUAL_UNICODE_MAP = { 
 61      u'\xa1': u'!',    # INVERTED EXCLAMATION MARK 
 62      u'\u0142': u'l',  # LATIN SMALL LETTER L WITH STROKE 
 63      u'\u2044': u'/',  # FRACTION SLASH 
 64      u'\xc6': u'AE',   # LATIN CAPITAL LETTER AE 
 65      u'\xa9': u'(c)',  # COPYRIGHT SIGN 
 66      u'\xab': u'"',    # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK 
 67      u'\xe6': u'ae',   # LATIN SMALL LETTER AE 
 68      u'\xae': u'(r)',  # REGISTERED SIGN 
 69      u'\u0153': u'oe', # LATIN SMALL LIGATURE OE 
 70      u'\u0152': u'OE', # LATIN CAPITAL LIGATURE OE 
 71      u'\xd8': u'O',    # LATIN CAPITAL LETTER O WITH STROKE 
 72      u'\xf8': u'o',    # LATIN SMALL LETTER O WITH STROKE 
 73      u'\xbb': u'"',    # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK 
 74      u'\xdf': u'ss',   # LATIN SMALL LETTER SHARP S 
 75      } 
 76   
77 -def unormalize(ustring, ignorenonascii=False):
78 """replace diacritical characters with their corresponding ascii characters 79 """ 80 res = [] 81 for letter in ustring[:]: 82 try: 83 replacement = MANUAL_UNICODE_MAP[letter] 84 except KeyError: 85 if ord(letter) >= 2**8: 86 if ignorenonascii: 87 continue 88 raise ValueError("can't deal with non-ascii based characters") 89 replacement = _uninormalize('NFD', letter)[0] 90 res.append(replacement) 91 return u''.join(res)
92
def unquote(string):
    """remove optional quotes (simple or double) from the string

    A leading quote and a trailing quote are removed independently of each
    other; they do not have to match.

    :type string: str or unicode
    :param string: an optionally quoted string

    :rtype: str or unicode
    :return: the unquoted string (or the input string if it wasn't quoted)
    """
    if not string:
        return string
    if string[0] in '"\'':
        string = string[1:]
    # The string may be empty by now (input was a lone quote character), in
    # which case indexing its last character would raise IndexError.
    if string and string[-1] in '"\'':
        string = string[:-1]
    return string
# Paragraph separator: two consecutive line ends, each optionally preceded
# by a carriage return.
_BLANKLINES_RGX = re.compile('\r?\n\r?\n')
# Any run of whitespace.  Raw string: '\s' is not a valid string escape and
# triggers a warning on recent Python versions when written non-raw.
_NORM_SPACES_RGX = re.compile(r'\s+')
def normalize_text(text, line_len=80, indent='', rest=False):
    """reflow a text, paragraph by paragraph, to a maximum line size with
    an optional indentation string. Paragraphs are the chunks separated by
    blank lines; the blank lines between them are kept. The indentation
    string may be used to insert a comment (#) or a quoting (>) mark for
    instance.

    :type text: str or unicode
    :param text: the input text to normalize

    :type line_len: int
    :param line_len: expected maximum line's length, default to 80

    :type indent: str or unicode
    :param indent: optional string to use as indentation

    :type rest: bool
    :param rest:
      when true, paragraphs are reflowed with `normalize_rest_paragraph`
      instead of `normalize_paragraph`

    :rtype: str or unicode
    :return:
      the reflowed paragraphs, joined by a line made of the indentation
      string only
    """
    reflow = normalize_rest_paragraph if rest else normalize_paragraph
    paragraphs = [reflow(chunk, line_len, indent)
                  for chunk in _BLANKLINES_RGX.split(text)]
    separator = '%s%s%s' % (linesep, indent, linesep)
    return separator.join(paragraphs)
143 144
def normalize_paragraph(text, line_len=80, indent=''):
    """reflow a single paragraph to a maximum line size with an optional
    indentation string. Every run of whitespace (line jumps included) is
    first collapsed to one space. The indentation string may be used to
    insert a comment mark for instance.

    :type text: str or unicode
    :param text: the input text to normalize

    :type line_len: int
    :param line_len: expected maximum line's length, default to 80

    :type indent: str or unicode
    :param indent: optional string to use as indentation

    :rtype: str or unicode
    :return:
      the lines produced by `splittext`, each prefixed by the indentation
      string and joined with the platform line separator
    """
    remaining = _NORM_SPACES_RGX.sub(' ', text)
    # the indentation counts against the available width
    width = line_len - len(indent)
    wrapped = []
    while remaining:
        head, remaining = splittext(remaining.strip(), width)
        wrapped.append(indent + head)
    return linesep.join(wrapped)
173
def normalize_rest_paragraph(text, line_len=80, indent=''):
    """reflow a ReST paragraph to a maximum line size with an optional
    indentation string. Each input line is handled on its own: whitespace
    is normalized and the line is split with `splittext` while it is longer
    than the available width. The indentation string may be used to insert
    a comment mark for instance.

    :type text: str or unicode
    :param text: the input text to normalize

    :type line_len: int
    :param line_len: expected maximum line's length, default to 80

    :type indent: str or unicode
    :param indent: optional string to use as indentation

    :rtype: str or unicode
    :return:
      the resulting lines, each prefixed by the indentation string and
      joined with the platform line separator
    """
    width = line_len - len(indent)
    wrapped = []
    carry = ''
    for raw_line in text.splitlines():
        current = carry + _NORM_SPACES_RGX.sub(' ', raw_line.strip())
        carry = ''
        while len(current) > width:
            # too long: emit the head, keep working on what is left
            current, carry = splittext(current, width)
            wrapped.append(indent + current)
            current = (carry + ' ') if carry else ''
            carry = ''
        if current:
            wrapped.append(indent + current.strip())
    return linesep.join(wrapped)
213
def splittext(text, line_len):
    """split the given text on space according to the given max line size

    The split happens on the last space found at an index between 1 and
    `line_len`; if there is none, on the first space at or after
    `line_len`; if there is none either, the whole text is returned as the
    line.

    :type text: str or unicode
    :param text: the text to split

    :type line_len: int
    :param line_len:
      maximum size of the returned line; negative values are treated as 0

    :rtype: tuple
    :return: a 2-uple:

      * a line <= line_len if possible
      * the rest of the text (stripped) which has to be reported on
        another line
    """
    # A negative width would be used as a negative slice index below,
    # mangling the split and possibly never consuming the text.
    line_len = max(line_len, 0)
    if len(text) <= line_len:
        return text, ''
    pos = text.rfind(' ', 1, line_len + 1)
    if pos == -1:
        # no break point inside the allowed width: overflow up to the next
        # space (or the end of the text) rather than cutting a word
        pos = text.find(' ', line_len)
        if pos == -1:
            pos = len(text)
    return text[:pos], text[pos + 1:].strip()
231 232
def splitstrip(string, sep=','):
    """return a list of stripped string by splitting the string given as
    argument on `sep` (',' by default). Empty string are discarded.

    >>> splitstrip('a, b, c   ,  4,,')
    ['a', 'b', 'c', '4']
    >>> splitstrip('a')
    ['a']
    >>>

    :type string: str or unicode
    :param string: a csv line

    :type sep: str or unicode
    :param sep: field separator, default to the comma (',')

    :rtype: list of str or unicode
    :return: the non-empty fields, stripped of surrounding whitespace
    """
    # strip each field once, then drop the ones that end up empty
    stripped = (word.strip() for word in string.split(sep))
    return [word for word in stripped if word]
# Deprecated alias of splitstrip.
get_csv = deprecated()(splitstrip)

# Patterns used by apply_units: blanks (whitespace or commas) to discard,
# and a numeric literal optionally followed by an alphabetic unit.
_BLANK_URE = r'(\s|,)+'
_BLANK_RE = re.compile(_BLANK_URE)
__VALUE_URE = r'-?(([0-9]+\.[0-9]*)|((0x?)?[0-9]+))'
__UNITS_URE = r'[a-zA-Z]+'
_VALUE_RE = re.compile(r'(?P<value>%s)(?P<unit>%s)?'%(__VALUE_URE,__UNITS_URE))

# Unit name -> number of bytes.
BYTE_UNITS = {
    "b": 1,
    "kb": 1024,
    "mb": 1024 ** 2,
    "gb": 1024 ** 3,
    "tb": 1024 ** 4,
}

# Unit name -> number of seconds.
TIME_UNITS = {
    "ms": 0.001,  # one millisecond is a thousandth of a second
    "s": 1,
    "min": 60,
    "h": 60 * 60,
    "d": 60 * 60 *24,
}
def apply_units(string, units, inter=None, final=float, blank_reg=_BLANK_RE,
                value_reg=_VALUE_RE):
    """Parse the string applying the units defined in units
    (e.g.: "1.5m", {'m': 60} -> 90.0).

    Every "value + optional unit" found in the string is converted with
    `inter`, multiplied by its unit factor if it has one, and the results
    are summed.

    :type string: str or unicode
    :param string: the string to parse

    :type units: dict (or any object with __getitem__ using basestring key)
    :param units:
      a dict mapping a unit string repr to its value; unit names are looked
      up lower-cased

    :type inter: type
    :param inter:
      used to parse every intermediate value (the results are added with
      `sum`); defaults to `final`

    :type final: type
    :param final: used to convert the summed result, default to float

    :type blank_reg: regexp
    :param blank_reg: should match every blank char to ignore.

    :type value_reg: regexp with "value" and optional "unit" group
    :param value_reg: match a value and its unit

    :raise KeyError: if a unit found in the string is not a key of `units`

    :return: `final` applied to the sum of all parsed values
    """
    if inter is None:
        inter = final
    # honour the caller-supplied blank pattern (the module default used to
    # be applied unconditionally)
    string = blank_reg.sub('', string)
    values = []
    for match in value_reg.finditer(string):
        groups = match.groupdict()
        # .get(): the "unit" group is optional in a custom value_reg
        lit, unit = groups["value"], groups.get("unit")
        value = inter(lit)
        if unit is not None:
            try:
                value *= units[unit.lower()]
            except KeyError:
                raise KeyError('invalid unit %s. valid units are %s' %
                               (unit, list(units.keys())))
        values.append(value)
    return final(sum(values))
# Any flavour of line ending: CRLF, a run of CR, or LF.
_LINE_RGX = re.compile('\r\n|\r+|\n')


def pretty_match(match, string, underline_char='^'):
    """return a string with the match location underlined:

    >>> import re
    >>> print(pretty_match(re.search('mange', 'il mange du bacon'), 'il mange du bacon'))
    il mange du bacon
       ^^^^^
    >>>

    Line endings of `string` are normalized to the platform line separator
    in the returned text.

    :type match: _sre.SRE_match
    :param match: object returned by re.match, re.search or re.finditer

    :type string: str or unicode
    :param string:
      the string on which the regular expression has been applied to
      obtain the `match` object

    :type underline_char: str or unicode
    :param underline_char:
      character to use to underline the matched section, default to the
      caret '^'

    :rtype: str or unicode
    :return:
      the original string with an inserted line to underline the match
      location
    """
    start, end = match.span()
    # Normalize the text before, inside and after the match separately so
    # the match offsets can be recomputed for the normalized string:
    # replacing line endings may change the length of what precedes the
    # match, which would shift the underline.
    head = _LINE_RGX.sub(linesep, string[:start])
    matched = _LINE_RGX.sub(linesep, string[start:end])
    tail = _LINE_RGX.sub(linesep, string[end:])
    string = head + matched + tail
    start = len(head)
    end = start + len(matched)

    # beginning of the line on which the match starts
    start_line_pos = string.rfind(linesep, 0, start)
    if start_line_pos == -1:
        start_line_pos = 0
        result = []
    else:
        result = [string[:start_line_pos]]
        start_line_pos += len(linesep)
    offset = start - start_line_pos
    underline = ' ' * offset + underline_char * (end - start)

    # end of the line on which the match ends
    end_line_pos = string.find(linesep, end)
    if end_line_pos == -1:
        result.append(string[start_line_pos:])
        result.append(underline)
    else:
        result.append(string[start_line_pos:end_line_pos])
        result.append(underline)
        result.append(string[end_line_pos + len(linesep):])
    return linesep.join(result).rstrip()
# Ansi colorization ###########################################################

# start / end of an ANSI escape sequence, and the sequence resetting any
# formatting set by a previous one
ANSI_PREFIX = '\033['
ANSI_END = 'm'
ANSI_RESET = '\033[0m'

# style identifier -> ANSI terminal code
ANSI_STYLES = dict(
    reset="0",
    bold="1",
    italic="3",
    underline="4",
    blink="5",
    inverse="7",
    strike="9",
)

# color identifier -> ANSI terminal code
ANSI_COLORS = dict(
    reset="0",
    black="30",
    red="31",
    green="32",
    yellow="33",
    blue="34",
    magenta="35",
    cyan="36",
    white="37",
)
def _get_ansi_code(color=None, style=None):
    """build the ansi escape code selecting the given color and style

    :type color: str or None
    :param color:
      the color name (see `ANSI_COLORS` for available values)
      or the color number when 256 colors are available

    :type style: str or None
    :param style:
      style string (see `ANSI_STYLES` for available values). To get
      several style effects at the same time, use a comma as separator.

    :raise KeyError: if a nonexistent color or style identifier is given

    :rtype: str
    :return:
      the built escape code, or an empty string when neither a color nor
      a style is given
    """
    codes = []
    if style:
        codes.extend(ANSI_STYLES[effect] for effect in splitstrip(style))
    if color:
        if color.isdigit():
            # numeric color: 256-colors foreground selector
            codes += ['38', '5', color]
        else:
            codes.append(ANSI_COLORS[color])
    if not codes:
        return ''
    return ANSI_PREFIX + ';'.join(codes) + ANSI_END
430
def colorize_ansi(msg, color=None, style=None):
    """wrap a message between an ansi escape code and the ansi reset code

    :type msg: str or unicode
    :param msg: the message string to colorize

    :type color: str or None
    :param color:
      the color identifier (see `ANSI_COLORS` for available values)

    :type style: str or None
    :param style:
      style string (see `ANSI_STYLES` for available values). To get
      several style effects at the same time, use a comma as separator.

    :raise KeyError: if a nonexistent color or style identifier is given

    :rtype: str or unicode
    :return:
      the ansi escaped string, or `msg` unchanged when no escape code is
      produced
    """
    # only build an escape code when at least one of color/style is given
    if color is not None or style is not None:
        escape_code = _get_ansi_code(color, style)
        # an empty code (e.g. empty color and style strings) leaves msg as is
        if escape_code:
            return '%s%s%s' % (escape_code, msg, ANSI_RESET)
    return msg
# Default colors used by diff_colorize_ansi for each kind of diff line.
DIFF_STYLE = {'separator': 'cyan', 'remove': 'red', 'add': 'green'}


def diff_colorize_ansi(lines, out=sys.stdout, style=DIFF_STYLE):
    """write the lines of a diff to `out`, colorized by kind

    Lines starting with '--- ' or '+++ ' use the 'separator' color, other
    lines starting with '-' the 'remove' color and other lines starting
    with '+' the 'add' color. Any other line (empty lines included) is
    written unchanged. Nothing is appended to the lines.

    :type lines: iterable of str or unicode
    :param lines: the lines of the diff

    :type out: file-like object
    :param out: object whose `write` method receives the lines

    :type style: dict
    :param style:
      mapping with the 'separator', 'remove' and 'add' keys, giving the
      color identifier to use for each kind of line
    """
    for line in lines:
        if line[:4] in ('--- ', '+++ '):
            out.write(colorize_ansi(line, style['separator']))
        # slices instead of line[0]: an empty line must not raise
        elif line[:1] == '-':
            out.write(colorize_ansi(line, style['remove']))
        elif line[:1] == '+':
            out.write(colorize_ansi(line, style['add']))
        else:
            out.write(line)
476