#!/usr/bin/env python
"""latex.py

Character translation utilities for LaTeX-formatted text.

Usage:
 - unicode(string,'latex')
 - ustring.decode('latex')
are both available just by letting "import latex" find this file.

 - unicode(string,'latex+latin1')
 - ustring.decode('latex+latin1')
where latin1 can be replaced by any other known encoding, also
become available by calling latex.register().

We also make public a dictionary latex_equivalents,
mapping ord(unicode char) to LaTeX code.

D. Eppstein, October 2003.
source: http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/252124
License: Python license (http://python.org/doc/Copyright.html)

modified for mab2bib 2005/2006 by Henning Hraban Ramm
"""

#from __future__ import generators
import codecs
import re
from sets import Set

def register():
    """Enable encodings of the form 'latex+x' where x describes another encoding.

    Unicode characters are translated to or from x when possible, otherwise
    expanded to latex.
    """
    codecs.register(_registry)

def getregentry():
    """Encodings module API."""
    return _registry('latex')

def _registry(encoding):
    if encoding == 'latex':
        encoding = None
    elif encoding.startswith('latex+'):
        encoding = encoding[6:]
    else:
        return None

    class Codec(codecs.Codec):
        def encode(self,input,errors='strict'):
            """Convert unicode string to latex."""
            output = []
            for c in input:
                if encoding:
                    try:
                        output.append(c.encode(encoding))
                        continue
                    except:
                        pass    # fall back to a LaTeX equivalent below
                if ord(c) in latex_equivalents:
                    output.append(latex_equivalents[ord(c)])
                else:
                    output += ['{\\char', str(ord(c)), '}']
            return ''.join(output), len(input)

        def decode(self,input,errors='strict'):
            """Convert latex source string to unicode."""
            if encoding:
                input = unicode(input,encoding,errors)

            # Note: we may get buffer objects here.
            # It is not permissible to call join on buffer objects
            # but we can make them joinable by calling unicode.
            # This should always be safe since we are supposed
            # to be producing unicode output anyway.
            x = map(unicode,_unlatex(input))
            return u''.join(x), len(input)

    class StreamWriter(Codec,codecs.StreamWriter):
        pass

    class StreamReader(Codec,codecs.StreamReader):
        pass

    return (Codec().encode,Codec().decode,StreamReader,StreamWriter)

def _tokenize(tex):
    """Convert latex source into sequence of single-token substrings."""
    start = 0
    try:
        # skip quickly across boring stuff
        pos = _stoppers.finditer(tex).next().span()[0]
    except StopIteration:
        yield tex
        return

    while 1:
        if pos > start:
            yield tex[start:pos]
            if tex[start] == '\\' and not (tex[pos-1].isdigit() and tex[start+1].isalpha()):
                # skip blanks after csname
                while pos < len(tex) and tex[pos].isspace():
                    pos += 1
                # flush control characters
                while pos < len(tex) and tex[pos] in _ignore:
                    pos += 1
        if pos >= len(tex):
            return
        start = pos
        if tex[pos:pos+2] in {'$$':None, '/~':None}:    # protect ~ in urls
            pos += 2
        elif tex[pos].isdigit():
            while pos < len(tex) and tex[pos].isdigit():
                pos += 1
        elif tex[pos] == '-':
            while pos < len(tex) and tex[pos] == '-':
                pos += 1
        elif tex[pos] != '\\' or pos == len(tex) - 1:
            pos += 1
        elif not tex[pos+1].isalpha():
            pos += 2
        else:
            pos += 1
            while pos < len(tex) and tex[pos].isalpha():
                pos += 1
            if tex[start:pos] == '\\char' or tex[start:pos] == '\\accent':
                while pos < len(tex) and tex[pos].isdigit():
                    pos += 1
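# Illustrative sketch (not part of the original recipe): _tokenize yields csnames,
# braces and single characters as separate tokens, which is what _unlatex below
# matches against the inverse table _l2u.  Assuming _stoppers and _ignore are
# defined as they are further down in this module, a doctest-style example:
#
#   >>> list(_tokenize("\\'{e}cole"))
#   ["\\'", '{', 'e', '}', 'c', 'o', 'l', 'e']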
class _unlatex:
    """Convert tokenized tex into sequence of unicode strings.  Helper for decode()."""

    def __iter__(self):
        """Turn self into an iterator.  It already is one, nothing to do."""
        return self

    def __init__(self,tex):
        """Create a new token converter from a string."""
        self.tex = tuple(_tokenize(tex))  # turn tokens into indexable list
        self.pos = 0                      # index of first unprocessed token
        self.lastoutput = 'x'             # lastoutput must always be nonempty string

    def __getitem__(self,n):
        """Return token at offset n from current pos."""
        p = self.pos + n
        t = self.tex
        return p < len(t) and t[p] or None

    def next(self):
        """Find and return another piece of converted output."""
        if self.pos >= len(self.tex):
            raise StopIteration
        nextoutput = self.chunk()
        if self.lastoutput[0] == '\\' and self.lastoutput[-1].isalpha() and nextoutput[0].isalpha():
            nextoutput = ' ' + nextoutput   # add extra space to terminate csname
        self.lastoutput = nextoutput
        return nextoutput

    def chunk(self):
        """Grab another set of input tokens and convert them to an output string."""
        for delta,c in self.candidates(0):
            if c in _l2u:
                self.pos += delta
                return unichr(_l2u[c])
            elif len(c) == 2 and c[1] == 'i' and (c[0],'\\i') in _l2u:
                self.pos += delta       # correct failure to undot i
                return unichr(_l2u[(c[0],'\\i')])
            elif len(c) == 1 and c[0].startswith('\\char') and c[0][5:].isdigit():
                self.pos += delta
                return unichr(int(c[0][5:]))

        # nothing matches, just pass through token as-is
        self.pos += 1
        return self[-1]

    def candidates(self,offset):
        """Generate pairs delta,c where c is a token or tuple of tokens from tex
        (after deleting extraneous brackets starting at pos) and delta
        is the length of the tokens prior to bracket deletion.
        """
        t = self[offset]
        if t in _blacklist:
            return
        elif t == '{':
            for delta,c in self.candidates(offset+1):
                if self[offset+delta+1] == '}':
                    yield delta+2,c
        elif t == '\\mbox':
            for delta,c in self.candidates(offset+1):
                yield delta+1,c
        elif t == '$' and self[offset+2] == '$':
            yield 3, (t,self[offset+1],t)
        else:
            q = self[offset+1]
            if q == '{' and self[offset+3] == '}':
                yield 4, (t,self[offset+2])
            elif q:
                yield 2, (t,q)
            yield 1, t
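# Illustrative sketch (not part of the original recipe): decoding walks the token
# stream, asking candidates() for progressively shorter token groups and taking
# the first one present in the inverse table _l2u, which is built at the bottom
# of this module.  Assuming latex.register() has been called:
#
#   >>> 'Erd\\H{o}s'.decode('latex')
#   u'Erd\u0151s'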
latex_equivalents = {
    0x0009: ' ',    0x000a: '\n',   0x0023: '\\#',  0x0026: '\\&',
    0x00a0: '~',    0x00a1: '!`',   0x00a2: '\\not{c}', 0x00a3: '\\pounds',
    0x00a7: '\\S',  0x00a8: '\\"{}', 0x00a9: '\\copyright',
    0x00af: '\\={}', 0x00ac: '\\neg', 0x00ad: '\\-',
    0x00b0: '\\mbox{$^\\circ$}', 0x00b1: '\\mbox{$\\pm$}',
    0x00b2: '\\mbox{$^2$}', 0x00b3: '\\mbox{$^3$}', 0x00b4: "\\'{}",
    0x00b5: '\\mbox{$\\mu$}', 0x00b6: '\\P', 0x00b7: '\\mbox{$\\cdot$}',
    0x00b8: '\\c{}', 0x00b9: '\\mbox{$^1$}', 0x00bf: '?`',
    0x00c0: '\\`A', 0x00c1: "\\'A", 0x00c2: '\\^A', 0x00c3: '\\~A',
    0x00c4: '\\"A', 0x00c5: '\\AA', 0x00c6: '\\AE', 0x00c7: '\\c{C}',
    0x00c8: '\\`E', 0x00c9: "\\'E", 0x00ca: '\\^E', 0x00cb: '\\"E',
    0x00cc: '\\`I', 0x00cd: "\\'I", 0x00ce: '\\^I', 0x00cf: '\\"I',
    0x00d1: '\\~N', 0x00d2: '\\`O', 0x00d3: "\\'O", 0x00d4: '\\^O',
    0x00d5: '\\~O', 0x00d6: '\\"O', 0x00d7: '\\mbox{$\\times$}', 0x00d8: '\\O',
    0x00d9: '\\`U', 0x00da: "\\'U", 0x00db: '\\^U', 0x00dc: '\\"U',
    0x00dd: "\\'Y", 0x00df: '\\ss',
    0x00e0: '\\`a', 0x00e1: "\\'a", 0x00e2: '\\^a', 0x00e3: '\\~a',
    0x00e4: '\\"a', 0x00e5: '\\aa', 0x00e6: '\\ae', 0x00e7: '\\c{c}',
    0x00e8: '\\`e', 0x00e9: "\\'e", 0x00ea: '\\^e', 0x00eb: '\\"e',
    0x00ec: '\\`\\i', 0x00ed: "\\'\\i", 0x00ee: '\\^\\i', 0x00ef: '\\"\\i',
    0x00f1: '\\~n', 0x00f2: '\\`o', 0x00f3: "\\'o", 0x00f4: '\\^o',
    0x00f5: '\\~o', 0x00f6: '\\"o', 0x00f7: '\\mbox{$\\div$}', 0x00f8: '\\o',
    0x00f9: '\\`u', 0x00fa: "\\'u", 0x00fb: '\\^u', 0x00fc: '\\"u',
    0x00fd: "\\'y", 0x00ff: '\\"y',
    0x0100: '\\=A', 0x0101: '\\=a', 0x0102: '\\u{A}', 0x0103: '\\u{a}',
    0x0104: '\\c{A}', 0x0105: '\\c{a}', 0x0106: "\\'C", 0x0107: "\\'c",
    0x0108: '\\^C', 0x0109: '\\^c', 0x010a: '\\.C', 0x010b: '\\.c',
    0x010c: '\\v{C}', 0x010d: '\\v{c}', 0x010e: '\\v{D}', 0x010f: '\\v{d}',
    0x0112: '\\=E', 0x0113: '\\=e', 0x0114: '\\u{E}', 0x0115: '\\u{e}',
    0x0116: '\\.E', 0x0117: '\\.e', 0x0118: '\\c{E}', 0x0119: '\\c{e}',
    0x011a: '\\v{E}', 0x011b: '\\v{e}', 0x011c: '\\^G', 0x011d: '\\^g',
    0x011e: '\\u{G}', 0x011f: '\\u{g}', 0x0120: '\\.G', 0x0121: '\\.g',
    0x0122: '\\c{G}', 0x0123: '\\c{g}', 0x0124: '\\^H', 0x0125: '\\^h',
    0x0128: '\\~I', 0x0129: '\\~\\i', 0x012a: '\\=I', 0x012b: '\\=\\i',
    0x012c: '\\u{I}', 0x012d: '\\u\\i', 0x012e: '\\c{I}', 0x012f: '\\c{i}',
    0x0130: '\\.I', 0x0131: '\\i', 0x0132: 'IJ', 0x0133: 'ij',
    0x0134: '\\^J', 0x0135: '\\^\\j', 0x0136: '\\c{K}', 0x0137: '\\c{k}',
    0x0139: "\\'L", 0x013a: "\\'l", 0x013b: '\\c{L}', 0x013c: '\\c{l}',
    0x013d: '\\v{L}', 0x013e: '\\v{l}', 0x0141: '\\L', 0x0142: '\\l',
    0x0143: "\\'N", 0x0144: "\\'n", 0x0145: '\\c{N}', 0x0146: '\\c{n}',
    0x0147: '\\v{N}', 0x0148: '\\v{n}',
    0x014c: '\\=O', 0x014d: '\\=o', 0x014e: '\\u{O}', 0x014f: '\\u{o}',
    0x0150: '\\H{O}', 0x0151: '\\H{o}', 0x0152: '\\OE', 0x0153: '\\oe',
    0x0154: "\\'R", 0x0155: "\\'r", 0x0156: '\\c{R}', 0x0157: '\\c{r}',
    0x0158: '\\v{R}', 0x0159: '\\v{r}', 0x015a: "\\'S", 0x015b: "\\'s",
    0x015c: '\\^S', 0x015d: '\\^s', 0x015e: '\\c{S}', 0x015f: '\\c{s}',
    0x0160: '\\v{S}', 0x0161: '\\v{s}', 0x0162: '\\c{T}', 0x0163: '\\c{t}',
    0x0164: '\\v{T}', 0x0165: '\\v{t}', 0x0168: '\\~U', 0x0169: '\\~u',
    0x016a: '\\=U', 0x016b: '\\=u', 0x016c: '\\u{U}', 0x016d: '\\u{u}',
    0x016e: '\\r{U}', 0x016f: '\\r{u}', 0x0170: '\\H{U}', 0x0171: '\\H{u}',
    0x0172: '\\c{U}', 0x0173: '\\c{u}', 0x0174: '\\^W', 0x0175: '\\^w',
    0x0176: '\\^Y', 0x0177: '\\^y', 0x0178: '\\"Y',
    0x0179: "\\'Z", 0x017a: "\\'z", 0x017b: '\\.Z', 0x017c: '\\.z',
    0x017d: '\\v{Z}', 0x017e: '\\v{z}',
    0x01c4: 'D\\v{Z}', 0x01c5: 'D\\v{z}', 0x01c6: 'd\\v{z}',
    0x01c7: 'LJ', 0x01c8: 'Lj', 0x01c9: 'lj',
    0x01ca: 'NJ', 0x01cb: 'Nj', 0x01cc: 'nj',
    0x01cd: '\\v{A}', 0x01ce: '\\v{a}', 0x01cf: '\\v{I}', 0x01d0: '\\v\\i',
    0x01d1: '\\v{O}', 0x01d2: '\\v{o}', 0x01d3: '\\v{U}', 0x01d4: '\\v{u}',
    0x01e6: '\\v{G}', 0x01e7: '\\v{g}', 0x01e8: '\\v{K}', 0x01e9: '\\v{k}',
    0x01ea: '\\c{O}', 0x01eb: '\\c{o}', 0x01f0: '\\v\\j',
    0x01f1: 'DZ', 0x01f2: 'Dz', 0x01f3: 'dz',
    0x01f4: "\\'G", 0x01f5: "\\'g",
    0x01fc: "\\'\\AE", 0x01fd: "\\'\\ae", 0x01fe: "\\'\\O", 0x01ff: "\\'\\o",
    0x02c6: '\\^{}', 0x02dc: '\\~{}', 0x02d8: '\\u{}', 0x02d9: '\\.{}',
    0x02da: '\\r{}', 0x02dd: '\\H{}', 0x02db: '\\c{}', 0x02c7: '\\v{}',
    0x03c0: '\\mbox{$\\pi$}',   # consider adding more Greek here
    0xfb01: 'fi', 0xfb02: 'fl',
    0x2013: '--', 0x2014: '---',
    0x2018: "`", 0x2019: "'", 0x201c: "``", 0x201d: "''",
    0x2020: '\\dag', 0x2021: '\\ddag',
    0x2122: '\\mbox{$^\\mbox{TM}$}',
    0x2022: '\\mbox{$\\bullet$}', 0x2026: '\\ldots',
    0x2202: '\\mbox{$\\partial$}', 0x220f: '\\mbox{$\\prod$}',
    0x2211: '\\mbox{$\\sum$}', 0x221a: '\\mbox{$\\surd$}',
    0x221e: '\\mbox{$\\infty$}', 0x222b: '\\mbox{$\\int$}',
    0x2248: '\\mbox{$\\approx$}', 0x2260: '\\mbox{$\\neq$}',
    0x2264: '\\mbox{$\\leq$}', 0x2265: '\\mbox{$\\geq$}',
}

for _i in range(0x0020):
    if _i not in latex_equivalents:
        latex_equivalents[_i] = ''
for _i in range(0x0020,0x007f):
    if _i not in latex_equivalents:
        latex_equivalents[_i] = chr(_i)

# Characters that should be ignored and not output in tokenization
_ignore = Set([chr(i) for i in range(32)+[127]]) - Set('\t\n\r')

# Regexp of chars not in blacklist, for quick start of tokenize
_stoppers = re.compile('[\x00-\x1f!$\\-?\\{~\\\\`\']')

_blacklist = Set(' \n\r')
_blacklist.add(None)    # shortcut candidate generation at end of data
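# Illustrative sketch (not part of the original recipe): the encoder consults
# latex_equivalents directly, and anything missing from the table is emitted as
# a {\char N} group.  Assuming latex.register() has been called:
#
#   >>> u'\xa9 2003 M\xfcller'.encode('latex')
#   '\\copyright 2003 M\\"uller'
#   >>> unichr(0x263a).encode('latex')   # no table entry for this character
#   '{\\char9786}'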
# Construction of inverse translation table
_l2u = {
    '\\ ': ord(' ')   # unexpanding space makes no sense in non-TeX contexts
}

for _tex in latex_equivalents:
    if _tex <= 0x0020 or (_tex <= 0x007f and len(latex_equivalents[_tex]) <= 1):
        continue    # boring entry
    _toks = tuple(_tokenize(latex_equivalents[_tex]))
    if _toks[0] == '{' and _toks[-1] == '}':
        _toks = _toks[1:-1]
    if _toks[0].isalpha():
        continue    # don't turn ligatures into single chars
    if len(_toks) == 1 and (_toks[0] == "'" or _toks[0] == "`"):
        continue    # don't turn ascii quotes into curly quotes
    if _toks[0] == '\\mbox' and _toks[1] == '{' and _toks[-1] == '}':
        _toks = _toks[2:-1]
    if len(_toks) == 4 and _toks[1] == '{' and _toks[3] == '}':
        _toks = (_toks[0],_toks[2])
    if len(_toks) == 1:
        _toks = _toks[0]
    _l2u[_toks] = _tex

# Shortcut candidate generation for certain useless candidates:
# a character is in _blacklist if it cannot be at the start
# of any translation in _l2u.  We use this to quickly skip through
# such characters before getting to the more difficult-to-translate parts.
# _blacklist is defined several lines up from here because it must
# be defined in order to call _tokenize; however, it is safe to
# delay filling it out until now.

for i in range(0x0020,0x007f):
    _blacklist.add(chr(i))
_blacklist.remove('{')
_blacklist.remove('$')
for candidate in _l2u:
    if isinstance(candidate,tuple):
        if not candidate or not candidate[0]:
            continue
        firstchar = candidate[0][0]
    else:
        firstchar = candidate[0]
    _blacklist.discard(firstchar)
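# Minimal self-test sketch (added for illustration; not part of the original
# recipe).  It only exercises the public entry points documented in the module
# docstring: register(), unicode.encode('latex+latin1') and str.decode('latex+latin1').
if __name__ == '__main__':
    register()
    sample = u'Erd\u0151s, sz\xfcl. 1913'
    # latin1 bytes where possible, LaTeX markup for everything else
    encoded = sample.encode('latex+latin1')
    print repr(encoded)
    print encoded.decode('latex+latin1') == sample   # round trip, expect: True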