stract_cjk.py 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163
  1. #encoding: utf-8
  2. from perfect_hash import perfect_hash
  3. import re, string, os, random
  # Directory holding this script; font sources, templates and generated
  # files are all addressed relative to it.
  cur_dir = os.path.abspath(os.path.dirname(__file__))

  # Character class matching a single CJK character or CJK punctuation mark.
  # The pattern is written as adjacent literals so each range can be labelled;
  # the concatenated string is one character class.
  unicode_chinese_re = (
      u'['
      u'\u2E80-\u2EFF'  # CJK Radicals Supplement
      u'\u2F00-\u2FDF'  # Kangxi Radicals
      u'\u3000-\u303F'  # CJK Symbols and Punctuation
      u'\u31C0-\u31EF'  # CJK Strokes
      u'\u3200-\u32FF'  # Enclosed CJK Letters and Months
      u'\u3300-\u33FF'  # CJK Compatibility
      u'\u3400-\u4DBF'  # CJK Unified Ideographs Extension A
      u'\u4DC0-\u4DFF'  # Yijing Hexagram Symbols
      u'\u4E00-\u9FBF'  # CJK Unified Ideographs (up to U+9FBF)
      u'\uF900-\uFAFF'  # CJK Compatibility Ideographs
      u'\uFE30-\uFE4F'  # CJK Compatibility Forms
      u'\uFF00-\uFFEF'  # Halfwidth and Fullwidth Forms
      u']'
  )
  match_re = re.compile(unicode_chinese_re)
  7. def _get_font_lib(f):
  8. reading_data = False
  9. data = []
  10. for i in f.readlines():
  11. if i.strip() == 'FONT_BMP_DATA_BEGIN':
  12. reading_data = True
  13. continue
  14. if i.strip() == 'FONT_BMP_DATA_END':
  15. break
  16. if reading_data:
  17. line = [k for k in i.strip().split(',') if k]
  18. data.extend([int(k, 16) for k in line])
  19. return data
  class font_lib(object):
      """Usage counter and glyph extractor for a double-byte bitmap font.

      Characters are registered with push_char() / push_file(). finish() then
      returns them ordered by descending use count, together with the font
      table entries of exactly those characters in the same order.
      """

      def __init__(self, f, width, height, encoding):
          """Load the font table from ``f`` and start with no characters.

          :param f: font source handed to ``_get_font_lib``.
          :param width: glyph width, in bits per row.
          :param height: number of rows per glyph.
          :param encoding: codec name used to decode pushed text and to encode
              the characters found in it.
          """
          self.width = width
          self.height = height
          self._lib = _get_font_lib(f)
          # bytes per character: each row is padded up to a whole byte
          self._bpc = (width + 7) // 8 * height
          self.encoding = encoding
          self._finished_push = False
          self.char_dict = {}

      def get_char_data(self, char):
          """Return the font table entries of one encoded character.

          :param char: the character as a two-byte string in the font's
              encoding.
          :returns: the slice of the font table belonging to ``char``.
          :raises ValueError: if ``char`` is not exactly two bytes long.
          """
          # Index arithmetic copied from font_hz_bmp.c: both bytes are offset
          # by 0xA0 and each section holds 94 characters.
          # bytearray() yields integers on Python 2 and Python 3 alike.
          sec, idx = [byte - 0xA0 for byte in bytearray(char)]
          start = (94 * (sec - 1) + (idx - 1)) * self._bpc
          return self._lib[start:start + self._bpc]

      def push_char(self, c):
          """Count one more occurrence of the encoded character ``c``."""
          self.char_dict[c] = self.char_dict.get(c, 0) + 1

      def push_file(self, f):
          """Count every character of ``f`` matched by ``match_re``.

          :param f: iterable of byte strings in ``self.encoding``, e.g. a file
              opened in binary mode.
          :raises UnicodeDecodeError: re-raised after the offending input has
              been reported, so that the build terminates.
          """
          try:
              for line in f:
                  for c in match_re.findall(line.decode(self.encoding)):
                      self.push_char(c.encode(self.encoding))
          except UnicodeDecodeError:
              # hasattr() rather than a nested try/bare-except: nothing else
              # (e.g. KeyboardInterrupt) is swallowed while reporting.
              if hasattr(f, 'name'):
                  print('error in decoding %s' % f.name)
              else:
                  print('error in decoding string %s' % f)
              # re-raise the exception and terminate the building process
              raise

      def _finish_push(self):
          """Freeze the counted characters into a list sorted by use count."""
          if self._finished_push:
              return
          # NOTE(review): once this has run, later push_char() calls still
          # update char_dict but are not reflected in _char_li -- confirm that
          # pushing after finish() is never intended.
          # sorted() over items() also works where zip() returns an iterator;
          # the sort is stable, so equal counts keep the dict's order.
          self._char_li = sorted(self.char_dict.items(),
                                 key=lambda item: item[1], reverse=True)
          self._finished_push = True

      def get_hash_map(self):
          """Return ``[(char, rank), ...]``, rank 0 being the most used char."""
          self._finish_push()
          return [(char, rank)
                  for rank, (char, _count) in enumerate(self._char_li)]

      def get_new_font_lib(self):
          """Return the concatenated table entries of all pushed characters.

          The characters appear in the same order as in get_hash_map().
          """
          self._finish_push()
          data = []
          for char, _count in self._char_li:
              data.extend(self.get_char_data(char))
          return data

      def finish(self):
          """Return ``(get_hash_map(), get_new_font_lib())``."""
          return self.get_hash_map(), self.get_new_font_lib()
  class mph_options(object):
      """Plain attribute holder used as the options object for code generation.

      It carries the four settings ``verbose``, ``delimiter``, ``indent`` and
      ``width``; an instance is handed to ``perfect_hash.generate_code``.
      """

      def __init__(self, verbose=4, delimiter=', ', indent=4, width=80):
          (self.verbose,
           self.delimiter,
           self.indent,
           self.width) = verbose, delimiter, indent, width
  def gen_char_mph(font_lib):
      """Generate the source code of a compact font with perfect-hash lookup.

      The template ``../common/font_mph-tmpl.c`` (relative to this script) is
      filled in by ``perfect_hash.generate_code`` using the character map and
      the reduced font table returned by ``font_lib.finish()``.

      :param font_lib: object providing ``finish()``, ``width`` and ``height``.
      :returns: the value returned by ``perfect_hash.generate_code``.
      """
      template_path = os.path.join(cur_dir, '..', 'common', 'font_mph-tmpl.c')
      # Read inside a with-block so the handle is closed deterministically
      # instead of being left to the garbage collector.
      with open(template_path, 'r') as template_file:
          template = template_file.read()
      hmap, flib = font_lib.finish()
      extra_subs = {
          'width': str(font_lib.width),
          'height': str(font_lib.height),
          'font_data': ', '.join(hex(value) for value in flib),
      }
      return perfect_hash.generate_code(hmap, template, perfect_hash.Hash2,
                                        mph_options(), extra_subs=extra_subs)
  95. # {name:[file_name, height, width, encoding, instance]}
  96. _font_map = {'hz16':{'fname':'common/hz16font.c',
  97. 'height':16,
  98. 'width':16,
  99. 'encoding':'GB2312',
  100. 'flib':None},
  101. 'hz12':{'fname':'common/hz12font.c',
  102. 'height':12,
  103. 'width':12,
  104. 'encoding':'GB2312',
  105. 'flib':None}
  106. }
  def get_font_lib(name):
      """Return the shared font_lib instance for ``name``, creating it on demand.

      The instance is stored in ``_font_map`` so that every later call with
      the same name returns the same object.

      :param name: key of ``_font_map``.
      :returns: the cached ``font_lib`` instance, or None if ``name`` is not
          a known font.
      """
      entry = _font_map.get(name)
      if entry is None:
          return None
      if entry['flib'] is None:
          path = os.path.join(cur_dir, '..', entry['fname'])
          # The constructor parses the font table right away, so the source
          # file can be closed as soon as the instance exists.
          with open(path, 'r') as font_file:
              # Keyword arguments keep width and height from being swapped:
              # the constructor takes width first, height second.
              entry['flib'] = font_lib(font_file,
                                       width=entry['width'],
                                       height=entry['height'],
                                       encoding=entry['encoding'])
      return entry['flib']
  def gen_cmp_font_file():
      """Write ``common/font_cmp_<name>.c`` for every font already created.

      Fonts whose ``'flib'`` slot in ``_font_map`` is still None are skipped.
      """
      for font_name, entry in _font_map.items():
          instance = entry['flib']
          if instance is None:
              continue
          generated = gen_char_mph(instance)
          target = os.path.join(cur_dir, '..', 'common',
                                'font_cmp_%s.c' % font_name)
          with open(target, 'w') as out:
              out.write(generated)
  if __name__ == '__main__':
      # Manual self-test: count the characters of the text file named on the
      # command line and print the resulting map and generated code.
      import sys
      if len(sys.argv) < 2:
          sys.exit('usage: %s TEXT_FILE' % sys.argv[0])
      lib = get_font_lib('hz16')
      libn = get_font_lib('hz16')
      # the same instance must be handed out on every request
      assert(lib is libn)
      # close the input deterministically once it has been scanned
      with open(sys.argv[1], 'rb') as text_file:
          lib.push_file(text_file)
      hmap, flib = lib.finish()
      for char, index in hmap:
          # single-argument print() behaves the same as a statement or a call
          print('%s %s' % (char, index))
      # every character contributes lib._bpc table entries (32 for hz16)
      assert(len(flib) == lib._bpc * len(hmap))
      print(gen_char_mph(lib))