Source code for pymarc.marc8

# This file is part of pymarc. It is subject to the license terms in the
# LICENSE file found in the top-level directory of this distribution and at
# https://opensource.org/licenses/BSD-2-Clause. pymarc may be copied, modified,
# propagated, or distributed according to the terms contained in the LICENSE
# file.

"""Handle MARC-8 files.

see http://www.loc.gov/marc/specifications/speccharmarc8.html
"""

import sys
import unicodedata

from pymarc import marc8_mapping


[docs] def marc8_to_unicode(marc8, hide_utf8_warnings: bool = False) -> str: """Pass in a string, and get back a Unicode object. .. code-block:: python print marc8_to_unicode(record.title()) """ # XXX: might be good to stash away a converter somehow # instead of always re-creating it converter = MARC8ToUnicode(quiet=hide_utf8_warnings) try: return converter.translate(marc8) except IndexError: # convert IndexError into UnicodeDecodeErrors raise UnicodeDecodeError( "marc8_to_unicode", marc8, 0, len(marc8), "invalid multibyte character encoding", ) except TypeError: # convert TypeError into UnicodeDecodeErrors raise UnicodeDecodeError( "marc8_to_unicode", marc8, 0, len(marc8), "invalid multibyte character encoding", )
[docs] class MARC8ToUnicode: """Converts MARC-8 to Unicode. Note that currently, unicode strings aren't normalized, and some codecs (e.g. iso8859-1) will fail on such strings. When I can require python 2.3, this will go away. Warning: MARC-8 EACC (East Asian characters) makes some distinctions which aren't captured in Unicode. The LC tables give the option of mapping such characters either to a Unicode private use area, or a substitute character which (usually) gives the sense. I've picked the second, so this means that the MARC data should be treated as primary and the Unicode data used for display purposes only. (If you know of either of fonts designed for use with LC's private-use Unicode assignments, or of attempts to standardize Unicode characters to allow round-trips from EACC, or if you need the private-use Unicode character translations, please inform me, asl2@pobox.com. """ basic_latin = 0x42 ansel = 0x45 def __init__( self, G0: int = basic_latin, G1: int = ansel, quiet: bool = False ) -> None: """Init.""" self.g0 = G0 self.g0_set = {b"(", b",", b"$"} self.g1 = G1 self.g1_set = {b")", b"-", b"$"} self.quiet = quiet
[docs] def translate(self, marc8_string): """Translate.""" # don't choke on empty marc8_string if not marc8_string: return "" uni_list = [] combinings = [] pos = 0 while pos < len(marc8_string): # http://www.loc.gov/marc/specifications/speccharmarc8.html if marc8_string[pos : pos + 1] == b"\x1b": next_byte = marc8_string[pos + 1 : pos + 2] if next_byte in self.g0_set: if len(marc8_string) >= pos + 3: if ( marc8_string[pos + 2 : pos + 3] == b"," and next_byte == b"$" ): pos += 1 self.g0 = ord(marc8_string[pos + 2 : pos + 3]) pos = pos + 3 continue else: # if there aren't enough remaining characters, readd # the escape character so it doesn't get lost; may # help users diagnose problem records uni_list.append(marc8_string[pos : pos + 1].decode("ascii")) pos += 1 continue elif next_byte in self.g1_set: if marc8_string[pos + 2 : pos + 3] == b"-" and next_byte == b"$": pos += 1 self.g1 = ord(marc8_string[pos + 2 : pos + 3]) pos = pos + 3 continue else: charset = ord(next_byte) if charset in marc8_mapping.CODESETS: self.g0 = charset pos += 2 elif charset == 0x73: self.g0 = self.basic_latin pos += 2 if pos == len(marc8_string): break def is_multibyte(charset): return charset == 0x31 mb_flag = is_multibyte(self.g0) if mb_flag: # conditional check if string longer than pos+3 because of malformed marc8 string if len(marc8_string) < pos + 3: sys.stderr.write( f"Multi-byte position {pos+3} exceeds length of marc8 string {len(marc8_string)}\n" ) code_point = 32 # Sets last character as a blank string else: code_point = ( ord(marc8_string[pos : pos + 1]) * 65536 + ord(marc8_string[pos + 1 : pos + 2]) * 256 + ord(marc8_string[pos + 2 : pos + 3]) ) pos += 3 else: code_point = ord(marc8_string[pos : pos + 1]) pos += 1 if code_point < 0x20 or 0x80 < code_point < 0xA0: uni = chr(code_point) continue try: if code_point > 0x80 and not mb_flag: (uni, cflag) = marc8_mapping.CODESETS[self.g1][code_point] else: (uni, cflag) = marc8_mapping.CODESETS[self.g0][code_point] except KeyError: try: uni = marc8_mapping.ODD_MAP[code_point] uni_list.append(chr(uni)) # we can short circuit because we know these mappings # won't be involved in combinings. (i hope?) continue except KeyError: pass if not self.quiet: sys.stderr.write( f"Unable to parse character 0x{code_point:x} in g0={self.g0} g1={self.g1}\n" ) uni = ord(" ") cflag = False if cflag: combinings.append(chr(uni)) else: uni_list.append(chr(uni)) if len(combinings) > 0: uni_list.extend(combinings) combinings = [] # what to do if combining chars left over? uni_str = "".join(uni_list) return unicodedata.normalize("NFC", uni_str)