# This file is part of pymarc. It is subject to the license terms in the
# LICENSE file found in the top-level directory of this distribution and at
# https://opensource.org/licenses/BSD-2-Clause. pymarc may be copied, modified,
# propagated, or distributed according to the terms contained in the LICENSE
# file.
"""Handle MARC-8 files.
see http://www.loc.gov/marc/specifications/speccharmarc8.html
"""
import sys
import unicodedata
from pymarc import marc8_mapping
def marc8_to_unicode(marc8, hide_utf8_warnings: bool = False) -> str:
    """Decode MARC-8 encoded data and return it as a Unicode string.

    .. code-block:: python

        print(marc8_to_unicode(record.title()))

    :param marc8: the MARC-8 encoded data to decode
    :param hide_utf8_warnings: forwarded to :class:`MARC8ToUnicode` as
        ``quiet``
    :returns: whatever :meth:`MARC8ToUnicode.translate` returns for ``marc8``
    :raises UnicodeDecodeError: when the translation fails with an
        ``IndexError`` or a ``TypeError``; the original exception is kept
        as ``__cause__``
    """
    # XXX: might be good to stash away a converter somehow
    # instead of always re-creating it
    converter = MARC8ToUnicode(quiet=hide_utf8_warnings)
    try:
        return converter.translate(marc8)
    except (IndexError, TypeError) as err:
        # Both failure modes are reported to callers as a decoding error
        # covering the whole input.
        raise UnicodeDecodeError(
            "marc8_to_unicode",
            marc8,
            0,
            len(marc8),
            "invalid multibyte character encoding",
        ) from err
class MARC8ToUnicode:
    """Converts MARC-8 to Unicode.

    The string returned by :meth:`translate` is normalized to NFC.

    A converter is stateful: escape sequences met while translating update
    ``g0`` / ``g1`` on the instance, and those designations remain in effect
    for later :meth:`translate` calls on the same instance.

    Warning: MARC-8 EACC (East Asian characters) makes some
    distinctions which aren't captured in Unicode. The LC tables give
    the option of mapping such characters either to a Unicode private
    use area, or a substitute character which (usually) gives the
    sense. I've picked the second, so this means that the MARC data
    should be treated as primary and the Unicode data used for display
    purposes only. (If you know either of fonts designed for use
    with LC's private-use Unicode assignments, or of attempts to
    standardize Unicode characters to allow round-trips from EACC,
    or if you need the private-use Unicode character translations,
    please inform me, asl2@pobox.com.)
    """

    basic_latin = 0x42
    ansel = 0x45

    def __init__(
        self, G0: int = basic_latin, G1: int = ansel, quiet: bool = False
    ) -> None:
        """Set the initial character set designations.

        :param G0: code of the character set initially used as G0 (a key of
            ``marc8_mapping.CODESETS``)
        :param G1: code of the character set initially used as G1 (a key of
            ``marc8_mapping.CODESETS``)
        :param quiet: when true, diagnostics are not written to stderr
        """
        self.g0 = G0
        # bytes that, right after ESC, introduce a G0 designation
        self.g0_set = {b"(", b",", b"$"}
        self.g1 = G1
        # bytes that, right after ESC, introduce a G1 designation
        self.g1_set = {b")", b"-", b"$"}
        self.quiet = quiet

    @staticmethod
    def _is_multibyte(charset: int) -> bool:
        """Return True for the one set whose characters span three bytes."""
        return charset == 0x31

    def translate(self, marc8_string):
        """Translate MARC-8 encoded data into a NFC normalized string.

        Escape sequences switch the active G0/G1 sets (and are remembered on
        the instance). Code points below 0x20 and those strictly between
        0x80 and 0xA0 are dropped. Characters missing from both the active
        set and ``marc8_mapping.ODD_MAP`` become a space and, unless
        ``quiet`` is set, are reported on stderr.

        :param marc8_string: the MARC-8 encoded data
        :returns: the translated text, or ``""`` for empty input
        :raises TypeError: when an escape sequence is cut off before the
            byte that names the character set
        """
        # don't choke on empty marc8_string
        if not marc8_string:
            return ""
        uni_list = []
        combinings = []
        pos = 0
        length = len(marc8_string)
        while pos < length:
            # http://www.loc.gov/marc/specifications/speccharmarc8.html
            if marc8_string[pos : pos + 1] == b"\x1b":
                next_byte = marc8_string[pos + 1 : pos + 2]
                third_byte = marc8_string[pos + 2 : pos + 3]
                if next_byte == b"$" and third_byte in (b")", b"-"):
                    # ESC $ ) F / ESC $ - F designate a multi-byte set as G1.
                    # This has to be tested before the G0 branch: "$" is in
                    # g0_set too, and would otherwise store ")" or "-" as
                    # the G0 set.
                    self.g1 = ord(marc8_string[pos + 3 : pos + 4])
                    pos += 4
                    continue
                if next_byte in self.g0_set:
                    if length >= pos + 3:
                        if next_byte == b"$" and third_byte == b",":
                            # ESC $ , F: skip the extra intermediate byte
                            pos += 1
                        self.g0 = ord(marc8_string[pos + 2 : pos + 3])
                        pos += 3
                        continue
                    # if there aren't enough remaining characters, readd
                    # the escape character so it doesn't get lost; may
                    # help users diagnose problem records
                    uni_list.append(marc8_string[pos : pos + 1].decode("ascii"))
                    pos += 1
                    continue
                if next_byte in self.g1_set:
                    # ESC ) F / ESC - F ("$" never reaches this branch)
                    self.g1 = ord(third_byte)
                    pos += 3
                    continue
                charset = ord(next_byte)
                if charset in marc8_mapping.CODESETS:
                    # two-byte escape selecting a set directly as G0;
                    # restart the loop so that a following escape, or the
                    # end of the input, is handled normally
                    self.g0 = charset
                    pos += 2
                    continue
                if charset == 0x73:
                    # ESC s switches G0 back to basic latin
                    self.g0 = self.basic_latin
                    pos += 2
                    continue
                # Unrecognised escape: fall through; ESC is then decoded as
                # an ordinary byte (and dropped as a control character when
                # G0 is a single-byte set).

            mb_flag = self._is_multibyte(self.g0)
            if mb_flag:
                # conditional check if string longer than pos+3 because of
                # malformed marc8 string
                if length < pos + 3:
                    if not self.quiet:
                        sys.stderr.write(
                            f"Multi-byte position {pos + 3} exceeds length of marc8 string {len(marc8_string)}\n"
                        )
                    code_point = 32  # Sets last character as a blank string
                else:
                    code_point = (
                        ord(marc8_string[pos : pos + 1]) * 65536
                        + ord(marc8_string[pos + 1 : pos + 2]) * 256
                        + ord(marc8_string[pos + 2 : pos + 3])
                    )
                pos += 3
            else:
                code_point = ord(marc8_string[pos : pos + 1])
                pos += 1

            # control characters produce no output
            if code_point < 0x20 or 0x80 < code_point < 0xA0:
                continue

            try:
                # single-byte values above 0x80 come from G1, the rest
                # (including every multi-byte value) from G0
                if code_point > 0x80 and not mb_flag:
                    (uni, cflag) = marc8_mapping.CODESETS[self.g1][code_point]
                else:
                    (uni, cflag) = marc8_mapping.CODESETS[self.g0][code_point]
            except KeyError:
                try:
                    uni = marc8_mapping.ODD_MAP[code_point]
                    uni_list.append(chr(uni))
                    # we can short circuit because we know these mappings
                    # won't be involved in combinings. (i hope?)
                    continue
                except KeyError:
                    pass
                if not self.quiet:
                    sys.stderr.write(
                        f"Unable to parse character 0x{code_point:x} in g0={self.g0} g1={self.g1}\n"
                    )
                uni = ord(" ")
                cflag = False

            if cflag:
                # combining characters are held back and emitted after the
                # next non-combining character
                combinings.append(chr(uni))
            else:
                uni_list.append(chr(uni))
                if combinings:
                    uni_list.extend(combinings)
                    combinings = []

        # what to do if combining chars left over?
        # NOTE: they are currently not added to the output.
        uni_str = "".join(uni_list)
        return unicodedata.normalize("NFC", uni_str)