# Source code for pymarc.reader

# This file is part of pymarc. It is subject to the license terms in the
# LICENSE file found in the top-level directory of this distribution and at
# https://opensource.org/licenses/BSD-2-Clause. pymarc may be copied, modified,
# propagated, or distributed according to the terms contained in the LICENSE
# file.

"""Pymarc Reader."""
import os
import sys
import json

from io import IOBase, BytesIO, StringIO
from typing import Callable, BinaryIO, IO, Iterator, Union, List

from pymarc.constants import END_OF_RECORD
from pymarc import Field, Indicators, Leader, Record, Subfield
from pymarc import exceptions



[docs]
class Reader:
    """Common base for the readers in the pymarc package that are consumed by iteration."""

    def __iter__(self):
        """Return the reader itself as the object to iterate over."""
        return self




[docs]
class MARCReader(Reader):
    """An iterator class for reading a file of MARC21 records.

    Simple usage:

    .. code-block:: python

        from pymarc import MARCReader

        ## pass in a file object
        reader = MARCReader(open('file.dat', 'rb'))
        for record in reader:
            ...

        ## pass in marc in transmission format
        reader = MARCReader(rawmarc)
        for record in reader:
            ...

    If you would like to have your Record object contain unicode strings
    use the to_unicode parameter:

    .. code-block:: python

        reader = MARCReader(open('file.dat', 'rb'), to_unicode=True)

    This will decode from MARC-8 or utf-8 depending on the value in the
    MARC leader at position 9. Upon serialization of the Record object to MARC21,
    the resulting output will be utf-8 encoded and the value in the MARC leader
    at position 9 will be set appropriately to indicate the change of
    character encoding.

    If you find yourself in the unfortunate position of having data that
    is utf-8 encoded without the leader set appropriately you can use
    the force_utf8 parameter:

    .. code-block:: python

        reader = MARCReader(open('file.dat', 'rb'), to_unicode=True,
            force_utf8=True)

    If you find yourself in the unfortunate position of having data that is
    mostly utf-8 encoded but with a few non-utf-8 characters, you can also use
    the utf8_handling parameter, which takes the same values ('strict',
    'replace', and 'ignore') as the Python Unicode codecs (see
    http://docs.python.org/library/codecs.html for more info).

    It is not legal in MARC-21 to use anything but MARC-8 or UTF-8, but if you
    have a file in an incorrect encoding and you know what it is, you can
    try passing that encoding in the "file_encoding" parameter.

    MARCReader parses data in a permissive way and gives the user full control
    on what to do in case a wrong record is encountered. Whenever any error is
    found the reader returns ``None`` instead of a regular record object.
    The exception information and corresponding data are available through
    the reader.current_exception and reader.current_chunk properties:

    .. code-block:: python

        reader = MARCReader(open('file.dat', 'rb'))
        for record in reader:
            if record is None:
                print(
                    "Current chunk: ",
                    reader.current_chunk,
                    " was ignored because the following exception raised: ",
                    reader.current_exception
                )
            else:
                # do something with record
                ...
    """

    # State describing the most recent call to ``__next__``.
    _current_chunk = None
    _current_exception = None

    file_handle: IO

    @property
    def current_chunk(self):
        """Bytes read during the most recent ``__next__`` call (``None`` before any read)."""
        return self._current_chunk

    @property
    def current_exception(self):
        """Exception recorded by the most recent ``__next__`` call, or ``None``."""
        return self._current_exception

    def __init__(
        self,
        marc_target: Union[BinaryIO, bytes],
        to_unicode: bool = True,
        force_utf8: bool = False,
        hide_utf8_warnings: bool = False,
        utf8_handling: str = "strict",
        file_encoding: str = "iso8859-1",
        permissive: bool = False,
    ) -> None:
        """The constructor to which you can pass either raw marc or a file-like object.

        Basically the argument you pass in should be raw MARC in transmission format or
        an object that responds to read().

        ``to_unicode``, ``force_utf8``, ``hide_utf8_warnings``, ``utf8_handling``
        and ``file_encoding`` are stored and forwarded unchanged to ``Record``
        for every record read. ``permissive`` is only stored on the instance;
        nothing in this class consults it.
        """
        super().__init__()
        self.to_unicode = to_unicode
        self.force_utf8 = force_utf8
        self.hide_utf8_warnings = hide_utf8_warnings
        self.utf8_handling = utf8_handling
        self.file_encoding = file_encoding
        self.permissive = permissive
        if isinstance(marc_target, bytes):
            # Raw transmission-format bytes: wrap them so they can be read()
            # exactly like a file.
            self.file_handle = BytesIO(marc_target)
        else:
            self.file_handle = marc_target

    def close(self) -> None:
        """Close the handle."""
        self.file_handle.close()

    def __next__(self):
        """Read and parse the next record.

        Returns:
            A ``Record`` on success, or ``None`` when the data could not be
            turned into a record; in that case ``current_exception`` holds
            the reason and ``current_chunk`` the bytes that were read.

        Raises:
            StopIteration: when the handle is exhausted, or when the previous
                call recorded a ``FatalReaderError``.
        """
        # After a fatal error the position in the stream cannot be trusted,
        # so iteration ends instead of trying to read another record.
        if self._current_exception and isinstance(
            self._current_exception, exceptions.FatalReaderError
        ):
            raise StopIteration

        self._current_chunk = None
        self._current_exception = None

        # The first five bytes of a record hold its total length in ASCII digits.
        self._current_chunk = first5 = self.file_handle.read(5)
        if not first5:
            raise StopIteration

        if len(first5) < 5:
            self._current_exception = exceptions.TruncatedRecord()
            return None

        try:
            length = int(first5)
        except ValueError:
            self._current_exception = exceptions.RecordLengthInvalid()
            return None

        # A declared length smaller than the five bytes already consumed is
        # impossible. Without this guard ``read(length - 5)`` would receive a
        # negative size, which means "read until EOF" and would swallow the
        # rest of the stream into a single bogus chunk.
        if length < 5:
            self._current_exception = exceptions.RecordLengthInvalid()
            return None

        chunk = first5 + self.file_handle.read(length - 5)
        self._current_chunk = chunk

        if len(chunk) < length:
            self._current_exception = exceptions.TruncatedRecord()
            return None

        if chunk[-1] != ord(END_OF_RECORD):
            self._current_exception = exceptions.EndOfRecordNotFound()
            return None

        try:
            return Record(
                chunk,
                to_unicode=self.to_unicode,
                force_utf8=self.force_utf8,
                hide_utf8_warnings=self.hide_utf8_warnings,
                utf8_handling=self.utf8_handling,
                file_encoding=self.file_encoding,
            )
        except Exception as ex:
            # Deliberately broad: any parsing failure is reported through
            # ``current_exception`` rather than aborting the iteration.
            self._current_exception = ex
            return None




[docs]
def map_records(f: Callable, *files: BytesIO) -> None:
    """Applies a given function to each record in a batch.

    You can pass in multiple batches; they are processed in the order given.
    ``f`` is called once with every value ``MARCReader`` yields for a batch,
    and its return values are discarded.

    .. code-block:: python

        def print_title(r):
            print(r['245'])

        with open('marc.dat', 'rb') as fh:
            map_records(print_title, fh)
    """
    for file in files:
        # A plain loop: ``f`` is called for its side effects, so there is no
        # reason to collect every return value into a throwaway list.
        for record in MARCReader(file):
            f(record)




[docs]
class JSONReader(Reader):
    """Reader that builds ``Record`` objects from a JSON document.

    The whole document is parsed eagerly in the constructor. It may be a
    single JSON object (one record) or any other iterable JSON value, typically
    an array of objects (one record per element). Each record object is
    expected to have a ``"leader"`` entry and a ``"fields"`` list.
    """

    file_handle: IO

    def __init__(
        self,
        marc_target: Union[bytes, str],
        encoding: str = "utf-8",
        stream: bool = False,
    ) -> None:
        """The constructor to which you can pass either raw marc or a file-like object.

        Basically the argument you pass in should be raw JSON in transmission format or
        an object that responds to read().

        Args:
            marc_target: a file-like object, the path of an existing file, or
                the JSON text itself (``str``, or ``bytes`` in ``encoding``).
            encoding: used to open ``marc_target`` when it is a path and to
                decode it when it is ``bytes``.
            stream: streaming is not implemented; when true a warning is
                written to stderr and the data is still loaded in full.
        """
        self.encoding = encoding
        opened_here = False
        if isinstance(marc_target, IOBase):
            self.file_handle = marc_target
        elif isinstance(marc_target, str) and os.path.exists(marc_target):
            # Honour the caller's encoding rather than the platform default.
            self.file_handle = open(marc_target, "r", encoding=encoding)
            opened_here = True
        else:
            # StringIO only accepts text, so raw bytes have to be decoded first.
            if isinstance(marc_target, bytes):
                text = marc_target.decode(encoding)
            else:
                text = marc_target
            self.file_handle = StringIO(text)  # type: ignore
        if stream:
            sys.stderr.write(
                "Streaming not yet implemented, your data will be loaded into memory\n"
            )
        try:
            self.records = json.load(self.file_handle, strict=False)
        finally:
            # Only close what this constructor opened; handles supplied by
            # the caller remain the caller's responsibility.
            if opened_here:
                self.file_handle.close()
        # Prime the iterator so next(reader) works without a prior iter(reader).
        self.iter = self._record_iter()

    def _record_iter(self) -> Iterator:
        """Return a fresh iterator over the JSON objects describing the records."""
        if hasattr(self.records, "__iter__") and not isinstance(self.records, dict):
            return iter(self.records)
        # A lone top-level object is treated as a batch of one record.
        return iter([self.records])

    def __iter__(self) -> Iterator:
        """Restart iteration from the first loaded record and return the reader."""
        self.iter = self._record_iter()
        return self

    def __next__(self) -> Record:
        """Build and return the next ``Record`` from the loaded JSON data.

        Raises:
            StopIteration: when every loaded record has been returned.
        """
        jobj = next(self.iter)
        rec = Record()
        rec.leader = Leader(jobj["leader"])
        for field in jobj["fields"]:
            # Each field is a one-entry mapping of tag -> content; only the
            # first entry is used.
            k, v = list(field.items())[0]
            if "subfields" in v and hasattr(v, "update"):
                # flatten m-i-j dict to list in pymarc
                subfields: list = [
                    Subfield(code=code, value=value)
                    for sub in v["subfields"]
                    for code, value in sub.items()
                ]
                fld = Field(
                    tag=k,
                    subfields=subfields,
                    indicators=Indicators(v["ind1"], v["ind2"]),
                )
            else:
                # No subfield mapping: the value is passed as the field's data.
                fld = Field(tag=k, data=v)
            rec.add_field(fld)
        return rec




[docs]
class MARCMakerReader(Reader):
    r"""MARCMaker Reader.

    Converts a MARCMaker textual representation of a Marc 21 record into a pymarc Record.
    see :func:`Record.__str__() <pymarc.record.Record.__str__>` for more information.

    Records are separated from each other by an empty line.

    Simple usage:

    .. code-block:: python

        from pymarc import MARCMakerReader

        ## pass in a file object
        reader = MARCMakerReader(open('file.mrk', 'r'))
        for record in reader:
            ...

        ## pass a string
        reader = MARCMakerReader("=LDR  xxx\n=022  ##$a0000-0000\n\n=LDR  yyy")
        for record in reader:
            ...
    """

    def __init__(self, target: Union[bytes, str], encoding: str = "utf-8") -> None:
        """The constructor to which you can pass either a str or a file-like object.

        Args:
            target: a file-like object, the path of an existing file, or the
                MARCMaker text itself (``str``, or ``bytes`` in ``encoding``).
            encoding: used to open ``target`` when it is a path and to decode
                any bytes obtained from ``target``.

        The whole content is read immediately and the handle is then closed,
        including a handle supplied by the caller.
        """
        if isinstance(target, IOBase):
            file_handle = target
        elif isinstance(target, str) and os.path.exists(target):
            file_handle = open(target, mode="r", encoding=encoding)
        else:
            # StringIO only accepts text, so raw bytes have to be decoded first.
            text = target.decode(encoding) if isinstance(target, bytes) else target
            file_handle = StringIO(text)  # type: ignore
        try:
            file_content = file_handle.read()
        finally:
            file_handle.close()
        if isinstance(file_content, bytes):
            # A handle opened in binary mode yields bytes; the parser needs text.
            file_content = file_content.decode(encoding)
        # Drop blank chunks (e.g. produced by trailing empty lines) so they do
        # not turn into phantom empty records.
        self.records = [
            chunk for chunk in file_content.split("\n\n") if chunk.strip()
        ]
        self.iter = iter(self.records)

    def _parse_line(self, line: str) -> Union[Leader, Field]:
        """Parse a MARCMaker line.

        A line looks like
            =LDR  00755cam  22002414a 4500
            or
            =008  010314s1999fr||||||||||||||||fre
            or
            =028  00$aSTMA 8007$bTamla Motown Records

        Returns:
            A ``Leader`` for the ``LDR`` line, otherwise a ``Field``.

        Raises:
            ValueError: if the line does not start with "=" or the tag is not
                followed by two spaces.
        """
        # startswith() also rejects an empty line with the intended ValueError
        # instead of failing with an IndexError.
        if not line.startswith("="):
            raise ValueError('Line should start with a "=".')
        if line[4:6] != "  ":
            raise ValueError(
                "Tag should be separated from the rest of the field by two spaces."
            )
        tag = line[1:4]
        data = line[6:]
        if tag == "LDR":
            return Leader(data)
        elif tag < "010":
            # Tags below "010" carry plain data: no indicators, no subfields.
            return Field(tag, data=data)
        indicators = Indicators(data[0], data[1])
        # the first $ is ignored to avoid an empty list item after the split
        subfields: List[Subfield] = [
            Subfield(subfield[:1], subfield[1:]) for subfield in data[3:].split("$")
        ]
        return Field(tag, indicators=indicators, subfields=subfields)

    def __next__(self) -> Record:
        """Iterate over a record's line to parse its fields.

        Returns:
            The ``Record`` built from the next chunk of text.

        Raises:
            StopIteration: when every chunk has been returned.
            exceptions.PymarcException: if a line cannot be parsed; the
                original error is chained as its cause.
        """
        record_txt = next(self.iter)
        record = Record()
        for line in record_txt.splitlines():
            if not line.strip():
                # A stray blank line (from an odd number of separating
                # newlines) carries no field.
                continue
            try:
                field = self._parse_line(line)
            except Exception as exc:
                raise exceptions.PymarcException(
                    f'Unable to parse line "{line}"'
                ) from exc
            if isinstance(field, Leader):
                record.leader = field
            else:
                record.add_field(field)
        return record