diff --git a/.github/workflows/github-ci.yaml b/.github/workflows/github-ci.yaml index 820ccdcaa5..d5d9bb4d4d 100644 --- a/.github/workflows/github-ci.yaml +++ b/.github/workflows/github-ci.yaml @@ -57,12 +57,12 @@ jobs: runs-on: ubuntu-20.04 strategy: matrix: - python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"] + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13-dev"] use-crypto-lib: ["cryptography"] include: - - python-version: "3.7" + - python-version: "3.8" use-crypto-lib: "pycryptodome" - - python-version: "3.7" + - python-version: "3.8" use-crypto-lib: "none" steps: - name: Update APT packages @@ -83,14 +83,14 @@ jobs: key: cache-downloaded-files - name: Setup Python uses: actions/setup-python@v5 - if: matrix.python-version == '3.7' || matrix.python-version == '3.8' || matrix.python-version == '3.9' || matrix.python-version == '3.10' + if: matrix.python-version == '3.8' || matrix.python-version == '3.9' || matrix.python-version == '3.10' with: python-version: ${{ matrix.python-version }} cache: 'pip' cache-dependency-path: '**/requirements/ci.txt' - name: Setup Python (3.11+) uses: actions/setup-python@v5 - if: matrix.python-version == '3.11' || matrix.python-version == '3.12' + if: matrix.python-version == '3.11' || matrix.python-version == '3.12' || matrix.python-version == '3.13-dev' with: python-version: ${{ matrix.python-version }} allow-prereleases: true @@ -102,11 +102,11 @@ jobs: - name: Install requirements (Python 3) run: | pip install -r requirements/ci.txt - if: matrix.python-version == '3.7' || matrix.python-version == '3.8' || matrix.python-version == '3.9' || matrix.python-version == '3.10' + if: matrix.python-version == '3.8' || matrix.python-version == '3.9' || matrix.python-version == '3.10' - name: Install requirements (Python 3.11+) run: | pip install -r requirements/ci-3.11.txt - if: matrix.python-version == '3.11' || matrix.python-version == '3.12' + if: matrix.python-version == '3.11' || matrix.python-version == 
'3.12' || matrix.python-version == '3.13-dev' - name: Remove pycryptodome and cryptography run: | pip uninstall pycryptodome cryptography -y diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 9f782ec080..b1a4fb27f3 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -12,6 +12,9 @@ on: permissions: contents: write +env: + HEAD_COMMIT_MESSAGE: ${{ github.event.head_commit.message }} + jobs: build_and_publish: name: Publish a new version @@ -24,7 +27,7 @@ jobs: - name: Extract version from commit message id: extract_version run: | - VERSION=$(echo "${{ github.event.head_commit.message }}" | grep -oP '(?<=REL: )\d+\.\d+\.\d+') + VERSION=$(echo "$HEAD_COMMIT_MESSAGE" | grep -oP '(?<=REL: )\d+\.\d+\.\d+') echo "version=$VERSION" >> $GITHUB_OUTPUT - name: Extract tag message from commit message @@ -32,7 +35,7 @@ jobs: run: | VERSION="${{ steps.extract_version.outputs.version }}" delimiter="$(openssl rand -hex 8)" - MESSAGE=$(echo "${{ github.event.head_commit.message }}" | sed "0,/REL: $VERSION/s///" ) + MESSAGE=$(echo "$HEAD_COMMIT_MESSAGE" | sed "0,/REL: $VERSION/s///" ) echo "message<<${delimiter}" >> $GITHUB_OUTPUT echo "$MESSAGE" >> $GITHUB_OUTPUT echo "${delimiter}" >> $GITHUB_OUTPUT diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 84f0b6ee43..89fec3b14e 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -19,6 +19,7 @@ history and [GitHub's 'Contributors' feature](https://github.com/py-pdf/pypdf/gr * [ediamondscience](https://github.com/ediamondscience) * [Ermeson, Felipe](https://github.com/FelipeErmeson) * [Freitag, François](https://github.com/francoisfreitag) +* [Gagnon, William G.](https://github.com/williamgagnon) * [Górny, Michał](https://github.com/mgorny) * [Grillo, Miguel](https://github.com/Ineffable22) * [Gutteridge, David H.](https://github.com/dhgutteridge) diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index 9a2d10a611..4729796799 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ 
-3,11 +3,10 @@ from typing import Any, Dict, List, Tuple, Union, cast from ._codecs import adobe_glyphs, charset_encoding -from ._utils import b_, logger_error, logger_warning +from ._utils import logger_error, logger_warning from .generic import ( DecodedStreamObject, DictionaryObject, - IndirectObject, NullObject, StreamObject, ) @@ -259,7 +258,7 @@ def prepare_cm(ft: DictionaryObject) -> bytes: cm: bytes if isinstance(tu, StreamObject): cm = b_(cast(DecodedStreamObject, ft["/ToUnicode"]).get_data()) - elif isinstance(tu, str) and tu.startswith("/Identity"): + else: # if (tu is None) or cast(str, tu).startswith("/Identity"): # the full range 0000-FFFF will be processed cm = b"beginbfrange\n<0000> <0001> <0000>\nendbfrange" if isinstance(cm, str): @@ -448,34 +447,27 @@ def compute_space_width( en: int = cast(int, ft["/LastChar"]) if st > space_code or en < space_code: raise Exception("Not in range") - if w[space_code - st] == 0: + if w[space_code - st].get_object() == 0: raise Exception("null width") - sp_width = w[space_code - st] + sp_width = w[space_code - st].get_object() except Exception: if "/FontDescriptor" in ft and "/MissingWidth" in cast( DictionaryObject, ft["/FontDescriptor"] ): - sp_width = ft["/FontDescriptor"]["/MissingWidth"] # type: ignore + sp_width = ft["/FontDescriptor"]["/MissingWidth"].get_object() # type: ignore else: # will consider width of char as avg(width)/2 m = 0 cpt = 0 - for x in w: - if x > 0: - m += x + for xx in w: + xx = xx.get_object() + if xx > 0: + m += xx cpt += 1 sp_width = m / max(1, cpt) / 2 - if isinstance(sp_width, IndirectObject): - # According to - # 'Table 122 - Entries common to all font descriptors (continued)' - # the MissingWidth should be a number, but according to #2286 it can - # be an indirect object - obj = sp_width.get_object() - if obj is None or isinstance(obj, NullObject): - return 0.0 - return obj # type: ignore - + if sp_width is None or isinstance(sp_width, NullObject): + sp_width = 0.0 return sp_width 
diff --git a/pypdf/_doc_common.py b/pypdf/_doc_common.py index d4c5c43c3c..ffbdb7882e 100644 --- a/pypdf/_doc_common.py +++ b/pypdf/_doc_common.py @@ -49,7 +49,6 @@ from ._page import PageObject, _VirtualList from ._page_labels import index2label as page_index2page_label from ._utils import ( - b_, deprecate_with_replacement, logger_warning, parse_iso8824_date, @@ -1258,7 +1257,7 @@ def xfa(self) -> Optional[Dict[str, Any]]: if isinstance(f, IndirectObject): field = cast(Optional[EncodedStreamObject], f.get_object()) if field: - es = zlib.decompress(b_(field._data)) + es = zlib.decompress(field._data) retval[tag] = es return retval diff --git a/pypdf/_encryption.py b/pypdf/_encryption.py index 5ddd8d0efe..e5cdd9324e 100644 --- a/pypdf/_encryption.py +++ b/pypdf/_encryption.py @@ -43,7 +43,7 @@ rc4_encrypt, ) -from ._utils import b_, logger_warning +from ._utils import logger_warning from .generic import ( ArrayObject, ByteStringObject, @@ -78,7 +78,7 @@ def encrypt_object(self, obj: PdfObject) -> PdfObject: elif isinstance(obj, StreamObject): obj2 = StreamObject() obj2.update(obj) - obj2.set_data(self.stm_crypt.encrypt(b_(obj._data))) + obj2.set_data(self.stm_crypt.encrypt(obj._data)) for key, value in obj.items(): # Dont forget the Stream dict. obj2[key] = self.encrypt_object(value) obj = obj2 @@ -96,7 +96,7 @@ def decrypt_object(self, obj: PdfObject) -> PdfObject: data = self.str_crypt.decrypt(obj.original_bytes) obj = create_string_object(data) elif isinstance(obj, StreamObject): - obj._data = self.stm_crypt.decrypt(b_(obj._data)) + obj._data = self.stm_crypt.decrypt(obj._data) for key, value in obj.items(): # Dont forget the Stream dict. 
obj[key] = self.decrypt_object(value) elif isinstance(obj, DictionaryObject): diff --git a/pypdf/_merger.py b/pypdf/_merger.py index 7176a1adf7..a52a354e38 100644 --- a/pypdf/_merger.py +++ b/pypdf/_merger.py @@ -46,7 +46,6 @@ from ._utils import ( StrByteType, deprecate_with_replacement, - str_, ) from ._writer import PdfWriter from .constants import GoToActionArguments, TypArguments, TypFitArguments @@ -82,6 +81,15 @@ def __init__(self, pagedata: PageObject, src: PdfReader, id: int) -> None: self.id = id +# transferred from _utils, as this function is only required here +# and the merger will soon be deprecated +def str_(b: Any) -> str: # pragma: no cover + if isinstance(b, bytes): + return b.decode("latin-1") + else: + return str(b) # will return b.__str__() if defined + + class PdfMerger: """ Use :class:`PdfWriter` instead. diff --git a/pypdf/_page.py b/pypdf/_page.py index 63038d9d07..48cdeb149f 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -28,7 +28,6 @@ # POSSIBILITY OF SUCH DAMAGE. 
import math -import sys from decimal import Decimal from pathlib import Path from typing import ( @@ -38,6 +37,7 @@ Iterable, Iterator, List, + Literal, Optional, Sequence, Set, @@ -85,12 +85,6 @@ StreamObject, ) -if sys.version_info >= (3, 8): - from typing import Literal -else: - from typing_extensions import Literal - - MERGE_CROP_BOX = "cropbox" # pypdf<=3.4.0 used 'trimbox' @@ -852,7 +846,7 @@ def _add_transformation_matrix( FloatObject(e), FloatObject(f), ], - " cm", + b"cm", ], ) return contents @@ -870,7 +864,7 @@ def _get_contents_as_bytes(self) -> Optional[bytes]: if isinstance(obj, list): return b"".join(x.get_object().get_data() for x in obj) else: - return cast(bytes, cast(EncodedStreamObject, obj).get_data()) + return cast(EncodedStreamObject, obj).get_data() else: return None @@ -1063,11 +1057,11 @@ def _merge_page( rect.height, ], ), - "re", + b"re", ), ) - page2content.operations.insert(1, ([], "W")) - page2content.operations.insert(2, ([], "n")) + page2content.operations.insert(1, ([], b"W")) + page2content.operations.insert(2, ([], b"n")) if page2transformation is not None: page2content = page2transformation(page2content) page2content = PageObject._content_stream_rename( @@ -1201,11 +1195,11 @@ def _merge_page_writer( rect.height, ], ), - "re", + b"re", ), ) - page2content.operations.insert(1, ([], "W")) - page2content.operations.insert(2, ([], "n")) + page2content.operations.insert(1, ([], b"W")) + page2content.operations.insert(2, ([], b"n")) if page2transformation is not None: page2content = page2transformation(page2content) page2content = PageObject._content_stream_rename( diff --git a/pypdf/_protocols.py b/pypdf/_protocols.py index 9f413660bb..b5fa14879c 100644 --- a/pypdf/_protocols.py +++ b/pypdf/_protocols.py @@ -2,13 +2,7 @@ from abc import abstractmethod from pathlib import Path -from typing import IO, Any, Dict, List, Optional, Tuple, Union - -try: - # Python 3.8+: https://peps.python.org/pep-0586 - from typing import Protocol -except 
ImportError: - from typing_extensions import Protocol # type: ignore[assignment] +from typing import IO, Any, Dict, List, Optional, Protocol, Tuple, Union from ._utils import StrByteType, StreamType diff --git a/pypdf/_reader.py b/pypdf/_reader.py index aeababa7b7..7c084107c5 100644 --- a/pypdf/_reader.py +++ b/pypdf/_reader.py @@ -51,7 +51,6 @@ from ._utils import ( StrByteType, StreamType, - b_, logger_warning, read_non_whitespace, read_previous_line, @@ -328,7 +327,7 @@ def _get_object_from_stream( assert cast(str, obj_stm["/Type"]) == "/ObjStm" # /N is the number of indirect objects in the stream assert idx < obj_stm["/N"] - stream_data = BytesIO(b_(obj_stm.get_data())) + stream_data = BytesIO(obj_stm.get_data()) for i in range(obj_stm["/N"]): # type: ignore read_non_whitespace(stream_data) stream_data.seek(-1, 1) @@ -932,7 +931,7 @@ def _read_pdf15_xref_stream( xrefstream = cast(ContentStream, read_object(stream, self)) assert cast(str, xrefstream["/Type"]) == "/XRef" self.cache_indirect_object(generation, idnum, xrefstream) - stream_data = BytesIO(b_(xrefstream.get_data())) + stream_data = BytesIO(xrefstream.get_data()) # Index pairs specify the subsections in the dictionary. If # none create one subsection that spans everything. idx_pairs = xrefstream.get("/Index", [0, xrefstream.get("/Size")]) diff --git a/pypdf/_text_extraction/_layout_mode/_fixed_width_page.py b/pypdf/_text_extraction/_layout_mode/_fixed_width_page.py index 1be500959c..e7af1b2340 100644 --- a/pypdf/_text_extraction/_layout_mode/_fixed_width_page.py +++ b/pypdf/_text_extraction/_layout_mode/_fixed_width_page.py @@ -1,10 +1,9 @@ """Extract PDF text preserving the layout of the source PDF""" -import sys from itertools import groupby from math import ceil from pathlib import Path -from typing import Any, Dict, Iterator, List, Optional, Tuple +from typing import Any, Dict, Iterator, List, Literal, Optional, Tuple, TypedDict from ..._utils import logger_warning from .. 
import LAYOUT_NEW_BT_GROUP_SPACE_WIDTHS @@ -12,11 +11,6 @@ from ._text_state_manager import TextStateManager from ._text_state_params import TextStateParams -if sys.version_info >= (3, 8): - from typing import Literal, TypedDict -else: - from typing_extensions import Literal, TypedDict - class BTGroup(TypedDict): """ diff --git a/pypdf/_text_extraction/_layout_mode/_font.py b/pypdf/_text_extraction/_layout_mode/_font.py index a912fddb27..40655b1b22 100644 --- a/pypdf/_text_extraction/_layout_mode/_font.py +++ b/pypdf/_text_extraction/_layout_mode/_font.py @@ -1,8 +1,9 @@ """Font constants and classes for "layout" mode text operations""" from dataclasses import dataclass, field -from typing import Any, Dict, Sequence, Union +from typing import Any, Dict, Sequence, Union, cast +from ...errors import ParseError from ...generic import IndirectObject from ._font_widths import STANDARD_WIDTHS @@ -58,6 +59,7 @@ def __post_init__(self) -> None: skip_count = 0 _w = d_font.get("/W", []) for idx, w_entry in enumerate(_w): + w_entry = w_entry.get_object() if skip_count: skip_count -= 1 continue @@ -66,13 +68,14 @@ def __post_init__(self) -> None: # warning and or use reader's "strict" to force an ex??? 
continue # check for format (1): `int [int int int int ...]` - if isinstance(_w[idx + 1], Sequence): - start_idx, width_list = _w[idx : idx + 2] + w_next_entry = _w[idx + 1].get_object() + if isinstance(w_next_entry, Sequence): + start_idx, width_list = w_entry, w_next_entry self.width_map.update( { ord_map[_cidx]: _width for _cidx, _width in zip( - range(start_idx, start_idx + len(width_list), 1), + range(cast(int, start_idx), cast(int, start_idx) + len(width_list), 1), width_list, ) if _cidx in ord_map @@ -80,18 +83,23 @@ def __post_init__(self) -> None: ) skip_count = 1 # check for format (2): `int int int` - if not isinstance(_w[idx + 1], Sequence) and not isinstance( - _w[idx + 2], Sequence - ): - start_idx, stop_idx, const_width = _w[idx : idx + 3] + elif isinstance(w_next_entry, (int, float)) and isinstance(_w[idx + 2].get_object(), (int, float)): + start_idx, stop_idx, const_width = w_entry, w_next_entry, _w[idx + 2].get_object() self.width_map.update( { ord_map[_cidx]: const_width - for _cidx in range(start_idx, stop_idx + 1, 1) + for _cidx in range(cast(int, start_idx), cast(int, stop_idx + 1), 1) if _cidx in ord_map } ) skip_count = 2 + else: + # Note: this doesn't handle the case of out of bounds (reaching the end of the width definitions + # while expecting more elements). This raises an IndexError which is sufficient. + raise ParseError( + f"Invalid font width definition. 
Next elements: {w_entry}, {w_next_entry}, {_w[idx + 2]}" + ) # pragma: no cover + if not self.width_map and "/BaseFont" in self.font_dictionary: for key in STANDARD_WIDTHS: if self.font_dictionary["/BaseFont"].startswith(f"/{key}"): diff --git a/pypdf/_utils.py b/pypdf/_utils.py index 38c0d67d7a..94d45cf6d7 100644 --- a/pypdf/_utils.py +++ b/pypdf/_utils.py @@ -336,34 +336,6 @@ def mark_location(stream: StreamType) -> None: stream.seek(-radius, 1) -B_CACHE: Dict[str, bytes] = {} - - -def b_(s: Union[str, bytes]) -> bytes: - if isinstance(s, bytes): - return s - bc = B_CACHE - if s in bc: - return bc[s] - try: - r = s.encode("latin-1") - if len(s) < 2: - bc[s] = r - return r - except Exception: - r = s.encode("utf-8") - if len(s) < 2: - bc[s] = r - return r - - -def str_(b: Any) -> str: - if isinstance(b, bytes): - return b.decode("latin-1") - else: - return str(b) # will return b.__str__() if defined - - @overload def ord_(b: str) -> int: ... @@ -390,20 +362,6 @@ def ord_(b: Union[int, str, bytes]) -> Union[int, bytes]: WHITESPACES_AS_REGEXP = b"[" + WHITESPACES_AS_BYTES + b"]" -def paeth_predictor(left: int, up: int, up_left: int) -> int: - p = left + up - up_left - dist_left = abs(p - left) - dist_up = abs(p - up) - dist_up_left = abs(p - up_left) - - if dist_left <= dist_up and dist_left <= dist_up_left: - return left - elif dist_up <= dist_up_left: - return up - else: - return up_left - - def deprecate(msg: str, stacklevel: int = 3) -> None: warnings.warn(msg, DeprecationWarning, stacklevel=stacklevel) @@ -414,12 +372,17 @@ def deprecation(msg: str) -> None: def deprecate_with_replacement(old_name: str, new_name: str, removed_in: str) -> None: """Raise an exception that a feature will be removed, but has a replacement.""" - deprecate(f"{old_name} is deprecated and will be removed in pypdf {removed_in}. Use {new_name} instead.", 4) + deprecate( + f"{old_name} is deprecated and will be removed in pypdf {removed_in}. 
Use {new_name} instead.", + 4, + ) def deprecation_with_replacement(old_name: str, new_name: str, removed_in: str) -> None: """Raise an exception that a feature was already removed, but has a replacement.""" - deprecation(f"{old_name} is deprecated and was removed in pypdf {removed_in}. Use {new_name} instead.") + deprecation( + f"{old_name} is deprecated and was removed in pypdf {removed_in}. Use {new_name} instead." + ) def deprecate_no_replacement(name: str, removed_in: str) -> None: diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 00b9d498c0..d73c00e3d4 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -62,7 +62,6 @@ StrByteType, StreamType, _get_max_pdf_version_header, - b_, deprecate_with_replacement, logger_warning, ) @@ -678,9 +677,10 @@ def add_attachment(self, filename: str, data: Union[str, bytes]) -> None: # Hello world! # endstream # endobj - + if isinstance(data, str): + data = data.encode("latin-1") file_entry = DecodedStreamObject() - file_entry.set_data(b_(data)) + file_entry.set_data(data) file_entry.update({NameObject(PA.TYPE): NameObject("/EmbeddedFile")}) # The Filespec entry diff --git a/pypdf/_xobj_image_helpers.py b/pypdf/_xobj_image_helpers.py index 45b0c145be..7a3f40d95c 100644 --- a/pypdf/_xobj_image_helpers.py +++ b/pypdf/_xobj_image_helpers.py @@ -2,11 +2,11 @@ import sys from io import BytesIO -from typing import Any, List, Tuple, Union, cast +from typing import Any, List, Literal, Tuple, Union, cast from ._utils import check_if_whitespace_only, logger_warning from .constants import ColorSpaces -from .errors import PdfReadError +from .errors import EmptyImageDataError, PdfReadError from .generic import ( ArrayObject, DecodedStreamObject, @@ -15,13 +15,6 @@ NullObject, ) -if sys.version_info[:2] >= (3, 8): - from typing import Literal -else: - # PEP 586 introduced typing.Literal with Python 3.8 - # For older Python versions, the backport typing_extensions is necessary: - from typing_extensions import Literal - if 
sys.version_info[:2] >= (3, 10): from typing import TypeAlias else: @@ -148,9 +141,14 @@ def _extended_image_frombytes( img = Image.frombytes(mode, size, data) except ValueError as exc: nb_pix = size[0] * size[1] - if len(data) % nb_pix != 0: + data_length = len(data) + if data_length == 0: + raise EmptyImageDataError( + "Data is 0 bytes, cannot process an image from empty data." + ) from exc + if data_length % nb_pix != 0: raise exc - k = nb_pix * len(mode) / len(data) + k = nb_pix * len(mode) / data_length data = b"".join([bytes((x,) * int(k)) for x in data]) img = Image.frombytes(mode, size, data) return img diff --git a/pypdf/annotations/_markup_annotations.py b/pypdf/annotations/_markup_annotations.py index 4db8dfdbf0..98a222483b 100644 --- a/pypdf/annotations/_markup_annotations.py +++ b/pypdf/annotations/_markup_annotations.py @@ -104,9 +104,9 @@ def __init__( self[NameObject("/Rect")] = RectangleObject(rect) font_str = "font: " - if bold is True: + if bold: font_str = f"{font_str}bold " - if italic is True: + if italic: font_str = f"{font_str}italic " font_str = f"{font_str}{font} {font_size}" font_str = f"{font_str};text-align:left;color:#{font_color}" diff --git a/pypdf/annotations/_non_markup_annotations.py b/pypdf/annotations/_non_markup_annotations.py index dcdb3b0ff8..6272cceee6 100644 --- a/pypdf/annotations/_non_markup_annotations.py +++ b/pypdf/annotations/_non_markup_annotations.py @@ -1,6 +1,5 @@ from typing import TYPE_CHECKING, Any, Optional, Tuple, Union -from ..constants import AnnotationFlag from ..generic._base import ( BooleanObject, NameObject, @@ -12,8 +11,6 @@ from ..generic._rectangle import RectangleObject from ._base import AnnotationDictionary -DEFAULT_ANNOTATION_FLAG = AnnotationFlag(0) - class Link(AnnotationDictionary): def __init__( diff --git a/pypdf/errors.py b/pypdf/errors.py index c962dec662..ad197ffc11 100644 --- a/pypdf/errors.py +++ b/pypdf/errors.py @@ -59,4 +59,8 @@ class EmptyFileError(PdfReadError): """Raised when a 
PDF file is empty or has no content.""" +class EmptyImageDataError(PyPdfError): + """Raised when trying to process an image that has no data.""" + + STREAM_TRUNCATED_PREMATURELY = "Stream has ended unexpectedly" diff --git a/pypdf/filters.py b/pypdf/filters.py index 137e3603a3..43730cc8e9 100644 --- a/pypdf/filters.py +++ b/pypdf/filters.py @@ -43,7 +43,7 @@ from ._utils import ( WHITESPACES_AS_BYTES, - b_, + deprecate, deprecate_with_replacement, deprecation_no_replacement, logger_warning, @@ -376,20 +376,18 @@ class LZWDecode: """ Taken from: - http://www.java2s.com/Open-Source/Java-Document/PDF/PDF- - Renderer/com/sun/pdfview/decode/LZWDecode.java.htm + http://www.java2s.com/Open-Source/Java-Document/PDF/PDF-Renderer/com/sun/pdfview/decode/LZWDecode.java.htm """ class Decoder: + STOP = 257 + CLEARDICT = 256 + def __init__(self, data: bytes) -> None: - self.STOP = 257 - self.CLEARDICT = 256 self.data = data self.bytepos = 0 self.bitpos = 0 - self.dict = [""] * 4096 - for i in range(256): - self.dict[i] = chr(i) + self.dict = [struct.pack("B", i) for i in range(256)] + [b""] * (4096 - 256) self.reset_dict() def reset_dict(self) -> None: @@ -416,7 +414,7 @@ def next_code(self) -> int: self.bytepos = self.bytepos + 1 return value - def decode(self) -> str: + def decode(self) -> bytes: """ TIFF 6.0 specification explains in sufficient details the steps to implement the LZW encode() and decode() algorithms. 
@@ -429,7 +427,7 @@ def decode(self) -> str: PdfReadError: If the stop code is missing """ cW = self.CLEARDICT - baos = "" + baos = b"" while True: pW = cW cW = self.next_code() @@ -444,11 +442,11 @@ def decode(self) -> str: else: if cW < self.dictlen: baos += self.dict[cW] - p = self.dict[pW] + self.dict[cW][0] + p = self.dict[pW] + self.dict[cW][0:1] self.dict[self.dictlen] = p self.dictlen += 1 else: - p = self.dict[pW] + self.dict[pW][0] + p = self.dict[pW] + self.dict[pW][0:1] baos += p self.dict[self.dictlen] = p self.dictlen += 1 @@ -460,11 +458,11 @@ def decode(self) -> str: return baos @staticmethod - def decode( + def _decodeb( data: bytes, decode_parms: Optional[DictionaryObject] = None, **kwargs: Any, - ) -> str: + ) -> bytes: """ Decode an LZW encoded data stream. @@ -476,9 +474,28 @@ def decode( decoded data. """ # decode_parms is unused here - return LZWDecode.Decoder(data).decode() + @staticmethod + def decode( + data: bytes, + decode_parms: Optional[DictionaryObject] = None, + **kwargs: Any, + ) -> str: # deprecated + """ + Decode an LZW encoded data stream. + + Args: + data: ``bytes`` or ``str`` text to decode. + decode_parms: a dictionary of parameter values. + + Returns: + decoded data. + """ + # decode_parms is unused here + deprecate("LZWDecode.decode will return bytes instead of str in pypdf 6.0.0") + return LZWDecode.Decoder(data).decode().decode("latin-1") + class ASCII85Decode: """Decodes string ASCII85-encoded data into a byte format.""" @@ -651,7 +668,7 @@ def decode( return tiff_header + data -def decode_stream_data(stream: Any) -> Union[bytes, str]: # utils.StreamObject +def decode_stream_data(stream: Any) -> bytes: # utils.StreamObject """ Decode the stream data based on the specified filters. 
@@ -678,7 +695,7 @@ def decode_stream_data(stream: Any) -> Union[bytes, str]: # utils.StreamObject decodparms = stream.get(SA.DECODE_PARMS, ({},) * len(filters)) if not isinstance(decodparms, (list, tuple)): decodparms = (decodparms,) - data: bytes = b_(stream._data) + data: bytes = stream._data # If there is not data to decode we should not try to decode the data. if data: for filter_type, params in zip(filters, decodparms): @@ -691,7 +708,7 @@ def decode_stream_data(stream: Any) -> Union[bytes, str]: # utils.StreamObject elif filter_type in (FT.RUN_LENGTH_DECODE, FTA.RL): data = RunLengthDecode.decode(data) elif filter_type in (FT.LZW_DECODE, FTA.LZW): - data = LZWDecode.decode(data, params) # type: ignore + data = LZWDecode._decodeb(data, params) elif filter_type in (FT.ASCII_85_DECODE, FTA.A85): data = ASCII85Decode.decode(data) elif filter_type == FT.DCT_DECODE: diff --git a/pypdf/generic/_base.py b/pypdf/generic/_base.py index 2d606b4184..309d389cc8 100644 --- a/pypdf/generic/_base.py +++ b/pypdf/generic/_base.py @@ -30,18 +30,17 @@ import re from binascii import unhexlify from math import log10 +from struct import iter_unpack from typing import Any, Callable, ClassVar, Dict, Optional, Sequence, Union, cast from .._codecs import _pdfdoc_encoding_rev from .._protocols import PdfObjectProtocol, PdfWriterProtocol from .._utils import ( StreamType, - b_, deprecate_no_replacement, logger_warning, read_non_whitespace, read_until_regex, - str_, ) from ..errors import STREAM_TRUNCATED_PREMATURELY, PdfReadError, PdfStreamError @@ -308,6 +307,10 @@ def __getitem__(self, key: Any) -> Any: # items should be extracted from pointed Object return self._get_object_with_check()[key] # type: ignore + def __float__(self) -> str: + # in this case we are looking for the pointed data + return self.get_object().__float__() # type: ignore + def __str__(self) -> str: # in this case we are looking for the pointed data return self.get_object().__str__() @@ -369,10 +372,10 @@ def 
read_from_stream(stream: StreamType, pdf: Any) -> "IndirectObject": # PdfRe class FloatObject(float, PdfObject): def __new__( - cls, value: Union[str, Any] = "0.0", context: Optional[Any] = None + cls, value: Any = "0.0", context: Optional[Any] = None ) -> "FloatObject": try: - value = float(str_(value)) + value = float(value) return float.__new__(cls, value) except Exception as e: # If this isn't a valid decimal (happens in malformed PDFs) @@ -599,15 +602,16 @@ def write_to_stream( ) bytearr = self.get_encoded_bytes() stream.write(b"(") - for c in bytearr: - if not chr(c).isalnum() and c != b" ": + for c_ in iter_unpack("c", bytearr): + c = cast(bytes, c_[0]) + if not c.isalnum() and c != b" ": # This: # stream.write(rf"\{c:0>3o}".encode()) # gives # https://github.com/davidhalter/parso/issues/207 - stream.write(("\\%03o" % c).encode()) + stream.write(b"\\%03o" % ord(c)) else: - stream.write(b_(chr(c))) + stream.write(c) stream.write(b")") @@ -710,12 +714,13 @@ def read_from_stream(stream: StreamType, pdf: Any) -> "NameObject": # PdfReader def encode_pdfdocencoding(unicode_string: str) -> bytes: - retval = bytearray() - for c in unicode_string: - try: - retval += b_(chr(_pdfdoc_encoding_rev[c])) - except KeyError: - raise UnicodeEncodeError( - "pdfdocencoding", c, -1, -1, "does not exist in translation table" - ) - return bytes(retval) + try: + return bytes([_pdfdoc_encoding_rev[k] for k in unicode_string]) + except KeyError: + raise UnicodeEncodeError( + "pdfdocencoding", + unicode_string, + -1, + -1, + "does not exist in translation table", + ) diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index 87d6886742..399836be5f 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -52,7 +52,6 @@ from .._utils import ( WHITESPACES, StreamType, - b_, deprecate_no_replacement, deprecate_with_replacement, logger_warning, @@ -843,7 +842,7 @@ def _reset_node_tree_relationship(child_obj: Any) -> None: class 
StreamObject(DictionaryObject): def __init__(self) -> None: - self._data: Union[bytes, str] = b"" + self._data: bytes = b"" self.decoded_self: Optional[DecodedStreamObject] = None def _clone( @@ -877,7 +876,7 @@ def _clone( pass super()._clone(src, pdf_dest, force_duplicate, ignore_fields, visited) - def get_data(self) -> Union[bytes, str]: + def get_data(self) -> bytes: return self._data def set_data(self, data: bytes) -> None: @@ -885,7 +884,7 @@ def set_data(self, data: bytes) -> None: def hash_value_data(self) -> bytes: data = super().hash_value_data() - data += b_(self._data) + data += self._data return data def write_to_stream( @@ -955,7 +954,7 @@ def flate_encode(self, level: int = -1) -> "EncodedStreamObject": retval[NameObject(SA.FILTER)] = f if params is not None: retval[NameObject(SA.DECODE_PARMS)] = params - retval._data = FlateDecode.encode(b_(self._data), level) + retval._data = FlateDecode.encode(self._data, level) return retval def decode_as_image(self) -> Any: @@ -993,7 +992,7 @@ def __init__(self) -> None: self.decoded_self: Optional[DecodedStreamObject] = None # This overrides the parent method: - def get_data(self) -> Union[bytes, str]: + def get_data(self) -> bytes: from ..filters import decode_stream_data if self.decoded_self is not None: @@ -1003,7 +1002,7 @@ def get_data(self) -> Union[bytes, str]: # create decoded object decoded = DecodedStreamObject() - decoded.set_data(b_(decode_stream_data(self))) + decoded.set_data(decode_stream_data(self)) for key, value in list(self.items()): if key not in (SA.LENGTH, SA.FILTER, SA.DECODE_PARMS): decoded[key] = value @@ -1058,7 +1057,7 @@ def __init__( # The inner list has two elements: # Element 0: List # Element 1: str - self._operations: List[Tuple[Any, Any]] = [] + self._operations: List[Tuple[Any, bytes]] = [] # stream may be a StreamObject or an ArrayObject containing # multiple StreamObjects to be cat'd together. 
@@ -1069,14 +1068,14 @@ def __init__( if isinstance(stream, ArrayObject): data = b"" for s in stream: - data += b_(s.get_object().get_data()) + data += s.get_object().get_data() if len(data) == 0 or data[-1] != b"\n": data += b"\n" super().set_data(bytes(data)) else: stream_data = stream.get_data() assert stream_data is not None - super().set_data(b_(stream_data)) + super().set_data(stream_data) self.forced_encoding = forced_encoding def clone( @@ -1132,7 +1131,7 @@ def _clone( ignore_fields: """ src_cs = cast("ContentStream", src) - super().set_data(b_(src_cs._data)) + super().set_data(src_cs._data) self.pdf = pdf_dest self._operations = list(src_cs._operations) self.forced_encoding = src_cs.forced_encoding @@ -1249,10 +1248,10 @@ def get_data(self) -> bytes: for op in operands: op.write_to_stream(new_data) new_data.write(b" ") - new_data.write(b_(operator)) + new_data.write(operator) new_data.write(b"\n") self._data = new_data.getvalue() - return b_(self._data) + return self._data # This overrides the parent method: def set_data(self, data: bytes) -> None: @@ -1262,21 +1261,21 @@ def set_data(self, data: bytes) -> None: @property def operations(self) -> List[Tuple[Any, Any]]: if not self._operations and self._data: - self._parse_content_stream(BytesIO(b_(self._data))) + self._parse_content_stream(BytesIO(self._data)) self._data = b"" return self._operations @operations.setter - def operations(self, operations: List[Tuple[Any, Any]]) -> None: + def operations(self, operations: List[Tuple[Any, bytes]]) -> None: self._operations = operations self._data = b"" def isolate_graphics_state(self) -> None: if self._operations: - self._operations.insert(0, ([], "q")) - self._operations.append(([], "Q")) + self._operations.insert(0, ([], b"q")) + self._operations.append(([], b"Q")) elif self._data: - self._data = b"q\n" + b_(self._data) + b"\nQ\n" + self._data = b"q\n" + self._data + b"\nQ\n" # This overrides the parent method: def write_to_stream( diff --git 
a/pypdf/generic/_utils.py b/pypdf/generic/_utils.py index fdcdc33399..b5ac6632ac 100644 --- a/pypdf/generic/_utils.py +++ b/pypdf/generic/_utils.py @@ -2,7 +2,7 @@ from typing import Dict, List, Tuple, Union from .._codecs import _pdfdoc_encoding -from .._utils import StreamType, b_, logger_warning, read_non_whitespace +from .._utils import StreamType, logger_warning, read_non_whitespace from ..errors import STREAM_TRUNCATED_PREMATURELY, PdfStreamError from ._base import ByteStringObject, TextStringObject @@ -16,7 +16,7 @@ def read_hex_string_from_stream( forced_encoding: Union[None, str, List[str], Dict[int, str]] = None, ) -> Union["TextStringObject", "ByteStringObject"]: stream.read(1) - txt = "" + arr = [] x = b"" while True: tok = read_non_whitespace(stream) @@ -26,13 +26,37 @@ def read_hex_string_from_stream( break x += tok if len(x) == 2: - txt += chr(int(x, base=16)) + arr.append(int(x, base=16)) x = b"" if len(x) == 1: x += b"0" - if len(x) == 2: - txt += chr(int(x, base=16)) - return create_string_object(b_(txt), forced_encoding) + if x != b"": + arr.append(int(x, base=16)) + return create_string_object(bytes(arr), forced_encoding) + + +__ESPACE_DICT__ = { + b"n": ord(b"\n"), + b"r": ord(b"\r"), + b"t": ord(b"\t"), + b"b": ord(b"\b"), + b"f": ord(b"\f"), + b"(": ord(b"("), + b")": ord(b")"), + b"/": ord(b"/"), + b"\\": ord(b"\\"), + b" ": ord(b" "), + b"%": ord(b"%"), + b"<": ord(b"<"), + b">": ord(b">"), + b"[": ord(b"["), + b"]": ord(b"]"), + b"#": ord(b"#"), + b"_": ord(b"_"), + b"&": ord(b"&"), + b"$": ord(b"$"), +} +__BACKSLASH_CODE__ = 92 def read_string_from_stream( @@ -54,30 +78,9 @@ def read_string_from_stream( break elif tok == b"\\": tok = stream.read(1) - escape_dict = { - b"n": b"\n", - b"r": b"\r", - b"t": b"\t", - b"b": b"\b", - b"f": b"\f", - b"c": rb"\c", - b"(": b"(", - b")": b")", - b"/": b"/", - b"\\": b"\\", - b" ": b" ", - b"%": b"%", - b"<": b"<", - b">": b">", - b"[": b"[", - b"]": b"]", - b"#": b"#", - b"_": b"_", - b"&": b"&", - 
b"$": b"$", - } try: - tok = escape_dict[tok] + txt.append(__ESPACE_DICT__[tok]) + continue except KeyError: if b"0" <= tok <= b"7": # "The number ddd may consist of one, two, or three @@ -85,6 +88,7 @@ def read_string_from_stream( # Three octal digits shall be used, with leading zeros # as needed, if the next character of the string is also # a digit." (PDF reference 7.3.4.2, p 16) + sav = stream.tell() - 1 for _ in range(2): ntok = stream.read(1) if b"0" <= ntok <= b"7": @@ -92,7 +96,13 @@ def read_string_from_stream( else: stream.seek(-1, 1) # ntok has to be analyzed break - tok = b_(chr(int(tok, base=8))) + i = int(tok, base=8) + if i > 255: + txt.append(__BACKSLASH_CODE__) + stream.seek(sav) + else: + txt.append(i) + continue elif tok in b"\n\r": # This case is hit when a backslash followed by a line # break occurs. If it's a multi-char EOL, consume the @@ -102,12 +112,13 @@ def read_string_from_stream( stream.seek(-1, 1) # Then don't add anything to the actual string, since this # line break was escaped: - tok = b"" + continue else: msg = f"Unexpected escaped string: {tok.decode('utf-8','ignore')}" logger_warning(msg, __name__) - txt.append(tok) - return create_string_object(b"".join(txt), forced_encoding) + txt.append(__BACKSLASH_CODE__) + txt.append(ord(tok)) + return create_string_object(bytes(txt), forced_encoding) def create_string_object( diff --git a/pypdf/types.py b/pypdf/types.py index b8fbab92cf..e383dc7b1f 100644 --- a/pypdf/types.py +++ b/pypdf/types.py @@ -1,13 +1,7 @@ """Helpers for working with PDF types.""" import sys -from typing import List, Union - -if sys.version_info[:2] >= (3, 8): - # Python 3.8+: https://peps.python.org/pep-0586 - from typing import Literal -else: - from typing_extensions import Literal +from typing import List, Literal, Union if sys.version_info[:2] >= (3, 10): # Python 3.10+: https://www.python.org/dev/peps/pep-0484 diff --git a/requirements/ci-3.11.txt b/requirements/ci-3.11.txt index f382fe2b94..2101771181 100644 
--- a/requirements/ci-3.11.txt +++ b/requirements/ci-3.11.txt @@ -6,7 +6,7 @@ # attrs==23.1.0 # via flake8-bugbear -coverage[toml]==7.3.0 +coverage[toml]==7.6.0 # via # -r requirements/ci.in # pytest-cov @@ -35,7 +35,7 @@ mypy-extensions==1.0.0 # via mypy packaging==23.1 # via pytest -pillow==10.0.1 +pillow==10.4.0 # via # -r requirements/ci.in # fpdf2 diff --git a/resources/Seige_of_Vicksburg_Sample_OCR-crazyones-merged.pdf b/resources/Seige_of_Vicksburg_Sample_OCR-crazyones-merged.pdf index 0e9633ac16..a53f28f0be 100644 Binary files a/resources/Seige_of_Vicksburg_Sample_OCR-crazyones-merged.pdf and b/resources/Seige_of_Vicksburg_Sample_OCR-crazyones-merged.pdf differ diff --git a/tests/test_cmap.py b/tests/test_cmap.py index 9dcfb252d5..371db8c245 100644 --- a/tests/test_cmap.py +++ b/tests/test_cmap.py @@ -1,13 +1,19 @@ """Test the pypdf_cmap module.""" from io import BytesIO +from pathlib import Path import pytest -from pypdf import PdfReader +from pypdf import PdfReader, PdfWriter from pypdf._cmap import build_char_map +from pypdf.generic import ArrayObject, NameObject, NullObject from . 
import get_data_from_url +TESTS_ROOT = Path(__file__).parent.resolve() +PROJECT_ROOT = TESTS_ROOT.parent +RESOURCE_ROOT = PROJECT_ROOT / "resources" + @pytest.mark.enable_socket() @pytest.mark.slow() @@ -206,3 +212,29 @@ def test_eten_b5(): """Issue #2356""" reader = PdfReader(BytesIO(get_data_from_url(name="iss2290.pdf"))) reader.pages[0].extract_text().startswith("1/7 \n富邦新終身壽險") + + +@pytest.mark.enable_socket() +def test_missing_entries_in_cmap(): + """ + Issue #2702: this issue is observed on damaged pdfs + use of this file in test has been discarded as too slow/long + we will create the same error from crazyones + """ + pdf_path = RESOURCE_ROOT / "crazyones.pdf" + reader = PdfReader(pdf_path) + p = reader.pages[0] + p["/Resources"]["/Font"]["/F1"][NameObject("/ToUnicode")] = IndirectObject( + 99999999, 0, reader + ) + p.extract_text() + + +def test_null_missing_width(): + """For coverage of 2792""" + writer = PdfWriter(RESOURCE_ROOT / "crazyones.pdf") + page = writer.pages[0] + ft = page["/Resources"]["/Font"]["/F1"] + ft[NameObject("/Widths")] = ArrayObject() + ft["/FontDescriptor"][NameObject("/MissingWidth")] = NullObject() + page.extract_text() diff --git a/tests/test_page.py b/tests/test_page.py index cb7b6c723f..72df648e45 100644 --- a/tests/test_page.py +++ b/tests/test_page.py @@ -1131,9 +1131,9 @@ def test_merge_page_resources_smoke_test(): # use these keys for some "operations", to validate renaming # (the operand name doesn't matter) contents1 = page1[NO("/Contents")] = ContentStream(None, None) - contents1.operations = [(ArrayObject(props1.keys()), "page1-contents")] + contents1.operations = [(ArrayObject(props1.keys()), b"page1-contents")] contents2 = page2[NO("/Contents")] = ContentStream(None, None) - contents2.operations = [(ArrayObject(props2.keys()), "page2-contents")] + contents2.operations = [(ArrayObject(props2.keys()), b"page2-contents")] expected_properties = { "/just1": "/just1-value", @@ -1438,3 +1438,12 @@ def test_negative_index(): 
src_abs = RESOURCE_ROOT / "git.pdf" reader = PdfReader(src_abs) assert reader.pages[0] == reader.pages[-1] + + +def test_get_contents_as_bytes(): + writer = PdfWriter(RESOURCE_ROOT / "crazyones.pdf") + co = writer.pages[0]["/Contents"][0] + expected = co.get_data() + assert writer.pages[0]._get_contents_as_bytes() == expected + writer.pages[0][NameObject("/Contents")] = writer.pages[0]["/Contents"][0] + assert writer.pages[0]._get_contents_as_bytes() == expected diff --git a/tests/test_text_extraction.py b/tests/test_text_extraction.py index 1ffa68a3e6..dcd4e6caeb 100644 --- a/tests/test_text_extraction.py +++ b/tests/test_text_extraction.py @@ -10,6 +10,7 @@ from pypdf import PdfReader, mult from pypdf._text_extraction import set_custom_rtl +from pypdf.errors import ParseError from . import get_data_from_url @@ -156,3 +157,19 @@ def test_layout_mode_type0_font_widths(): encoding="utf-8" ) assert expected == reader.pages[0].extract_text(extraction_mode="layout") + + +@pytest.mark.enable_socket() +def test_layout_mode_indirect_sequence_font_widths(): + # Cover the situation where the sequence for font widths is an IndirectObject + # ref https://github.com/py-pdf/pypdf/pull/2788 + url = "https://github.com/user-attachments/files/16491621/2788_example.pdf" + name ="2788_example.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + assert reader.pages[0].extract_text(extraction_mode="layout") == "" + url = "https://github.com/user-attachments/files/16491619/2788_example_malformed.pdf" + name = "2788_example_malformed.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + with pytest.raises(ParseError) as exc: + reader.pages[0].extract_text(extraction_mode="layout") + assert str(exc.value).startswith("Invalid font width definition") diff --git a/tests/test_utils.py b/tests/test_utils.py index 81fcf9fb47..a4ddff8831 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -109,22 +109,6 @@ def test_mark_location(): 
Path("pypdf_pdfLocation.txt").unlink() # cleanup -@pytest.mark.parametrize( - ("input_str", "expected"), - [ - ("foo", b"foo"), - ("😀", "😀".encode()), - ("‰", "‰".encode()), - ("▷", "▷".encode()), - ("世", "世".encode()), - # A multi-character string example with non-latin-1 characters: - ("😀😃", "😀😃".encode()), - ], -) -def test_b(input_str: str, expected: bytes): - assert pypdf._utils.b_(input_str) == expected - - def test_deprecate_no_replacement(): with pytest.warns(DeprecationWarning) as warn: pypdf._utils.deprecate_no_replacement("foo", removed_in="3.0.0") @@ -132,24 +116,6 @@ def test_deprecate_no_replacement(): assert warn[0].message.args[0] == error_msg -@pytest.mark.parametrize( - ("left", "up", "upleft", "expected"), - [ - (0, 0, 0, 0), - (1, 0, 0, 1), - (0, 1, 0, 1), - (0, 0, 1, 0), - (1, 2, 3, 1), - (2, 1, 3, 1), - (1, 3, 2, 2), - (3, 1, 2, 2), - (3, 2, 1, 3), - ], -) -def test_paeth_predictor(left, up, upleft, expected): - assert pypdf._utils.paeth_predictor(left, up, upleft) == expected - - @pytest.mark.parametrize( ("dat", "pos", "to_read", "expected", "expected_pos"), [ diff --git a/tests/test_workflows.py b/tests/test_workflows.py index 93bc0c9e5e..4407b8fd5e 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -391,11 +391,11 @@ def test_merge(tmp_path, url, name): { "/Author": "Unknown", "/CreationDate": "Thursday, May 06, 1999 3:56:54 PM", - "/Creator": "C:DEBÆł8", + "/Creator": r"C:\DEB\6338", "/Keywords": "", "/Producer": "Acrobat PDFWriter 3.02 for Windows", "/Subject": "", - "/Title": "C:DEBÆł8-6R.PDF", + "/Title": r"C:\DEB\6338-6R.PDF", }, ) ], diff --git a/tests/test_writer.py b/tests/test_writer.py index 9dfeffdd89..eea1c6a488 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -1290,7 +1290,7 @@ def test_attachments(): to_add = [ ("foobar.txt", b"foobarcontent"), ("foobar2.txt", b"foobarcontent2"), - ("foobar2.txt", b"2nd_foobarcontent"), + ("foobar2.txt", "2nd_foobarcontent"), ] for name, content in to_add: 
writer.add_attachment(name, content) @@ -2188,6 +2188,10 @@ def test_replace_object(): reader._replace_object(reader.pages[0].indirect_reference, reader.pages[0]) pg = PageObject.create_blank_page(writer, 1000, 1000) reader._replace_object(reader.pages[0].indirect_reference, pg) + pg = PageObject.create_blank_page(None, 1000, 1000) + pg[NameObject("/Contents")] = writer.pages[0]["/Contents"] + writer._add_object(pg) + writer.add_page(pg) def test_mime_jupyter(): @@ -2300,3 +2304,9 @@ def test_matrix_entry_in_field_annots(): auto_regenerate=False, ) assert "/Matrix" in writer.pages[0]["/Annots"][5].get_object()["/AP"]["/N"] + + +def test_set_need_appearances_writer(): + """Minimal test for coverage""" + writer = PdfWriter() + writer.set_need_appearances_writer() diff --git a/tests/test_xmp.py b/tests/test_xmp.py index f864a9df9d..6615b93c8e 100644 --- a/tests/test_xmp.py +++ b/tests/test_xmp.py @@ -7,7 +7,7 @@ import pypdf.generic import pypdf.xmp -from pypdf import PdfReader +from pypdf import PdfReader, PdfWriter from pypdf.errors import PdfReadError from . 
import get_data_from_url @@ -42,6 +42,35 @@ def test_read_xmp_metadata_samples(src): } +def test_writer_xmp_metadata_samples(): + writer = PdfWriter(SAMPLE_ROOT / "020-xmp/output_with_metadata_pymupdf.pdf") + xmp = writer.xmp_metadata + assert xmp + assert xmp.dc_contributor == [] + assert xmp.dc_creator == ["John Doe"] + assert xmp.dc_source == "Martin Thoma" # attribute node + assert xmp.dc_description == {"x-default": "This is a text"} + assert xmp.dc_date == [datetime(1990, 4, 28, 0, 0)] + assert xmp.dc_title == {"x-default": "Sample PDF with XMP Metadata"} + assert xmp.custom_properties == { + "Style": "FooBarStyle", + "other": "worlds", + "⏰": "time", + } + co = pypdf.generic.ContentStream(None, None) + co.set_data( + xmp.stream.get_data().replace( + b'dc:source="Martin Thoma"', b'dc:source="Pubpub-Zz"' + ) + ) + writer.xmp_metadata = pypdf.xmp.XmpInformation(co) + b = BytesIO() + writer.write(b) + reader = PdfReader(b) + xmp2 = reader.xmp_metadata + assert xmp2.dc_source == "Pubpub-Zz" + + @pytest.mark.parametrize( ("src", "has_xmp"), [ diff --git a/tests/test_xobject_image_helpers.py b/tests/test_xobject_image_helpers.py index 63ecebd9b4..39b7131fcd 100644 --- a/tests/test_xobject_image_helpers.py +++ b/tests/test_xobject_image_helpers.py @@ -4,8 +4,8 @@ import pytest from pypdf import PdfReader -from pypdf._xobj_image_helpers import _handle_flate -from pypdf.errors import PdfReadError +from pypdf._xobj_image_helpers import _extended_image_frombytes, _handle_flate +from pypdf.errors import EmptyImageDataError, PdfReadError from pypdf.generic import ArrayObject, DecodedStreamObject, NameObject, NumberObject from . 
import get_data_from_url @@ -113,3 +113,12 @@ def test_handle_flate__image_mode_1(): colors=2, obj_as_text="dummy", ) + + +def test_extended_image_frombytes_zero_data(): + mode = "RGB" + size = (1, 1) + data = b"" + + with pytest.raises(EmptyImageDataError, match="Data is 0 bytes, cannot process an image from empty data."): + _extended_image_frombytes(mode, size, data)