From 4bd54bd32d525ec41e23960146fbb3b0418a4715 Mon Sep 17 00:00:00 2001 From: Stefan <96178532+stefan6419846@users.noreply.github.com> Date: Sun, 28 Jul 2024 17:16:57 +0200 Subject: [PATCH 01/16] DEV: Test against Python 3.13 (#2776) * DEV: Test against Python 3.13 * fix typo * add missing setup-python * fix another typo * update Pillow version * attempt to update coverage package * update number of expected coverage files --- .github/workflows/github-ci.yaml | 10 +++++----- requirements/ci-3.11.txt | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/github-ci.yaml b/.github/workflows/github-ci.yaml index 820ccdcaa5..1eb3d9bd03 100644 --- a/.github/workflows/github-ci.yaml +++ b/.github/workflows/github-ci.yaml @@ -57,7 +57,7 @@ jobs: runs-on: ubuntu-20.04 strategy: matrix: - python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"] + python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12", "3.13-dev"] use-crypto-lib: ["cryptography"] include: - python-version: "3.7" @@ -90,7 +90,7 @@ jobs: cache-dependency-path: '**/requirements/ci.txt' - name: Setup Python (3.11+) uses: actions/setup-python@v5 - if: matrix.python-version == '3.11' || matrix.python-version == '3.12' + if: matrix.python-version == '3.11' || matrix.python-version == '3.12' || matrix.python-version == '3.13-dev' with: python-version: ${{ matrix.python-version }} allow-prereleases: true @@ -106,7 +106,7 @@ jobs: - name: Install requirements (Python 3.11+) run: | pip install -r requirements/ci-3.11.txt - if: matrix.python-version == '3.11' || matrix.python-version == '3.12' + if: matrix.python-version == '3.11' || matrix.python-version == '3.12' || matrix.python-version == '3.13-dev' - name: Remove pycryptodome and cryptography run: | pip uninstall pycryptodome cryptography -y @@ -215,8 +215,8 @@ jobs: - name: Check Number of Downloaded Files run: | downloaded_files_count=$(find \.coverage* -type f | wc -l) - if [ $downloaded_files_count -eq 8 ]; then - echo "The expected number of files (8) were downloaded." + if [ $downloaded_files_count -eq 9 ]; then + echo "The expected number of files (9) were downloaded." else echo "ERROR: Expected 8 files, but found $downloaded_files_count files." exit 1 diff --git a/requirements/ci-3.11.txt b/requirements/ci-3.11.txt index f382fe2b94..2101771181 100644 --- a/requirements/ci-3.11.txt +++ b/requirements/ci-3.11.txt @@ -6,7 +6,7 @@ # attrs==23.1.0 # via flake8-bugbear -coverage[toml]==7.3.0 +coverage[toml]==7.6.0 # via # -r requirements/ci.in # pytest-cov @@ -35,7 +35,7 @@ mypy-extensions==1.0.0 # via mypy packaging==23.1 # via pytest -pillow==10.0.1 +pillow==10.4.0 # via # -r requirements/ci.in # fpdf2 From d4df20d14cb6a2839c1ab141b51e70652fb3d1f1 Mon Sep 17 00:00:00 2001 From: j-t-1 <120829237+j-t-1@users.noreply.github.com> Date: Wed, 31 Jul 2024 10:46:08 +0100 Subject: [PATCH 02/16] STY: Remove boolean value comparison (#2779) PEP 8 recommendation. --- pypdf/annotations/_markup_annotations.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pypdf/annotations/_markup_annotations.py b/pypdf/annotations/_markup_annotations.py index 4db8dfdbf0..98a222483b 100644 --- a/pypdf/annotations/_markup_annotations.py +++ b/pypdf/annotations/_markup_annotations.py @@ -104,9 +104,9 @@ def __init__( self[NameObject("/Rect")] = RectangleObject(rect) font_str = "font: " - if bold is True: + if bold: font_str = f"{font_str}bold " - if italic is True: + if italic: font_str = f"{font_str}italic " font_str = f"{font_str}{font} {font_size}" font_str = f"{font_str};text-align:left;color:#{font_color}" From 3ad9234c2ec08e7cd6a8b2ec962386eda394d76d Mon Sep 17 00:00:00 2001 From: "William G. Gagnon" Date: Fri, 2 Aug 2024 11:21:53 -0400 Subject: [PATCH 03/16] ROB: Handle images with empty data when processing an image from bytes (#2786) Closes #2783. --- CONTRIBUTORS.md | 1 + pypdf/_xobj_image_helpers.py | 9 ++++++--- pypdf/errors.py | 4 ++++ tests/test_xobject_image_helpers.py | 13 +++++++++++-- 4 files changed, 22 insertions(+), 5 deletions(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 84f0b6ee43..89fec3b14e 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -19,6 +19,7 @@ history and [GitHub's 'Contributors' feature](https://github.com/py-pdf/pypdf/gr * [ediamondscience](https://github.com/ediamondscience) * [Ermeson, Felipe](https://github.com/FelipeErmeson) * [Freitag, François](https://github.com/francoisfreitag) +* [Gagnon, William G.](https://github.com/williamgagnon) * [Górny, Michał](https://github.com/mgorny) * [Grillo, Miguel](https://github.com/Ineffable22) * [Gutteridge, David H.](https://github.com/dhgutteridge) diff --git a/pypdf/_xobj_image_helpers.py b/pypdf/_xobj_image_helpers.py index 45b0c145be..5ae8894fa3 100644 --- a/pypdf/_xobj_image_helpers.py +++ b/pypdf/_xobj_image_helpers.py @@ -6,7 +6,7 @@ from ._utils import check_if_whitespace_only, logger_warning from .constants import ColorSpaces -from .errors import PdfReadError +from .errors import EmptyImageDataError, PdfReadError from .generic import ( ArrayObject, DecodedStreamObject, @@ -148,9 +148,12 @@ def _extended_image_frombytes( img = Image.frombytes(mode, size, data) except ValueError as exc: nb_pix = size[0] * size[1] - if len(data) % nb_pix != 0: + data_length = len(data) + if data_length == 0: + raise EmptyImageDataError("Data is 0 bytes, cannot process an image from empty data.") from exc + if data_length % nb_pix != 0: raise exc - k = nb_pix * len(mode) / len(data) + k = nb_pix * len(mode) / data_length data = b"".join([bytes((x,) * int(k)) for x in data]) img = Image.frombytes(mode, size, data) return img diff --git a/pypdf/errors.py b/pypdf/errors.py index c962dec662..ad197ffc11 100644 --- a/pypdf/errors.py +++ b/pypdf/errors.py @@ -59,4 +59,8 @@ class EmptyFileError(PdfReadError): """Raised when a PDF file is empty or has no content.""" +class EmptyImageDataError(PyPdfError): + """Raised when trying to process an image that has no data.""" + + STREAM_TRUNCATED_PREMATURELY = "Stream has ended unexpectedly" diff --git a/tests/test_xobject_image_helpers.py b/tests/test_xobject_image_helpers.py index 63ecebd9b4..39b7131fcd 100644 --- a/tests/test_xobject_image_helpers.py +++ b/tests/test_xobject_image_helpers.py @@ -4,8 +4,8 @@ import pytest from pypdf import PdfReader -from pypdf._xobj_image_helpers import _handle_flate -from pypdf.errors import PdfReadError +from pypdf._xobj_image_helpers import _extended_image_frombytes, _handle_flate +from pypdf.errors import EmptyImageDataError, PdfReadError from pypdf.generic import ArrayObject, DecodedStreamObject, NameObject, NumberObject from . import get_data_from_url @@ -113,3 +113,12 @@ def test_handle_flate__image_mode_1(): colors=2, obj_as_text="dummy", ) + + +def test_extended_image_frombytes_zero_data(): + mode = "RGB" + size = (1, 1) + data = b"" + + with pytest.raises(EmptyImageDataError, match="Data is 0 bytes, cannot process an image from empty data."): + _extended_image_frombytes(mode, size, data) From 582557e09a7e658fdcb19f26eb069d87875489f0 Mon Sep 17 00:00:00 2001 From: Diogo Teles Sant'Anna Date: Fri, 2 Aug 2024 15:49:29 -0300 Subject: [PATCH 04/16] SEC: Fix GitHub workflow vulnerable to script injection (#2787) Signed-off-by: Diogo Teles Sant'Anna --- .github/workflows/release.yaml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 9f782ec080..b1a4fb27f3 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -12,6 +12,9 @@ on: permissions: contents: write +env: + HEAD_COMMIT_MESSAGE: ${{ github.event.head_commit.message }} + jobs: build_and_publish: name: Publish a new version @@ -24,7 +27,7 @@ jobs: - name: Extract version from commit message id: extract_version run: | - VERSION=$(echo "${{ github.event.head_commit.message }}" | grep -oP '(?<=REL: )\d+\.\d+\.\d+') + VERSION=$(echo "$HEAD_COMMIT_MESSAGE" | grep -oP '(?<=REL: )\d+\.\d+\.\d+') echo "version=$VERSION" >> $GITHUB_OUTPUT - name: Extract tag message from commit message @@ -32,7 +35,7 @@ jobs: run: | VERSION="${{ steps.extract_version.outputs.version }}" delimiter="$(openssl rand -hex 8)" - MESSAGE=$(echo "${{ github.event.head_commit.message }}" | sed "0,/REL: $VERSION/s///" ) + MESSAGE=$(echo "$HEAD_COMMIT_MESSAGE" | sed "0,/REL: $VERSION/s///" ) echo "message<<${delimiter}" >> $GITHUB_OUTPUT echo "$MESSAGE" >> $GITHUB_OUTPUT echo "${delimiter}" >> $GITHUB_OUTPUT From 38f3925502c2971ad587fb616500b6f8b6333d03 Mon Sep 17 00:00:00 2001 From: j-t-1 <120829237+j-t-1@users.noreply.github.com> Date: Mon, 5 Aug 2024 09:10:47 +0100 Subject: [PATCH 05/16] MAINT: Remove unused paeth_predictor (#2773) --- pypdf/_utils.py | 14 -------------- tests/test_utils.py | 18 ------------------ 2 files changed, 32 deletions(-) diff --git a/pypdf/_utils.py b/pypdf/_utils.py index 38c0d67d7a..6569707b66 100644 --- a/pypdf/_utils.py +++ b/pypdf/_utils.py @@ -390,20 +390,6 @@ def ord_(b: Union[int, str, bytes]) -> Union[int, bytes]: WHITESPACES_AS_REGEXP = b"[" + WHITESPACES_AS_BYTES + b"]" -def paeth_predictor(left: int, up: int, up_left: int) -> int: - p = left + up - up_left - dist_left = abs(p - left) - dist_up = abs(p - up) - dist_up_left = abs(p - up_left) - - if dist_left <= dist_up and dist_left <= dist_up_left: - return left - elif dist_up <= dist_up_left: - return up - else: - return up_left - - def deprecate(msg: str, stacklevel: int = 3) -> None: warnings.warn(msg, DeprecationWarning, stacklevel=stacklevel) diff --git a/tests/test_utils.py b/tests/test_utils.py index 81fcf9fb47..856bedd863 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -132,24 +132,6 @@ def test_deprecate_no_replacement(): assert warn[0].message.args[0] == error_msg -@pytest.mark.parametrize( - ("left", "up", "upleft", "expected"), - [ - (0, 0, 0, 0), - (1, 0, 0, 1), - (0, 1, 0, 1), - (0, 0, 1, 0), - (1, 2, 3, 1), - (2, 1, 3, 1), - (1, 3, 2, 2), - (3, 1, 2, 2), - (3, 2, 1, 3), - ], -) -def test_paeth_predictor(left, up, upleft, expected): - assert pypdf._utils.paeth_predictor(left, up, upleft) == expected - - @pytest.mark.parametrize( ("dat", "pos", "to_read", "expected", "expected_pos"), [ From 09f9b7ed52193bfd9e98bdd018ccaf7cbe821687 Mon Sep 17 00:00:00 2001 From: j-t-1 <120829237+j-t-1@users.noreply.github.com> Date: Mon, 5 Aug 2024 16:49:29 +0100 Subject: [PATCH 06/16] MAINT: Remove unused AnnotationFlag --- pypdf/annotations/_non_markup_annotations.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pypdf/annotations/_non_markup_annotations.py b/pypdf/annotations/_non_markup_annotations.py index dcdb3b0ff8..6272cceee6 100644 --- a/pypdf/annotations/_non_markup_annotations.py +++ b/pypdf/annotations/_non_markup_annotations.py @@ -1,6 +1,5 @@ from typing import TYPE_CHECKING, Any, Optional, Tuple, Union -from ..constants import AnnotationFlag from ..generic._base import ( BooleanObject, NameObject, @@ -12,8 +11,6 @@ from ..generic._rectangle import RectangleObject from ._base import AnnotationDictionary -DEFAULT_ANNOTATION_FLAG = AnnotationFlag(0) - class Link(AnnotationDictionary): def __init__( From b2d72043ab5221b58138c7d06c181b8cbc88ea8e Mon Sep 17 00:00:00 2001 From: owurman Date: Mon, 5 Aug 2024 12:14:18 -0700 Subject: [PATCH 07/16] BUG: Handle Sequence as an IndirectObject when extracting text with layout mode (#2788) * Handle Sequence as an IndirectObject The spec allows an int or float to be an IndirectObject as well, but this commit does not address that theoretical possibility. * Update pypdf/_text_extraction/_layout_mode/_font.py Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com> * Address PR comments -Rename w_1 to w_next_entry -Utilize ParseError instead of PdfReadError -Write a test (both positive and negative) * Handle unlikely case of IndirectObjects for float/int width elements Also adds a comment to clarify that we don't explicitly handle the IndexError exception. Rather, we let it be raised as an IndexError. * Yoda condition I removed * Last commit was a bad patch, confused by non-committed changes * Use test files from URL rather than resources * Update tests/test_text_extraction.py Co-authored-by: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> * Fix code style warnings in range() call --------- Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com> Co-authored-by: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> --- pypdf/_text_extraction/_layout_mode/_font.py | 26 +++++++++++++------- tests/test_text_extraction.py | 17 +++++++++++++ 2 files changed, 34 insertions(+), 9 deletions(-) diff --git a/pypdf/_text_extraction/_layout_mode/_font.py b/pypdf/_text_extraction/_layout_mode/_font.py index a912fddb27..40655b1b22 100644 --- a/pypdf/_text_extraction/_layout_mode/_font.py +++ b/pypdf/_text_extraction/_layout_mode/_font.py @@ -1,8 +1,9 @@ """Font constants and classes for "layout" mode text operations""" from dataclasses import dataclass, field -from typing import Any, Dict, Sequence, Union +from typing import Any, Dict, Sequence, Union, cast +from ...errors import ParseError from ...generic import IndirectObject from ._font_widths import STANDARD_WIDTHS @@ -58,6 +59,7 @@ def __post_init__(self) -> None: skip_count = 0 _w = d_font.get("/W", []) for idx, w_entry in enumerate(_w): + w_entry = w_entry.get_object() if skip_count: skip_count -= 1 continue @@ -66,13 +68,14 @@ def __post_init__(self) -> None: # warning and or use reader's "strict" to force an ex??? continue # check for format (1): `int [int int int int ...]` - if isinstance(_w[idx + 1], Sequence): - start_idx, width_list = _w[idx : idx + 2] + w_next_entry = _w[idx + 1].get_object() + if isinstance(w_next_entry, Sequence): + start_idx, width_list = w_entry, w_next_entry self.width_map.update( { ord_map[_cidx]: _width for _cidx, _width in zip( - range(start_idx, start_idx + len(width_list), 1), + range(cast(int, start_idx), cast(int, start_idx) + len(width_list), 1), width_list, ) if _cidx in ord_map @@ -80,18 +83,23 @@ def __post_init__(self) -> None: ) skip_count = 1 # check for format (2): `int int int` - if not isinstance(_w[idx + 1], Sequence) and not isinstance( - _w[idx + 2], Sequence - ): - start_idx, stop_idx, const_width = _w[idx : idx + 3] + elif isinstance(w_next_entry, (int, float)) and isinstance(_w[idx + 2].get_object(), (int, float)): + start_idx, stop_idx, const_width = w_entry, w_next_entry, _w[idx + 2].get_object() self.width_map.update( { ord_map[_cidx]: const_width - for _cidx in range(start_idx, stop_idx + 1, 1) + for _cidx in range(cast(int, start_idx), cast(int, stop_idx + 1), 1) if _cidx in ord_map } ) skip_count = 2 + else: + # Note: this doesn't handle the case of out of bounds (reaching the end of the width definitions + # while expecting more elements). This raises an IndexError which is sufficient. + raise ParseError( + f"Invalid font width definition. Next elements: {w_entry}, {w_next_entry}, {_w[idx + 2]}" + ) # pragma: no cover + if not self.width_map and "/BaseFont" in self.font_dictionary: for key in STANDARD_WIDTHS: if self.font_dictionary["/BaseFont"].startswith(f"/{key}"): diff --git a/tests/test_text_extraction.py b/tests/test_text_extraction.py index 1ffa68a3e6..dcd4e6caeb 100644 --- a/tests/test_text_extraction.py +++ b/tests/test_text_extraction.py @@ -10,6 +10,7 @@ from pypdf import PdfReader, mult from pypdf._text_extraction import set_custom_rtl +from pypdf.errors import ParseError from . import get_data_from_url @@ -156,3 +157,19 @@ def test_layout_mode_type0_font_widths(): encoding="utf-8" ) assert expected == reader.pages[0].extract_text(extraction_mode="layout") + + +@pytest.mark.enable_socket() +def test_layout_mode_indirect_sequence_font_widths(): + # Cover the situation where the sequence for font widths is an IndirectObject + # ref https://github.com/py-pdf/pypdf/pull/2788 + url = "https://github.com/user-attachments/files/16491621/2788_example.pdf" + name ="2788_example.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + assert reader.pages[0].extract_text(extraction_mode="layout") == "" + url = "https://github.com/user-attachments/files/16491619/2788_example_malformed.pdf" + name = "2788_example_malformed.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + with pytest.raises(ParseError) as exc: + reader.pages[0].extract_text(extraction_mode="layout") + assert str(exc.value).startswith("Invalid font width definition") From 5abd590740a2718fc69b8477c656ce5515a0ab33 Mon Sep 17 00:00:00 2001 From: j-t-1 <120829237+j-t-1@users.noreply.github.com> Date: Wed, 7 Aug 2024 12:14:17 +0100 Subject: [PATCH 08/16] STY: Refactor b_ (#2772) --- pypdf/_utils.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/pypdf/_utils.py b/pypdf/_utils.py index 6569707b66..5fecb38e7d 100644 --- a/pypdf/_utils.py +++ b/pypdf/_utils.py @@ -347,14 +347,11 @@ def b_(s: Union[str, bytes]) -> bytes: return bc[s] try: r = s.encode("latin-1") - if len(s) < 2: - bc[s] = r - return r - except Exception: + except UnicodeEncodeError: r = s.encode("utf-8") - if len(s) < 2: - bc[s] = r - return r + if len(s) < 2: + bc[s] = r + return r def str_(b: Any) -> str: From 1f83d529fd8ff4dad5c79320d92f87d90b42d1c6 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Thu, 8 Aug 2024 15:22:30 +0200 Subject: [PATCH 09/16] MNT: Remove b_ and str_ closes #2726 superseed proposal #2791 --- pypdf/_cmap.py | 7 +- pypdf/_doc_common.py | 3 +- pypdf/_encryption.py | 6 +- pypdf/_merger.py | 10 ++- pypdf/_page.py | 16 ++-- pypdf/_reader.py | 5 +- pypdf/_utils.py | 34 ++----- pypdf/_writer.py | 6 +- pypdf/filters.py | 30 +++---- pypdf/generic/_base.py | 43 +++++---- pypdf/generic/_data_structures.py | 35 ++++---- pypdf/generic/_utils.py | 83 +++++++++++------- ..._Vicksburg_Sample_OCR-crazyones-merged.pdf | Bin 217096 -> 217093 bytes tests/test_page.py | 4 +- tests/test_utils.py | 16 ---- tests/test_workflows.py | 4 +- 16 files changed, 147 insertions(+), 155 deletions(-) diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index 9a2d10a611..0859aab308 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -3,7 +3,7 @@ from typing import Any, Dict, List, Tuple, Union, cast from ._codecs import adobe_glyphs, charset_encoding -from ._utils import b_, logger_error, logger_warning +from ._utils import logger_error, logger_warning from .generic import ( DecodedStreamObject, DictionaryObject, @@ -258,7 +258,8 @@ def prepare_cm(ft: DictionaryObject) -> bytes: tu = ft["/ToUnicode"] cm: bytes if isinstance(tu, StreamObject): - cm = b_(cast(DecodedStreamObject, ft["/ToUnicode"]).get_data()) + cm = cast(DecodedStreamObject, ft["/ToUnicode"]).get_data() + assert isinstance(cm, bytes) elif isinstance(tu, str) and tu.startswith("/Identity"): # the full range 0000-FFFF will be processed cm = b"beginbfrange\n<0000> <0001> <0000>\nendbfrange" @@ -474,7 +475,7 @@ def compute_space_width( obj = sp_width.get_object() if obj is None or isinstance(obj, NullObject): return 0.0 - return obj # type: ignore + return obj return sp_width diff --git a/pypdf/_doc_common.py b/pypdf/_doc_common.py index d4c5c43c3c..ffbdb7882e 100644 --- a/pypdf/_doc_common.py +++ b/pypdf/_doc_common.py @@ -49,7 +49,6 @@ from ._page import PageObject, _VirtualList from ._page_labels import index2label as page_index2page_label from ._utils import ( - b_, deprecate_with_replacement, logger_warning, parse_iso8824_date, @@ -1258,7 +1257,7 @@ def xfa(self) -> Optional[Dict[str, Any]]: if isinstance(f, IndirectObject): field = cast(Optional[EncodedStreamObject], f.get_object()) if field: - es = zlib.decompress(b_(field._data)) + es = zlib.decompress(field._data) retval[tag] = es return retval diff --git a/pypdf/_encryption.py b/pypdf/_encryption.py index 5ddd8d0efe..e5cdd9324e 100644 --- a/pypdf/_encryption.py +++ b/pypdf/_encryption.py @@ -43,7 +43,7 @@ rc4_encrypt, ) -from ._utils import b_, logger_warning +from ._utils import logger_warning from .generic import ( ArrayObject, ByteStringObject, @@ -78,7 +78,7 @@ def encrypt_object(self, obj: PdfObject) -> PdfObject: elif isinstance(obj, StreamObject): obj2 = StreamObject() obj2.update(obj) - obj2.set_data(self.stm_crypt.encrypt(b_(obj._data))) + obj2.set_data(self.stm_crypt.encrypt(obj._data)) for key, value in obj.items(): # Dont forget the Stream dict. obj2[key] = self.encrypt_object(value) obj = obj2 @@ -96,7 +96,7 @@ def decrypt_object(self, obj: PdfObject) -> PdfObject: data = self.str_crypt.decrypt(obj.original_bytes) obj = create_string_object(data) elif isinstance(obj, StreamObject): - obj._data = self.stm_crypt.decrypt(b_(obj._data)) + obj._data = self.stm_crypt.decrypt(obj._data) for key, value in obj.items(): # Dont forget the Stream dict. obj[key] = self.decrypt_object(value) elif isinstance(obj, DictionaryObject): diff --git a/pypdf/_merger.py b/pypdf/_merger.py index 7176a1adf7..a077cca7f4 100644 --- a/pypdf/_merger.py +++ b/pypdf/_merger.py @@ -46,7 +46,6 @@ from ._utils import ( StrByteType, deprecate_with_replacement, - str_, ) from ._writer import PdfWriter from .constants import GoToActionArguments, TypArguments, TypFitArguments @@ -82,6 +81,15 @@ def __init__(self, pagedata: PageObject, src: PdfReader, id: int) -> None: self.id = id +# transfered from _utils : as this function is only required here +# and merger will be soon deprecated +def str_(b: Any) -> str: + if isinstance(b, bytes): + return b.decode("latin-1") + else: + return str(b) # will return b.__str__() if defined + + class PdfMerger: """ Use :class:`PdfWriter` instead. diff --git a/pypdf/_page.py b/pypdf/_page.py index 63038d9d07..8abd66d2f2 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -852,7 +852,7 @@ def _add_transformation_matrix( FloatObject(e), FloatObject(f), ], - " cm", + b"cm", ], ) return contents @@ -870,7 +870,7 @@ def _get_contents_as_bytes(self) -> Optional[bytes]: if isinstance(obj, list): return b"".join(x.get_object().get_data() for x in obj) else: - return cast(bytes, cast(EncodedStreamObject, obj).get_data()) + return cast(EncodedStreamObject, obj).get_data() else: return None @@ -1063,11 +1063,11 @@ def _merge_page( rect.height, ], ), - "re", + b"re", ), ) - page2content.operations.insert(1, ([], "W")) - page2content.operations.insert(2, ([], "n")) + page2content.operations.insert(1, ([], b"W")) + page2content.operations.insert(2, ([], b"n")) if page2transformation is not None: page2content = page2transformation(page2content) page2content = PageObject._content_stream_rename( @@ -1201,11 +1201,11 @@ def _merge_page_writer( rect.height, ], ), - "re", + b"re", ), ) - page2content.operations.insert(1, ([], "W")) - page2content.operations.insert(2, ([], "n")) + page2content.operations.insert(1, ([], b"W")) + page2content.operations.insert(2, ([], b"n")) if page2transformation is not None: page2content = page2transformation(page2content) page2content = PageObject._content_stream_rename( diff --git a/pypdf/_reader.py b/pypdf/_reader.py index aeababa7b7..7c084107c5 100644 --- a/pypdf/_reader.py +++ b/pypdf/_reader.py @@ -51,7 +51,6 @@ from ._utils import ( StrByteType, StreamType, - b_, logger_warning, read_non_whitespace, read_previous_line, @@ -328,7 +327,7 @@ def _get_object_from_stream( assert cast(str, obj_stm["/Type"]) == "/ObjStm" # /N is the number of indirect objects in the stream assert idx < obj_stm["/N"] - stream_data = BytesIO(b_(obj_stm.get_data())) + stream_data = BytesIO(obj_stm.get_data()) for i in range(obj_stm["/N"]): # type: ignore read_non_whitespace(stream_data) stream_data.seek(-1, 1) @@ -932,7 +931,7 @@ def _read_pdf15_xref_stream( xrefstream = cast(ContentStream, read_object(stream, self)) assert cast(str, xrefstream["/Type"]) == "/XRef" self.cache_indirect_object(generation, idnum, xrefstream) - stream_data = BytesIO(b_(xrefstream.get_data())) + stream_data = BytesIO(xrefstream.get_data()) # Index pairs specify the subsections in the dictionary. If # none create one subsection that spans everything. idx_pairs = xrefstream.get("/Index", [0, xrefstream.get("/Size")]) diff --git a/pypdf/_utils.py b/pypdf/_utils.py index 5fecb38e7d..94d45cf6d7 100644 --- a/pypdf/_utils.py +++ b/pypdf/_utils.py @@ -336,31 +336,6 @@ def mark_location(stream: StreamType) -> None: stream.seek(-radius, 1) -B_CACHE: Dict[str, bytes] = {} - - -def b_(s: Union[str, bytes]) -> bytes: - if isinstance(s, bytes): - return s - bc = B_CACHE - if s in bc: - return bc[s] - try: - r = s.encode("latin-1") - except UnicodeEncodeError: - r = s.encode("utf-8") - if len(s) < 2: - bc[s] = r - return r - - -def str_(b: Any) -> str: - if isinstance(b, bytes): - return b.decode("latin-1") - else: - return str(b) # will return b.__str__() if defined - - @overload def ord_(b: str) -> int: ... @@ -397,12 +372,17 @@ def deprecation(msg: str) -> None: def deprecate_with_replacement(old_name: str, new_name: str, removed_in: str) -> None: """Raise an exception that a feature will be removed, but has a replacement.""" - deprecate(f"{old_name} is deprecated and will be removed in pypdf {removed_in}. Use {new_name} instead.", 4) + deprecate( + f"{old_name} is deprecated and will be removed in pypdf {removed_in}. Use {new_name} instead.", + 4, + ) def deprecation_with_replacement(old_name: str, new_name: str, removed_in: str) -> None: """Raise an exception that a feature was already removed, but has a replacement.""" - deprecation(f"{old_name} is deprecated and was removed in pypdf {removed_in}. Use {new_name} instead.") + deprecation( + f"{old_name} is deprecated and was removed in pypdf {removed_in}. Use {new_name} instead." + ) def deprecate_no_replacement(name: str, removed_in: str) -> None: diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 00b9d498c0..d73c00e3d4 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -62,7 +62,6 @@ StrByteType, StreamType, _get_max_pdf_version_header, - b_, deprecate_with_replacement, logger_warning, ) @@ -678,9 +677,10 @@ def add_attachment(self, filename: str, data: Union[str, bytes]) -> None: # Hello world! # endstream # endobj - + if isinstance(data, str): + data = data.encode("latin-1") file_entry = DecodedStreamObject() - file_entry.set_data(b_(data)) + file_entry.set_data(data) file_entry.update({NameObject(PA.TYPE): NameObject("/EmbeddedFile")}) # The Filespec entry diff --git a/pypdf/filters.py b/pypdf/filters.py index 137e3603a3..d14b765801 100644 --- a/pypdf/filters.py +++ b/pypdf/filters.py @@ -43,7 +43,6 @@ from ._utils import ( WHITESPACES_AS_BYTES, - b_, deprecate_with_replacement, deprecation_no_replacement, logger_warning, @@ -376,20 +375,18 @@ class LZWDecode: """ Taken from: - http://www.java2s.com/Open-Source/Java-Document/PDF/PDF- - Renderer/com/sun/pdfview/decode/LZWDecode.java.htm + http://www.java2s.com/Open-Source/Java-Document/PDF/PDF-Renderer/com/sun/pdfview/decode/LZWDecode.java.htm """ class Decoder: + STOP = 257 + CLEARDICT = 256 + def __init__(self, data: bytes) -> None: - self.STOP = 257 - self.CLEARDICT = 256 self.data = data self.bytepos = 0 self.bitpos = 0 - self.dict = [""] * 4096 - for i in range(256): - self.dict[i] = chr(i) + self.dict = [struct.pack("B", i) for i in range(256)] + [b""] * (4096 - 256) self.reset_dict() def reset_dict(self) -> None: @@ -416,7 +413,7 @@ def next_code(self) -> int: self.bytepos = self.bytepos + 1 return value - def decode(self) -> str: + def decode(self) -> bytes: """ TIFF 6.0 specification explains in sufficient details the steps to implement the LZW encode() and decode() algorithms. @@ -429,7 +426,7 @@ def decode(self) -> str: PdfReadError: If the stop code is missing """ cW = self.CLEARDICT - baos = "" + baos = b"" while True: pW = cW cW = self.next_code() @@ -444,11 +441,11 @@ def decode(self) -> str: else: if cW < self.dictlen: baos += self.dict[cW] - p = self.dict[pW] + self.dict[cW][0] + p = self.dict[pW] + self.dict[cW][0:1] self.dict[self.dictlen] = p self.dictlen += 1 else: - p = self.dict[pW] + self.dict[pW][0] + p = self.dict[pW] + self.dict[pW][0:1] baos += p self.dict[self.dictlen] = p self.dictlen += 1 @@ -464,7 +461,7 @@ def decode( data: bytes, decode_parms: Optional[DictionaryObject] = None, **kwargs: Any, - ) -> str: + ) -> bytes: """ Decode an LZW encoded data stream. @@ -476,7 +473,6 @@ def decode( decoded data. """ # decode_parms is unused here - return LZWDecode.Decoder(data).decode() @@ -651,7 +647,7 @@ def decode( return tiff_header + data -def decode_stream_data(stream: Any) -> Union[bytes, str]: # utils.StreamObject +def decode_stream_data(stream: Any) -> bytes: # utils.StreamObject """ Decode the stream data based on the specified filters. @@ -678,7 +674,7 @@ def decode_stream_data(stream: Any) -> Union[bytes, str]: # utils.StreamObject decodparms = stream.get(SA.DECODE_PARMS, ({},) * len(filters)) if not isinstance(decodparms, (list, tuple)): decodparms = (decodparms,) - data: bytes = b_(stream._data) + data: bytes = stream._data # If there is not data to decode we should not try to decode the data. if data: for filter_type, params in zip(filters, decodparms): @@ -691,7 +687,7 @@ def decode_stream_data(stream: Any) -> Union[bytes, str]: # utils.StreamObject elif filter_type in (FT.RUN_LENGTH_DECODE, FTA.RL): data = RunLengthDecode.decode(data) elif filter_type in (FT.LZW_DECODE, FTA.LZW): - data = LZWDecode.decode(data, params) # type: ignore + data = LZWDecode.decode(data, params) elif filter_type in (FT.ASCII_85_DECODE, FTA.A85): data = ASCII85Decode.decode(data) elif filter_type == FT.DCT_DECODE: diff --git a/pypdf/generic/_base.py b/pypdf/generic/_base.py index 2d606b4184..4e1cf1bca0 100644 --- a/pypdf/generic/_base.py +++ b/pypdf/generic/_base.py @@ -30,18 +30,17 @@ import re from binascii import unhexlify from math import log10 +from struct import iter_unpack from typing import Any, Callable, ClassVar, Dict, Optional, Sequence, Union, cast from .._codecs import _pdfdoc_encoding_rev from .._protocols import PdfObjectProtocol, PdfWriterProtocol from .._utils import ( StreamType, - b_, deprecate_no_replacement, logger_warning, read_non_whitespace, read_until_regex, - str_, ) from ..errors import STREAM_TRUNCATED_PREMATURELY, PdfReadError, PdfStreamError @@ -308,6 +307,14 @@ def __getitem__(self, key: Any) -> Any: # items should be extracted from pointed Object return self._get_object_with_check()[key] # type: ignore + def __float__(self) -> str: + # in this case we are looking for the pointed data + return self.get_object().__float__() # type: ignore + + def __int__(self) -> str: + # in this case we are looking for the pointed data + return self.get_object().__int__() # type: ignore + def __str__(self) -> str: # in this case we are looking for the pointed data return self.get_object().__str__() @@ -369,10 +376,10 @@ def read_from_stream(stream: StreamType, pdf: Any) -> "IndirectObject": # PdfRe class FloatObject(float, PdfObject): def __new__( - cls, value: Union[str, Any] = "0.0", context: Optional[Any] = None + cls, value: Any = "0.0", context: Optional[Any] = None ) -> "FloatObject": try: - value = float(str_(value)) + value = float(value) return float.__new__(cls, value) except Exception as e: # If this isn't a valid decimal (happens in malformed PDFs) @@ -599,15 +606,16 @@ def write_to_stream( ) bytearr = self.get_encoded_bytes() stream.write(b"(") - for c in bytearr: - if not chr(c).isalnum() and c != b" ": + for c_ in iter_unpack("c", bytearr): + c = cast(bytes, c_[0]) + if not c.isalnum() and c != b" ": # This: # stream.write(rf"\{c:0>3o}".encode()) # gives # https://github.com/davidhalter/parso/issues/207 - stream.write(("\\%03o" % c).encode()) + stream.write(b"\\%03o" % ord(c)) else: - stream.write(b_(chr(c))) + stream.write(c) stream.write(b")") @@ -710,12 +718,13 @@ def read_from_stream(stream: StreamType, pdf: Any) -> "NameObject": # PdfReader def encode_pdfdocencoding(unicode_string: str) -> bytes: - retval = bytearray() - for c in unicode_string: - try: - retval += b_(chr(_pdfdoc_encoding_rev[c])) - except KeyError: - raise UnicodeEncodeError( - "pdfdocencoding", c, -1, -1, "does not exist in translation table" - ) - return bytes(retval) + try: + return bytes([_pdfdoc_encoding_rev[k] for k in unicode_string]) + except KeyError: + raise UnicodeEncodeError( + "pdfdocencoding", + unicode_string, + -1, + -1, + "does not exist in translation table", + ) diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index 87d6886742..399836be5f 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -52,7 +52,6 @@ from .._utils import ( WHITESPACES, StreamType, - b_, deprecate_no_replacement, deprecate_with_replacement, logger_warning, @@ -843,7 +842,7 @@ def _reset_node_tree_relationship(child_obj: Any) -> None: class StreamObject(DictionaryObject): def __init__(self) -> None: - self._data: Union[bytes, str] = b"" + self._data: bytes = b"" self.decoded_self: Optional[DecodedStreamObject] = None def _clone( @@ -877,7 +876,7 @@ def _clone( pass super()._clone(src, pdf_dest, force_duplicate, ignore_fields, visited) - def get_data(self) -> Union[bytes, str]: + def get_data(self) -> bytes: return self._data def set_data(self, data: bytes) -> None: @@ -885,7 +884,7 @@ def set_data(self, data: bytes) -> None: def hash_value_data(self) -> bytes: data = super().hash_value_data() - data += b_(self._data) + data += self._data return data def write_to_stream( @@ -955,7 +954,7 @@ def flate_encode(self, level: int = -1) -> "EncodedStreamObject": retval[NameObject(SA.FILTER)] = f if params is not None: retval[NameObject(SA.DECODE_PARMS)] = params - retval._data = FlateDecode.encode(b_(self._data), level) + retval._data = FlateDecode.encode(self._data, level) return retval def decode_as_image(self) -> Any: @@ -993,7 +992,7 @@ def __init__(self) -> None: self.decoded_self: Optional[DecodedStreamObject] = None # This overrides the parent method: - def get_data(self) -> Union[bytes, str]: + def get_data(self) -> bytes: from ..filters import decode_stream_data if self.decoded_self is not None: @@ -1003,7 +1002,7 @@ def get_data(self) -> Union[bytes, str]: # create decoded object decoded = DecodedStreamObject() - decoded.set_data(b_(decode_stream_data(self))) + decoded.set_data(decode_stream_data(self)) for key, value in list(self.items()): if key not in (SA.LENGTH, SA.FILTER, SA.DECODE_PARMS): decoded[key] = value @@ -1058,7 +1057,7 @@ def __init__( # The inner list has two elements: # Element 0: List # Element 1: str - self._operations: List[Tuple[Any, Any]] = [] + self._operations: List[Tuple[Any, bytes]] = [] # stream may be a StreamObject or an ArrayObject containing # multiple StreamObjects to be cat'd together. @@ -1069,14 +1068,14 @@ def __init__( if isinstance(stream, ArrayObject): data = b"" for s in stream: - data += b_(s.get_object().get_data()) + data += s.get_object().get_data() if len(data) == 0 or data[-1] != b"\n": data += b"\n" super().set_data(bytes(data)) else: stream_data = stream.get_data() assert stream_data is not None - super().set_data(b_(stream_data)) + super().set_data(stream_data) self.forced_encoding = forced_encoding def clone( @@ -1132,7 +1131,7 @@ def _clone( ignore_fields: """ src_cs = cast("ContentStream", src) - super().set_data(b_(src_cs._data)) + super().set_data(src_cs._data) self.pdf = pdf_dest self._operations = list(src_cs._operations) self.forced_encoding = src_cs.forced_encoding @@ -1249,10 +1248,10 @@ def get_data(self) -> bytes: for op in operands: op.write_to_stream(new_data) new_data.write(b" ") - new_data.write(b_(operator)) + new_data.write(operator) new_data.write(b"\n") self._data = new_data.getvalue() - return b_(self._data) + return self._data # This overrides the parent method: def set_data(self, data: bytes) -> None: @@ -1262,21 +1261,21 @@ def set_data(self, data: bytes) -> None: @property def operations(self) -> List[Tuple[Any, Any]]: if not self._operations and self._data: - self._parse_content_stream(BytesIO(b_(self._data))) + self._parse_content_stream(BytesIO(self._data)) self._data = b"" return self._operations @operations.setter - def operations(self, operations: List[Tuple[Any, Any]]) -> None: + def operations(self, operations: List[Tuple[Any, bytes]]) -> None: self._operations = operations self._data = b"" def isolate_graphics_state(self) -> None: if self._operations: - self._operations.insert(0, ([], "q")) - self._operations.append(([], "Q")) + self._operations.insert(0, ([], b"q")) + self._operations.append(([], b"Q")) elif self._data: - self._data = b"q\n" + b_(self._data) + b"\nQ\n" + self._data = b"q\n" + self._data + b"\nQ\n" # This overrides the parent method: def write_to_stream( diff --git a/pypdf/generic/_utils.py b/pypdf/generic/_utils.py index fdcdc33399..5d47a8b5aa 100644 --- a/pypdf/generic/_utils.py +++ b/pypdf/generic/_utils.py @@ -2,7 +2,7 @@ from typing import Dict, List, Tuple, Union from .._codecs import _pdfdoc_encoding -from .._utils import StreamType, b_, logger_warning, read_non_whitespace +from .._utils import StreamType, logger_warning, read_non_whitespace from ..errors import STREAM_TRUNCATED_PREMATURELY, PdfStreamError from ._base import ByteStringObject, TextStringObject @@ -16,7 +16,7 @@ def read_hex_string_from_stream( forced_encoding: Union[None, str, List[str], Dict[int, str]] = None, ) -> Union["TextStringObject", "ByteStringObject"]: stream.read(1) - txt = "" + arr = [] x = b"" while True: tok = read_non_whitespace(stream) @@ -26,13 +26,40 @@ def read_hex_string_from_stream( break x += tok if len(x) == 2: - txt += chr(int(x, base=16)) + arr.append(int(x, base=16)) x = b"" if len(x) == 1: x += b"0" - if len(x) == 2: - txt += chr(int(x, base=16)) - return create_string_object(b_(txt), forced_encoding) + if x != b"": + arr.append(int(x, base=16)) + return create_string_object(bytes(arr), forced_encoding) + + +__ESPACE_DICT__ = { + b"n": ord(b"\n"), + b"r": ord(b"\r"), + b"t": ord(b"\t"), + b"b": ord(b"\b"), + b"f": ord(b"\f"), + # b"c": ord(rb"\c"), + b"(": ord(b"("), + b")": ord(b")"), + b"/": ord(b"/"), + b"\\": ord(b"\\"), + b" ": ord(b" "), + b"%": ord(b"%"), + b"<": ord(b"<"), + b">": ord(b">"), + b"[": ord(b"["), + b"]": ord(b"]"), + b"#": ord(b"#"), + b"_": ord(b"_"), + b"&": ord(b"&"), + b"$": ord( + b"$", + ), +} +__BACKSLASH_CODE__ = 92 def read_string_from_stream( @@ -54,30 +81,12 @@ def read_string_from_stream( break elif tok == b"\\": tok = stream.read(1) - escape_dict = { - b"n": b"\n", - b"r": b"\r", - b"t": b"\t", - b"b": b"\b", - b"f": b"\f", - b"c": rb"\c", - b"(": b"(", - b")": b")", - b"/": b"/", - b"\\": b"\\", - b" ": b" ", - b"%": b"%", - b"<": b"<", - b">": b">", - b"[": b"[", - b"]": b"]", - b"#": b"#", - b"_": b"_", - b"&": b"&", - b"$": b"$", - } try: - tok = escape_dict[tok] + if tok == b"c": + txt.extend([__BACKSLASH_CODE__, 99]) + else: + txt.append(__ESPACE_DICT__[tok]) + continue except KeyError: if b"0" <= tok <= b"7": # "The number ddd may consist of one, two, or three @@ -85,6 +94,7 @@ def read_string_from_stream( # Three octal digits shall be used, with leading zeros # as needed, if the next character of the string is also # a digit." (PDF reference 7.3.4.2, p 16) + sav = stream.tell() - 1 for _ in range(2): ntok = stream.read(1) if b"0" <= ntok <= b"7": @@ -92,7 +102,13 @@ def read_string_from_stream( else: stream.seek(-1, 1) # ntok has to be analyzed break - tok = b_(chr(int(tok, base=8))) + i = int(tok, base=8) + if i > 255: + txt.append(__BACKSLASH_CODE__) + stream.seek(sav) + else: + txt.append(i) + continue elif tok in b"\n\r": # This case is hit when a backslash followed by a line # break occurs. If it's a multi-char EOL, consume the @@ -102,12 +118,13 @@ def read_string_from_stream( stream.seek(-1, 1) # Then don't add anything to the actual string, since this # line break was escaped: - tok = b"" + continue else: msg = f"Unexpected escaped string: {tok.decode('utf-8','ignore')}" logger_warning(msg, __name__) - txt.append(tok) - return create_string_object(b"".join(txt), forced_encoding) + txt.append(__BACKSLASH_CODE__) + txt.append(ord(tok)) + return create_string_object(bytes(txt), forced_encoding) def create_string_object( diff --git a/resources/Seige_of_Vicksburg_Sample_OCR-crazyones-merged.pdf b/resources/Seige_of_Vicksburg_Sample_OCR-crazyones-merged.pdf index 0e9633ac16c138eeaa90d3cf13e9f7cd6e2c006d..a53f28f0be432c38a1fff33672a2170eeb5f553f 100644 GIT binary patch delta 1135 zcmZuwJ5B>Z3>Bn7L8-XFHtGEtkL~mzB#PXGI^7lsK}(f3N1%gya3N-+5Rx~UZB|eA z_*(w%S~Mtv1*EGs-~I8J>sv{zoZ`BgRCDn2^;DWAA#+qhOsy z9_nuY2iNV#)9h}noP_mNYj;O2#kxAbfL+j&#EQMi$qgIN%*aCBkgS9v3tOTYrFns* zi?DXRqIyy{%_l0Z{g4pthD0$;#wiU+yHv)c!>vmue00SNBRm4^N)hR?Vv~jK{H4mw ziIFg?VZq^h6P&9s+GXO#zoHdee$2JJpxz|Hl_r{ zY(@&UHcToGu}TFAn%96`U{@PYmtn8OwFX4O-WCKhW)JHKw%|G;shLP7tsbKDqy5qZ aX;Lhg{J|DdMd<#DhbONh0xtWZ9J{UqF&k&I6)7fzU zDDp4D(!te>XPgE={aR6UiiW`%ZfqTeiIj@xjiPk$e@q9UWn5rLy=@eWjv@^6B7z!u zVL|uoObDgBOUxb4d9AVd3tS3uCsM>HdG}UKHtl4X<1DUS`o&!~`3?a}@xUY!MBL9r zrG8xWxg)aX>X2IbX&d=ezp1ic;>(#79o%KUaWwR|p2uxbk YRyD8Bjbb46Z8kjJvC(vTJUbbE19m{nQvd(} diff --git a/tests/test_page.py b/tests/test_page.py index cb7b6c723f..14e3350cc2 100644 --- a/tests/test_page.py +++ b/tests/test_page.py @@ -1131,9 +1131,9 @@ def test_merge_page_resources_smoke_test(): # use these keys for some "operations", to validate renaming # (the operand name doesn't matter) contents1 = page1[NO("/Contents")] = ContentStream(None, None) - contents1.operations = [(ArrayObject(props1.keys()), "page1-contents")] + contents1.operations = [(ArrayObject(props1.keys()), b"page1-contents")] contents2 = page2[NO("/Contents")] = ContentStream(None, None) - contents2.operations = [(ArrayObject(props2.keys()), "page2-contents")] + contents2.operations = [(ArrayObject(props2.keys()), b"page2-contents")] expected_properties = { "/just1": "/just1-value", diff --git a/tests/test_utils.py b/tests/test_utils.py index 856bedd863..a4ddff8831 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -109,22 +109,6 @@ def test_mark_location(): Path("pypdf_pdfLocation.txt").unlink() # cleanup -@pytest.mark.parametrize( - ("input_str", "expected"), - [ - ("foo", b"foo"), - ("😀", "😀".encode()), - ("‰", "‰".encode()), - ("▷", "▷".encode()), - ("世", "世".encode()), - # A multi-character string example with non-latin-1 characters: - ("😀😃", "😀😃".encode()), - ], -) -def test_b(input_str: str, expected: bytes): - assert pypdf._utils.b_(input_str) == expected - - def test_deprecate_no_replacement(): with pytest.warns(DeprecationWarning) as warn: pypdf._utils.deprecate_no_replacement("foo", removed_in="3.0.0") diff --git a/tests/test_workflows.py b/tests/test_workflows.py index 93bc0c9e5e..4407b8fd5e 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -391,11 +391,11 @@ def test_merge(tmp_path, url, name): { "/Author": "Unknown", "/CreationDate": "Thursday, May 06, 1999 3:56:54 PM", - "/Creator": "C:DEBÆł8", + "/Creator": r"C:\DEB\6338", "/Keywords": "", "/Producer": "Acrobat PDFWriter 3.02 for Windows", "/Subject": "", - "/Title": "C:DEBÆł8-6R.PDF", + "/Title": r"C:\DEB\6338-6R.PDF", }, ) ], From 0befa15427af68b9f22b3e50bdd0c61ee82de19c Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Thu, 8 Aug 2024 16:17:52 +0200 Subject: [PATCH 10/16] coverage --- pypdf/_merger.py | 2 +- pypdf/generic/_utils.py | 6 +----- tests/test_generic.py | 4 ++++ tests/test_writer.py | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pypdf/_merger.py b/pypdf/_merger.py index a077cca7f4..a52a354e38 100644 --- a/pypdf/_merger.py +++ b/pypdf/_merger.py @@ -83,7 +83,7 @@ def __init__(self, pagedata: PageObject, src: PdfReader, id: int) -> None: # transfered from _utils : as this function is only required here # and merger will be soon deprecated -def str_(b: Any) -> str: +def str_(b: Any) -> str: # pragma: no cover if isinstance(b, bytes): return b.decode("latin-1") else: diff --git a/pypdf/generic/_utils.py b/pypdf/generic/_utils.py index 5d47a8b5aa..14549a2310 100644 --- a/pypdf/generic/_utils.py +++ b/pypdf/generic/_utils.py @@ -41,7 +41,6 @@ def read_hex_string_from_stream( b"t": ord(b"\t"), b"b": ord(b"\b"), b"f": ord(b"\f"), - # b"c": ord(rb"\c"), b"(": ord(b"("), b")": ord(b")"), b"/": ord(b"/"), @@ -82,10 +81,7 @@ def read_string_from_stream( elif tok == b"\\": tok = stream.read(1) try: - if tok == b"c": - txt.extend([__BACKSLASH_CODE__, 99]) - else: - txt.append(__ESPACE_DICT__[tok]) + txt.append(__ESPACE_DICT__[tok]) continue except KeyError: if b"0" <= tok <= b"7": diff --git a/tests/test_generic.py b/tests/test_generic.py index b1079974ef..2c28730b0a 100644 --- a/tests/test_generic.py +++ b/tests/test_generic.py @@ -1227,6 +1227,10 @@ def test_cloning(caplog): assert "/Test" in obj21 assert isinstance(obj21.get("/Test2"), IndirectObject) + obj30 = TextStringObject("123") + obj31 = obj30.clone(writer, ignore_fields=None) + assert int(obj31) == 123 + @pytest.mark.enable_socket() def test_append_with_indirectobject_not_pointing(caplog): diff --git a/tests/test_writer.py b/tests/test_writer.py index 9dfeffdd89..84d84d0db6 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -1290,7 +1290,7 @@ def test_attachments(): to_add = [ ("foobar.txt", b"foobarcontent"), ("foobar2.txt", b"foobarcontent2"), - ("foobar2.txt", b"2nd_foobarcontent"), + ("foobar2.txt", "2nd_foobarcontent"), ] for name, content in to_add: writer.add_attachment(name, content) From 97ff7e1d1082bdb8b375365b5042f9b528a3a2c9 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Thu, 8 Aug 2024 17:13:27 +0200 Subject: [PATCH 11/16] revert --- pypdf/generic/_base.py | 4 ---- tests/test_generic.py | 4 ---- 2 files changed, 8 deletions(-) diff --git a/pypdf/generic/_base.py b/pypdf/generic/_base.py index 4e1cf1bca0..309d389cc8 100644 --- a/pypdf/generic/_base.py +++ b/pypdf/generic/_base.py @@ -311,10 +311,6 @@ def __float__(self) -> str: # in this case we are looking for the pointed data return self.get_object().__float__() # type: ignore - def __int__(self) -> str: - # in this case we are looking for the pointed data - return self.get_object().__int__() # type: ignore - def __str__(self) -> str: # in this case we are looking for the pointed data return self.get_object().__str__() diff --git a/tests/test_generic.py b/tests/test_generic.py index 2c28730b0a..b1079974ef 100644 --- a/tests/test_generic.py +++ b/tests/test_generic.py @@ -1227,10 +1227,6 @@ def test_cloning(caplog): assert "/Test" in obj21 assert isinstance(obj21.get("/Test2"), IndirectObject) - obj30 = TextStringObject("123") - obj31 = obj30.clone(writer, ignore_fields=None) - assert int(obj31) == 123 - @pytest.mark.enable_socket() def test_append_with_indirectobject_not_pointing(caplog): From 7adabad4fb39e5e3c584a27c7d1459bd5fdc110f Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Thu, 8 Aug 2024 19:59:11 +0200 Subject: [PATCH 12/16] rewritten for coverage --- pypdf/_cmap.py | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index 0859aab308..b6530fa59c 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -7,7 +7,6 @@ from .generic import ( DecodedStreamObject, DictionaryObject, - IndirectObject, NullObject, StreamObject, ) @@ -449,34 +448,27 @@ def compute_space_width( en: int = cast(int, ft["/LastChar"]) if st > space_code or en < space_code: raise Exception("Not in range") - if w[space_code - st] == 0: + if w[space_code - st].get_object() == 0: raise Exception("null width") - sp_width = w[space_code - st] + sp_width = w[space_code - st].get_object() except Exception: if "/FontDescriptor" in ft and "/MissingWidth" in cast( DictionaryObject, ft["/FontDescriptor"] ): - sp_width = ft["/FontDescriptor"]["/MissingWidth"] # type: ignore + sp_width = ft["/FontDescriptor"]["/MissingWidth"].get_object() # type: ignore else: # will consider width of char as avg(width)/2 m = 0 cpt = 0 - for x in w: - if x > 0: - m += x + for xx in w: + xx = xx.get_object() + if xx > 0: + m += xx cpt += 1 sp_width = m / max(1, cpt) / 2 - if isinstance(sp_width, IndirectObject): - # According to - # 'Table 122 - Entries common to all font descriptors (continued)' - # the MissingWidth should be a number, but according to #2286 it can - # be an indirect object - obj = sp_width.get_object() - if obj is None or isinstance(obj, NullObject): - return 0.0 - return obj - + if sp_width is None or isinstance(sp_width, NullObject): + sp_width = 0.0 return sp_width From 0dbb948b483e90f125c720df3498bbb7a2cdd11c Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Thu, 8 Aug 2024 21:49:42 +0200 Subject: [PATCH 13/16] coverage cmap --- tests/test_cmap.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/tests/test_cmap.py b/tests/test_cmap.py index 9dcfb252d5..69f2931f67 100644 --- a/tests/test_cmap.py +++ b/tests/test_cmap.py @@ -1,13 +1,19 @@ """Test the pypdf_cmap module.""" from io import BytesIO +from pathlib import Path import pytest -from pypdf import PdfReader +from pypdf import PdfReader, PdfWriter from pypdf._cmap import build_char_map +from pypdf.generic import ArrayObject, NameObject, NullObject from . import get_data_from_url +TESTS_ROOT = Path(__file__).parent.resolve() +PROJECT_ROOT = TESTS_ROOT.parent +RESOURCE_ROOT = PROJECT_ROOT / "resources" + @pytest.mark.enable_socket() @pytest.mark.slow() @@ -206,3 +212,13 @@ def test_eten_b5(): """Issue #2356""" reader = PdfReader(BytesIO(get_data_from_url(name="iss2290.pdf"))) reader.pages[0].extract_text().startswith("1/7 \n富邦新終身壽險") + + +def test_null_missing_width(): + """For coverage of 2792""" + writer = PdfWriter(RESOURCE_ROOT / "crazyones.pdf") + page = writer.pages[0] + ft = page["/Resources"]["/Font"]["/F1"] + ft[NameObject("/Widths")] = ArrayObject() + ft["/FontDescriptor"][NameObject("/MissingWidth")] = NullObject() + page.extract_text() From 75e2c889c41188c7ec14edb43739ef97a8586827 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Fri, 9 Aug 2024 08:36:46 +0200 Subject: [PATCH 14/16] coverage --- tests/test_page.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/test_page.py b/tests/test_page.py index 14e3350cc2..72df648e45 100644 --- a/tests/test_page.py +++ b/tests/test_page.py @@ -1438,3 +1438,12 @@ def test_negative_index(): src_abs = RESOURCE_ROOT / "git.pdf" reader = PdfReader(src_abs) assert reader.pages[0] == reader.pages[-1] + + +def test_get_contents_as_bytes(): + writer = PdfWriter(RESOURCE_ROOT / "crazyones.pdf") + co = writer.pages[0]["/Contents"][0] + expected = co.get_data() + assert writer.pages[0]._get_contents_as_bytes() == expected + writer.pages[0][NameObject("/Contents")] = writer.pages[0]["/Contents"][0] + assert writer.pages[0]._get_contents_as_bytes() == expected From de70ff523124bc572a52e9cf2fbb86cf5b2c0994 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Fri, 9 Aug 2024 11:34:12 +0200 Subject: [PATCH 15/16] fix typo --- pypdf/generic/_utils.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pypdf/generic/_utils.py b/pypdf/generic/_utils.py index 14549a2310..b5ac6632ac 100644 --- a/pypdf/generic/_utils.py +++ b/pypdf/generic/_utils.py @@ -54,9 +54,7 @@ def read_hex_string_from_stream( b"#": ord(b"#"), b"_": ord(b"_"), b"&": ord(b"&"), - b"$": ord( - b"$", - ), + b"$": ord(b"$"), } __BACKSLASH_CODE__ = 92 From bc65c601afff3287649f5056538499f2f1ebaa14 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Fri, 9 Aug 2024 11:35:05 +0200 Subject: [PATCH 16/16] remove dev code --- pypdf/_cmap.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index b6530fa59c..d635724d25 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -258,7 +258,6 @@ def prepare_cm(ft: DictionaryObject) -> bytes: cm: bytes if isinstance(tu, StreamObject): cm = cast(DecodedStreamObject, ft["/ToUnicode"]).get_data() - assert isinstance(cm, bytes) elif isinstance(tu, str) and tu.startswith("/Identity"): # the full range 0000-FFFF will be processed cm = b"beginbfrange\n<0000> <0001> <0000>\nendbfrange"