From 4bd54bd32d525ec41e23960146fbb3b0418a4715 Mon Sep 17 00:00:00 2001
From: Stefan <96178532+stefan6419846@users.noreply.github.com>
Date: Sun, 28 Jul 2024 17:16:57 +0200
Subject: [PATCH 01/16] DEV: Test against Python 3.13 (#2776)

* DEV: Test against Python 3.13

* fix typo

* add missing setup-python

* fix another typo

* update Pillow version

* attempt to update coverage package

* update number of expected coverage files
---
 .github/workflows/github-ci.yaml | 10 +++++-----
 requirements/ci-3.11.txt         |  4 ++--
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/github-ci.yaml b/.github/workflows/github-ci.yaml
index 820ccdcaa5..1eb3d9bd03 100644
--- a/.github/workflows/github-ci.yaml
+++ b/.github/workflows/github-ci.yaml
@@ -57,7 +57,7 @@ jobs:
     runs-on: ubuntu-20.04
     strategy:
       matrix:
-        python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"]
+        python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12", "3.13-dev"]
         use-crypto-lib: ["cryptography"]
         include:
           - python-version: "3.7"
@@ -90,7 +90,7 @@ jobs:
         cache-dependency-path: '**/requirements/ci.txt'
     - name: Setup Python (3.11+)
       uses: actions/setup-python@v5
-      if: matrix.python-version == '3.11' || matrix.python-version == '3.12'
+      if: matrix.python-version == '3.11' || matrix.python-version == '3.12' || matrix.python-version == '3.13-dev'
       with:
         python-version: ${{ matrix.python-version }}
         allow-prereleases: true
@@ -106,7 +106,7 @@ jobs:
     - name: Install requirements (Python 3.11+)
       run: |
         pip install -r requirements/ci-3.11.txt
-      if: matrix.python-version == '3.11' || matrix.python-version == '3.12'
+      if: matrix.python-version == '3.11' || matrix.python-version == '3.12' || matrix.python-version == '3.13-dev'
     - name: Remove pycryptodome and cryptography
       run: |
         pip uninstall pycryptodome cryptography -y
@@ -215,8 +215,8 @@ jobs:
       - name: Check Number of Downloaded Files
         run: |
           downloaded_files_count=$(find \.coverage* -type f | wc -l)
-          if [ $downloaded_files_count -eq 8 ]; then
-            echo "The expected number of files (8) were downloaded."
+          if [ $downloaded_files_count -eq 9 ]; then
+            echo "The expected number of files (9) were downloaded."
           else
             echo "ERROR: Expected 8 files, but found $downloaded_files_count files."
             exit 1
diff --git a/requirements/ci-3.11.txt b/requirements/ci-3.11.txt
index f382fe2b94..2101771181 100644
--- a/requirements/ci-3.11.txt
+++ b/requirements/ci-3.11.txt
@@ -6,7 +6,7 @@
 #
 attrs==23.1.0
     # via flake8-bugbear
-coverage[toml]==7.3.0
+coverage[toml]==7.6.0
     # via
     #   -r requirements/ci.in
     #   pytest-cov
@@ -35,7 +35,7 @@ mypy-extensions==1.0.0
     # via mypy
 packaging==23.1
     # via pytest
-pillow==10.0.1
+pillow==10.4.0
     # via
     #   -r requirements/ci.in
     #   fpdf2

From d4df20d14cb6a2839c1ab141b51e70652fb3d1f1 Mon Sep 17 00:00:00 2001
From: j-t-1 <120829237+j-t-1@users.noreply.github.com>
Date: Wed, 31 Jul 2024 10:46:08 +0100
Subject: [PATCH 02/16] STY: Remove boolean value comparison (#2779)

PEP 8 recommendation.
---
 pypdf/annotations/_markup_annotations.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pypdf/annotations/_markup_annotations.py b/pypdf/annotations/_markup_annotations.py
index 4db8dfdbf0..98a222483b 100644
--- a/pypdf/annotations/_markup_annotations.py
+++ b/pypdf/annotations/_markup_annotations.py
@@ -104,9 +104,9 @@ def __init__(
         self[NameObject("/Rect")] = RectangleObject(rect)
 
         font_str = "font: "
-        if bold is True:
+        if bold:
             font_str = f"{font_str}bold "
-        if italic is True:
+        if italic:
             font_str = f"{font_str}italic "
         font_str = f"{font_str}{font} {font_size}"
         font_str = f"{font_str};text-align:left;color:#{font_color}"

From 3ad9234c2ec08e7cd6a8b2ec962386eda394d76d Mon Sep 17 00:00:00 2001
From: "William G. Gagnon" <wggagnon@poka.io>
Date: Fri, 2 Aug 2024 11:21:53 -0400
Subject: [PATCH 03/16] ROB: Handle images with empty data when processing an
 image from bytes (#2786)

Closes #2783.
---
 CONTRIBUTORS.md                     |  1 +
 pypdf/_xobj_image_helpers.py        |  9 ++++++---
 pypdf/errors.py                     |  4 ++++
 tests/test_xobject_image_helpers.py | 13 +++++++++++--
 4 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 84f0b6ee43..89fec3b14e 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -19,6 +19,7 @@ history and [GitHub's 'Contributors' feature](https://github.com/py-pdf/pypdf/gr
 * [ediamondscience](https://github.com/ediamondscience)
 * [Ermeson, Felipe](https://github.com/FelipeErmeson)
 * [Freitag, François](https://github.com/francoisfreitag)
+* [Gagnon, William G.](https://github.com/williamgagnon)
 * [Górny, Michał](https://github.com/mgorny)
 * [Grillo, Miguel](https://github.com/Ineffable22)
 * [Gutteridge, David H.](https://github.com/dhgutteridge)
diff --git a/pypdf/_xobj_image_helpers.py b/pypdf/_xobj_image_helpers.py
index 45b0c145be..5ae8894fa3 100644
--- a/pypdf/_xobj_image_helpers.py
+++ b/pypdf/_xobj_image_helpers.py
@@ -6,7 +6,7 @@
 
 from ._utils import check_if_whitespace_only, logger_warning
 from .constants import ColorSpaces
-from .errors import PdfReadError
+from .errors import EmptyImageDataError, PdfReadError
 from .generic import (
     ArrayObject,
     DecodedStreamObject,
@@ -148,9 +148,12 @@ def _extended_image_frombytes(
         img = Image.frombytes(mode, size, data)
     except ValueError as exc:
         nb_pix = size[0] * size[1]
-        if len(data) % nb_pix != 0:
+        data_length = len(data)
+        if data_length == 0:
+            raise EmptyImageDataError("Data is 0 bytes, cannot process an image from empty data.") from exc
+        if data_length % nb_pix != 0:
             raise exc
-        k = nb_pix * len(mode) / len(data)
+        k = nb_pix * len(mode) / data_length
         data = b"".join([bytes((x,) * int(k)) for x in data])
         img = Image.frombytes(mode, size, data)
     return img
diff --git a/pypdf/errors.py b/pypdf/errors.py
index c962dec662..ad197ffc11 100644
--- a/pypdf/errors.py
+++ b/pypdf/errors.py
@@ -59,4 +59,8 @@ class EmptyFileError(PdfReadError):
     """Raised when a PDF file is empty or has no content."""
 
 
+class EmptyImageDataError(PyPdfError):
+    """Raised when trying to process an image that has no data."""
+
+
 STREAM_TRUNCATED_PREMATURELY = "Stream has ended unexpectedly"
diff --git a/tests/test_xobject_image_helpers.py b/tests/test_xobject_image_helpers.py
index 63ecebd9b4..39b7131fcd 100644
--- a/tests/test_xobject_image_helpers.py
+++ b/tests/test_xobject_image_helpers.py
@@ -4,8 +4,8 @@
 import pytest
 
 from pypdf import PdfReader
-from pypdf._xobj_image_helpers import _handle_flate
-from pypdf.errors import PdfReadError
+from pypdf._xobj_image_helpers import _extended_image_frombytes, _handle_flate
+from pypdf.errors import EmptyImageDataError, PdfReadError
 from pypdf.generic import ArrayObject, DecodedStreamObject, NameObject, NumberObject
 
 from . import get_data_from_url
@@ -113,3 +113,12 @@ def test_handle_flate__image_mode_1():
             colors=2,
             obj_as_text="dummy",
         )
+
+
+def test_extended_image_frombytes_zero_data():
+    mode = "RGB"
+    size = (1, 1)
+    data = b""
+
+    with pytest.raises(EmptyImageDataError, match="Data is 0 bytes, cannot process an image from empty data."):
+        _extended_image_frombytes(mode, size, data)

From 582557e09a7e658fdcb19f26eb069d87875489f0 Mon Sep 17 00:00:00 2001
From: Diogo Teles Sant'Anna <diogoteles@google.com>
Date: Fri, 2 Aug 2024 15:49:29 -0300
Subject: [PATCH 04/16] SEC: Fix GitHub workflow vulnerable to script injection
 (#2787)

Signed-off-by: Diogo Teles Sant'Anna <diogoteles@google.com>
---
 .github/workflows/release.yaml | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
index 9f782ec080..b1a4fb27f3 100644
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -12,6 +12,9 @@ on:
 permissions:
   contents: write
 
+env:
+  HEAD_COMMIT_MESSAGE: ${{ github.event.head_commit.message }}
+
 jobs:
   build_and_publish:
     name: Publish a new version
@@ -24,7 +27,7 @@ jobs:
       - name: Extract version from commit message
         id: extract_version
         run: |
-          VERSION=$(echo "${{ github.event.head_commit.message }}" | grep -oP '(?<=REL: )\d+\.\d+\.\d+')
+          VERSION=$(echo "$HEAD_COMMIT_MESSAGE" | grep -oP '(?<=REL: )\d+\.\d+\.\d+')
           echo "version=$VERSION" >> $GITHUB_OUTPUT
 
       - name: Extract tag message from commit message
@@ -32,7 +35,7 @@ jobs:
         run: |
           VERSION="${{ steps.extract_version.outputs.version }}"
           delimiter="$(openssl rand -hex 8)"
-          MESSAGE=$(echo "${{ github.event.head_commit.message }}" | sed "0,/REL: $VERSION/s///" )
+          MESSAGE=$(echo "$HEAD_COMMIT_MESSAGE" | sed "0,/REL: $VERSION/s///" )
           echo "message<<${delimiter}" >> $GITHUB_OUTPUT
           echo "$MESSAGE" >> $GITHUB_OUTPUT
           echo "${delimiter}" >> $GITHUB_OUTPUT

From 38f3925502c2971ad587fb616500b6f8b6333d03 Mon Sep 17 00:00:00 2001
From: j-t-1 <120829237+j-t-1@users.noreply.github.com>
Date: Mon, 5 Aug 2024 09:10:47 +0100
Subject: [PATCH 05/16] MAINT: Remove unused paeth_predictor (#2773)

---
 pypdf/_utils.py     | 14 --------------
 tests/test_utils.py | 18 ------------------
 2 files changed, 32 deletions(-)

diff --git a/pypdf/_utils.py b/pypdf/_utils.py
index 38c0d67d7a..6569707b66 100644
--- a/pypdf/_utils.py
+++ b/pypdf/_utils.py
@@ -390,20 +390,6 @@ def ord_(b: Union[int, str, bytes]) -> Union[int, bytes]:
 WHITESPACES_AS_REGEXP = b"[" + WHITESPACES_AS_BYTES + b"]"
 
 
-def paeth_predictor(left: int, up: int, up_left: int) -> int:
-    p = left + up - up_left
-    dist_left = abs(p - left)
-    dist_up = abs(p - up)
-    dist_up_left = abs(p - up_left)
-
-    if dist_left <= dist_up and dist_left <= dist_up_left:
-        return left
-    elif dist_up <= dist_up_left:
-        return up
-    else:
-        return up_left
-
-
 def deprecate(msg: str, stacklevel: int = 3) -> None:
     warnings.warn(msg, DeprecationWarning, stacklevel=stacklevel)
 
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 81fcf9fb47..856bedd863 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -132,24 +132,6 @@ def test_deprecate_no_replacement():
     assert warn[0].message.args[0] == error_msg
 
 
-@pytest.mark.parametrize(
-    ("left", "up", "upleft", "expected"),
-    [
-        (0, 0, 0, 0),
-        (1, 0, 0, 1),
-        (0, 1, 0, 1),
-        (0, 0, 1, 0),
-        (1, 2, 3, 1),
-        (2, 1, 3, 1),
-        (1, 3, 2, 2),
-        (3, 1, 2, 2),
-        (3, 2, 1, 3),
-    ],
-)
-def test_paeth_predictor(left, up, upleft, expected):
-    assert pypdf._utils.paeth_predictor(left, up, upleft) == expected
-
-
 @pytest.mark.parametrize(
     ("dat", "pos", "to_read", "expected", "expected_pos"),
     [

From 09f9b7ed52193bfd9e98bdd018ccaf7cbe821687 Mon Sep 17 00:00:00 2001
From: j-t-1 <120829237+j-t-1@users.noreply.github.com>
Date: Mon, 5 Aug 2024 16:49:29 +0100
Subject: [PATCH 06/16] MAINT: Remove unused AnnotationFlag

---
 pypdf/annotations/_non_markup_annotations.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/pypdf/annotations/_non_markup_annotations.py b/pypdf/annotations/_non_markup_annotations.py
index dcdb3b0ff8..6272cceee6 100644
--- a/pypdf/annotations/_non_markup_annotations.py
+++ b/pypdf/annotations/_non_markup_annotations.py
@@ -1,6 +1,5 @@
 from typing import TYPE_CHECKING, Any, Optional, Tuple, Union
 
-from ..constants import AnnotationFlag
 from ..generic._base import (
     BooleanObject,
     NameObject,
@@ -12,8 +11,6 @@
 from ..generic._rectangle import RectangleObject
 from ._base import AnnotationDictionary
 
-DEFAULT_ANNOTATION_FLAG = AnnotationFlag(0)
-
 
 class Link(AnnotationDictionary):
     def __init__(

From b2d72043ab5221b58138c7d06c181b8cbc88ea8e Mon Sep 17 00:00:00 2001
From: owurman <oded@acidremap.com>
Date: Mon, 5 Aug 2024 12:14:18 -0700
Subject: [PATCH 07/16] BUG: Handle Sequence as an IndirectObject when
 extracting text with layout mode (#2788)

* Handle Sequence as an IndirectObject

The spec allows an int or float to be an IndirectObject as well, but this commit does not address that theoretical possibility.

* Update pypdf/_text_extraction/_layout_mode/_font.py

Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com>

* Address PR comments

-Rename w_1 to w_next_entry
-Utilize ParseError instead of PdfReadError
-Write a test (both positive and negative)

* Handle unlikely case of IndirectObjects for float/int width elements

Also adds a comment to clarify that we don't explicitly handle the IndexError exception. Rather, we let it be raised as an IndexError.

* Yoda condition I removed

* Last commit was a bad patch, confused by non-committed changes

* Use test files from URL rather than resources

* Update tests/test_text_extraction.py

Co-authored-by: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>

* Fix code style warnings in range() call

---------

Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com>
Co-authored-by: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
---
 pypdf/_text_extraction/_layout_mode/_font.py | 26 +++++++++++++-------
 tests/test_text_extraction.py                | 17 +++++++++++++
 2 files changed, 34 insertions(+), 9 deletions(-)

diff --git a/pypdf/_text_extraction/_layout_mode/_font.py b/pypdf/_text_extraction/_layout_mode/_font.py
index a912fddb27..40655b1b22 100644
--- a/pypdf/_text_extraction/_layout_mode/_font.py
+++ b/pypdf/_text_extraction/_layout_mode/_font.py
@@ -1,8 +1,9 @@
 """Font constants and classes for "layout" mode text operations"""
 
 from dataclasses import dataclass, field
-from typing import Any, Dict, Sequence, Union
+from typing import Any, Dict, Sequence, Union, cast
 
+from ...errors import ParseError
 from ...generic import IndirectObject
 from ._font_widths import STANDARD_WIDTHS
 
@@ -58,6 +59,7 @@ def __post_init__(self) -> None:
                 skip_count = 0
                 _w = d_font.get("/W", [])
                 for idx, w_entry in enumerate(_w):
+                    w_entry = w_entry.get_object()
                     if skip_count:
                         skip_count -= 1
                         continue
@@ -66,13 +68,14 @@ def __post_init__(self) -> None:
                         # warning and or use reader's "strict" to force an ex???
                         continue
                     # check for format (1): `int [int int int int ...]`
-                    if isinstance(_w[idx + 1], Sequence):
-                        start_idx, width_list = _w[idx : idx + 2]
+                    w_next_entry = _w[idx + 1].get_object()
+                    if isinstance(w_next_entry, Sequence):
+                        start_idx, width_list = w_entry, w_next_entry
                         self.width_map.update(
                             {
                                 ord_map[_cidx]: _width
                                 for _cidx, _width in zip(
-                                    range(start_idx, start_idx + len(width_list), 1),
+                                    range(cast(int, start_idx), cast(int, start_idx) + len(width_list), 1),
                                     width_list,
                                 )
                                 if _cidx in ord_map
@@ -80,18 +83,23 @@ def __post_init__(self) -> None:
                         )
                         skip_count = 1
                     # check for format (2): `int int int`
-                    if not isinstance(_w[idx + 1], Sequence) and not isinstance(
-                        _w[idx + 2], Sequence
-                    ):
-                        start_idx, stop_idx, const_width = _w[idx : idx + 3]
+                    elif isinstance(w_next_entry, (int, float)) and isinstance(_w[idx + 2].get_object(), (int, float)):
+                        start_idx, stop_idx, const_width = w_entry, w_next_entry, _w[idx + 2].get_object()
                         self.width_map.update(
                             {
                                 ord_map[_cidx]: const_width
-                                for _cidx in range(start_idx, stop_idx + 1, 1)
+                                for _cidx in range(cast(int, start_idx), cast(int, stop_idx + 1), 1)
                                 if _cidx in ord_map
                             }
                         )
                         skip_count = 2
+                    else:
+                        # Note: this doesn't handle the case of out of bounds (reaching the end of the width definitions
+                        # while expecting more elements). This raises an IndexError which is sufficient.
+                        raise ParseError(
+                            f"Invalid font width definition. Next elements: {w_entry}, {w_next_entry}, {_w[idx + 2]}"
+                        )  # pragma: no cover
+
         if not self.width_map and "/BaseFont" in self.font_dictionary:
             for key in STANDARD_WIDTHS:
                 if self.font_dictionary["/BaseFont"].startswith(f"/{key}"):
diff --git a/tests/test_text_extraction.py b/tests/test_text_extraction.py
index 1ffa68a3e6..dcd4e6caeb 100644
--- a/tests/test_text_extraction.py
+++ b/tests/test_text_extraction.py
@@ -10,6 +10,7 @@
 
 from pypdf import PdfReader, mult
 from pypdf._text_extraction import set_custom_rtl
+from pypdf.errors import ParseError
 
 from . import get_data_from_url
 
@@ -156,3 +157,19 @@ def test_layout_mode_type0_font_widths():
         encoding="utf-8"
     )
     assert expected == reader.pages[0].extract_text(extraction_mode="layout")
+
+
+@pytest.mark.enable_socket()
+def test_layout_mode_indirect_sequence_font_widths():
+    # Cover the situation where the sequence for font widths is an IndirectObject
+    # ref https://github.com/py-pdf/pypdf/pull/2788
+    url = "https://github.com/user-attachments/files/16491621/2788_example.pdf"
+    name = "2788_example.pdf"
+    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
+    assert reader.pages[0].extract_text(extraction_mode="layout") == ""
+    url = "https://github.com/user-attachments/files/16491619/2788_example_malformed.pdf"
+    name = "2788_example_malformed.pdf"
+    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
+    with pytest.raises(ParseError) as exc:
+        reader.pages[0].extract_text(extraction_mode="layout")
+    assert str(exc.value).startswith("Invalid font width definition")

From 5abd590740a2718fc69b8477c656ce5515a0ab33 Mon Sep 17 00:00:00 2001
From: j-t-1 <120829237+j-t-1@users.noreply.github.com>
Date: Wed, 7 Aug 2024 12:14:17 +0100
Subject: [PATCH 08/16] STY: Refactor b_ (#2772)

---
 pypdf/_utils.py | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/pypdf/_utils.py b/pypdf/_utils.py
index 6569707b66..5fecb38e7d 100644
--- a/pypdf/_utils.py
+++ b/pypdf/_utils.py
@@ -347,14 +347,11 @@ def b_(s: Union[str, bytes]) -> bytes:
         return bc[s]
     try:
         r = s.encode("latin-1")
-        if len(s) < 2:
-            bc[s] = r
-        return r
-    except Exception:
+    except UnicodeEncodeError:
         r = s.encode("utf-8")
-        if len(s) < 2:
-            bc[s] = r
-        return r
+    if len(s) < 2:
+        bc[s] = r
+    return r
 
 
 def str_(b: Any) -> str:

From 1f83d529fd8ff4dad5c79320d92f87d90b42d1c6 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Thu, 8 Aug 2024 15:22:30 +0200
Subject: [PATCH 09/16] MNT: Remove b_ and str_

closes #2726
supersedes proposal #2791
---
 pypdf/_cmap.py                                |   7 +-
 pypdf/_doc_common.py                          |   3 +-
 pypdf/_encryption.py                          |   6 +-
 pypdf/_merger.py                              |  10 ++-
 pypdf/_page.py                                |  16 ++--
 pypdf/_reader.py                              |   5 +-
 pypdf/_utils.py                               |  34 ++-----
 pypdf/_writer.py                              |   6 +-
 pypdf/filters.py                              |  30 +++----
 pypdf/generic/_base.py                        |  43 +++++----
 pypdf/generic/_data_structures.py             |  35 ++++----
 pypdf/generic/_utils.py                       |  83 +++++++++++-------
 ..._Vicksburg_Sample_OCR-crazyones-merged.pdf | Bin 217096 -> 217093 bytes
 tests/test_page.py                            |   4 +-
 tests/test_utils.py                           |  16 ----
 tests/test_workflows.py                       |   4 +-
 16 files changed, 147 insertions(+), 155 deletions(-)

diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py
index 9a2d10a611..0859aab308 100644
--- a/pypdf/_cmap.py
+++ b/pypdf/_cmap.py
@@ -3,7 +3,7 @@
 from typing import Any, Dict, List, Tuple, Union, cast
 
 from ._codecs import adobe_glyphs, charset_encoding
-from ._utils import b_, logger_error, logger_warning
+from ._utils import logger_error, logger_warning
 from .generic import (
     DecodedStreamObject,
     DictionaryObject,
@@ -258,7 +258,8 @@ def prepare_cm(ft: DictionaryObject) -> bytes:
     tu = ft["/ToUnicode"]
     cm: bytes
     if isinstance(tu, StreamObject):
-        cm = b_(cast(DecodedStreamObject, ft["/ToUnicode"]).get_data())
+        cm = cast(DecodedStreamObject, ft["/ToUnicode"]).get_data()
+        assert isinstance(cm, bytes)
     elif isinstance(tu, str) and tu.startswith("/Identity"):
         # the full range 0000-FFFF will be processed
         cm = b"beginbfrange\n<0000> <0001> <0000>\nendbfrange"
@@ -474,7 +475,7 @@ def compute_space_width(
         obj = sp_width.get_object()
         if obj is None or isinstance(obj, NullObject):
             return 0.0
-        return obj  # type: ignore
+        return obj
 
     return sp_width
 
diff --git a/pypdf/_doc_common.py b/pypdf/_doc_common.py
index d4c5c43c3c..ffbdb7882e 100644
--- a/pypdf/_doc_common.py
+++ b/pypdf/_doc_common.py
@@ -49,7 +49,6 @@
 from ._page import PageObject, _VirtualList
 from ._page_labels import index2label as page_index2page_label
 from ._utils import (
-    b_,
     deprecate_with_replacement,
     logger_warning,
     parse_iso8824_date,
@@ -1258,7 +1257,7 @@ def xfa(self) -> Optional[Dict[str, Any]]:
                 if isinstance(f, IndirectObject):
                     field = cast(Optional[EncodedStreamObject], f.get_object())
                     if field:
-                        es = zlib.decompress(b_(field._data))
+                        es = zlib.decompress(field._data)
                         retval[tag] = es
         return retval
 
diff --git a/pypdf/_encryption.py b/pypdf/_encryption.py
index 5ddd8d0efe..e5cdd9324e 100644
--- a/pypdf/_encryption.py
+++ b/pypdf/_encryption.py
@@ -43,7 +43,7 @@
     rc4_encrypt,
 )
 
-from ._utils import b_, logger_warning
+from ._utils import logger_warning
 from .generic import (
     ArrayObject,
     ByteStringObject,
@@ -78,7 +78,7 @@ def encrypt_object(self, obj: PdfObject) -> PdfObject:
         elif isinstance(obj, StreamObject):
             obj2 = StreamObject()
             obj2.update(obj)
-            obj2.set_data(self.stm_crypt.encrypt(b_(obj._data)))
+            obj2.set_data(self.stm_crypt.encrypt(obj._data))
             for key, value in obj.items():  # Dont forget the Stream dict.
                 obj2[key] = self.encrypt_object(value)
             obj = obj2
@@ -96,7 +96,7 @@ def decrypt_object(self, obj: PdfObject) -> PdfObject:
             data = self.str_crypt.decrypt(obj.original_bytes)
             obj = create_string_object(data)
         elif isinstance(obj, StreamObject):
-            obj._data = self.stm_crypt.decrypt(b_(obj._data))
+            obj._data = self.stm_crypt.decrypt(obj._data)
             for key, value in obj.items():  # Dont forget the Stream dict.
                 obj[key] = self.decrypt_object(value)
         elif isinstance(obj, DictionaryObject):
diff --git a/pypdf/_merger.py b/pypdf/_merger.py
index 7176a1adf7..a077cca7f4 100644
--- a/pypdf/_merger.py
+++ b/pypdf/_merger.py
@@ -46,7 +46,6 @@
 from ._utils import (
     StrByteType,
     deprecate_with_replacement,
-    str_,
 )
 from ._writer import PdfWriter
 from .constants import GoToActionArguments, TypArguments, TypFitArguments
@@ -82,6 +81,15 @@ def __init__(self, pagedata: PageObject, src: PdfReader, id: int) -> None:
         self.id = id
 
 
+# Transferred from _utils, as this function is only required here
+# and the merger will soon be deprecated.
+def str_(b: Any) -> str:
+    if isinstance(b, bytes):
+        return b.decode("latin-1")
+    else:
+        return str(b)  # will return b.__str__() if defined
+
+
 class PdfMerger:
     """
     Use :class:`PdfWriter` instead.
diff --git a/pypdf/_page.py b/pypdf/_page.py
index 63038d9d07..8abd66d2f2 100644
--- a/pypdf/_page.py
+++ b/pypdf/_page.py
@@ -852,7 +852,7 @@ def _add_transformation_matrix(
                     FloatObject(e),
                     FloatObject(f),
                 ],
-                " cm",
+                b"cm",
             ],
         )
         return contents
@@ -870,7 +870,7 @@ def _get_contents_as_bytes(self) -> Optional[bytes]:
             if isinstance(obj, list):
                 return b"".join(x.get_object().get_data() for x in obj)
             else:
-                return cast(bytes, cast(EncodedStreamObject, obj).get_data())
+                return cast(EncodedStreamObject, obj).get_data()
         else:
             return None
 
@@ -1063,11 +1063,11 @@ def _merge_page(
                             rect.height,
                         ],
                     ),
-                    "re",
+                    b"re",
                 ),
             )
-            page2content.operations.insert(1, ([], "W"))
-            page2content.operations.insert(2, ([], "n"))
+            page2content.operations.insert(1, ([], b"W"))
+            page2content.operations.insert(2, ([], b"n"))
             if page2transformation is not None:
                 page2content = page2transformation(page2content)
             page2content = PageObject._content_stream_rename(
@@ -1201,11 +1201,11 @@ def _merge_page_writer(
                             rect.height,
                         ],
                     ),
-                    "re",
+                    b"re",
                 ),
             )
-            page2content.operations.insert(1, ([], "W"))
-            page2content.operations.insert(2, ([], "n"))
+            page2content.operations.insert(1, ([], b"W"))
+            page2content.operations.insert(2, ([], b"n"))
             if page2transformation is not None:
                 page2content = page2transformation(page2content)
             page2content = PageObject._content_stream_rename(
diff --git a/pypdf/_reader.py b/pypdf/_reader.py
index aeababa7b7..7c084107c5 100644
--- a/pypdf/_reader.py
+++ b/pypdf/_reader.py
@@ -51,7 +51,6 @@
 from ._utils import (
     StrByteType,
     StreamType,
-    b_,
     logger_warning,
     read_non_whitespace,
     read_previous_line,
@@ -328,7 +327,7 @@ def _get_object_from_stream(
         assert cast(str, obj_stm["/Type"]) == "/ObjStm"
         # /N is the number of indirect objects in the stream
         assert idx < obj_stm["/N"]
-        stream_data = BytesIO(b_(obj_stm.get_data()))
+        stream_data = BytesIO(obj_stm.get_data())
         for i in range(obj_stm["/N"]):  # type: ignore
             read_non_whitespace(stream_data)
             stream_data.seek(-1, 1)
@@ -932,7 +931,7 @@ def _read_pdf15_xref_stream(
         xrefstream = cast(ContentStream, read_object(stream, self))
         assert cast(str, xrefstream["/Type"]) == "/XRef"
         self.cache_indirect_object(generation, idnum, xrefstream)
-        stream_data = BytesIO(b_(xrefstream.get_data()))
+        stream_data = BytesIO(xrefstream.get_data())
         # Index pairs specify the subsections in the dictionary. If
         # none create one subsection that spans everything.
         idx_pairs = xrefstream.get("/Index", [0, xrefstream.get("/Size")])
diff --git a/pypdf/_utils.py b/pypdf/_utils.py
index 5fecb38e7d..94d45cf6d7 100644
--- a/pypdf/_utils.py
+++ b/pypdf/_utils.py
@@ -336,31 +336,6 @@ def mark_location(stream: StreamType) -> None:
     stream.seek(-radius, 1)
 
 
-B_CACHE: Dict[str, bytes] = {}
-
-
-def b_(s: Union[str, bytes]) -> bytes:
-    if isinstance(s, bytes):
-        return s
-    bc = B_CACHE
-    if s in bc:
-        return bc[s]
-    try:
-        r = s.encode("latin-1")
-    except UnicodeEncodeError:
-        r = s.encode("utf-8")
-    if len(s) < 2:
-        bc[s] = r
-    return r
-
-
-def str_(b: Any) -> str:
-    if isinstance(b, bytes):
-        return b.decode("latin-1")
-    else:
-        return str(b)  # will return b.__str__() if defined
-
-
 @overload
 def ord_(b: str) -> int:
     ...
@@ -397,12 +372,17 @@ def deprecation(msg: str) -> None:
 
 def deprecate_with_replacement(old_name: str, new_name: str, removed_in: str) -> None:
     """Raise an exception that a feature will be removed, but has a replacement."""
-    deprecate(f"{old_name} is deprecated and will be removed in pypdf {removed_in}. Use {new_name} instead.", 4)
+    deprecate(
+        f"{old_name} is deprecated and will be removed in pypdf {removed_in}. Use {new_name} instead.",
+        4,
+    )
 
 
 def deprecation_with_replacement(old_name: str, new_name: str, removed_in: str) -> None:
     """Raise an exception that a feature was already removed, but has a replacement."""
-    deprecation(f"{old_name} is deprecated and was removed in pypdf {removed_in}. Use {new_name} instead.")
+    deprecation(
+        f"{old_name} is deprecated and was removed in pypdf {removed_in}. Use {new_name} instead."
+    )
 
 
 def deprecate_no_replacement(name: str, removed_in: str) -> None:
diff --git a/pypdf/_writer.py b/pypdf/_writer.py
index 00b9d498c0..d73c00e3d4 100644
--- a/pypdf/_writer.py
+++ b/pypdf/_writer.py
@@ -62,7 +62,6 @@
     StrByteType,
     StreamType,
     _get_max_pdf_version_header,
-    b_,
     deprecate_with_replacement,
     logger_warning,
 )
@@ -678,9 +677,10 @@ def add_attachment(self, filename: str, data: Union[str, bytes]) -> None:
         # Hello world!
         # endstream
         # endobj
-
+        if isinstance(data, str):
+            data = data.encode("latin-1")
         file_entry = DecodedStreamObject()
-        file_entry.set_data(b_(data))
+        file_entry.set_data(data)
         file_entry.update({NameObject(PA.TYPE): NameObject("/EmbeddedFile")})
 
         # The Filespec entry
diff --git a/pypdf/filters.py b/pypdf/filters.py
index 137e3603a3..d14b765801 100644
--- a/pypdf/filters.py
+++ b/pypdf/filters.py
@@ -43,7 +43,6 @@
 
 from ._utils import (
     WHITESPACES_AS_BYTES,
-    b_,
     deprecate_with_replacement,
     deprecation_no_replacement,
     logger_warning,
@@ -376,20 +375,18 @@ class LZWDecode:
     """
     Taken from:
 
-    http://www.java2s.com/Open-Source/Java-Document/PDF/PDF-
-    Renderer/com/sun/pdfview/decode/LZWDecode.java.htm
+    http://www.java2s.com/Open-Source/Java-Document/PDF/PDF-Renderer/com/sun/pdfview/decode/LZWDecode.java.htm
     """
 
     class Decoder:
+        STOP = 257
+        CLEARDICT = 256
+
         def __init__(self, data: bytes) -> None:
-            self.STOP = 257
-            self.CLEARDICT = 256
             self.data = data
             self.bytepos = 0
             self.bitpos = 0
-            self.dict = [""] * 4096
-            for i in range(256):
-                self.dict[i] = chr(i)
+            self.dict = [struct.pack("B", i) for i in range(256)] + [b""] * (4096 - 256)
             self.reset_dict()
 
         def reset_dict(self) -> None:
@@ -416,7 +413,7 @@ def next_code(self) -> int:
                     self.bytepos = self.bytepos + 1
             return value
 
-        def decode(self) -> str:
+        def decode(self) -> bytes:
             """
             TIFF 6.0 specification explains in sufficient details the steps to
             implement the LZW encode() and decode() algorithms.
@@ -429,7 +426,7 @@ def decode(self) -> str:
               PdfReadError: If the stop code is missing
             """
             cW = self.CLEARDICT
-            baos = ""
+            baos = b""
             while True:
                 pW = cW
                 cW = self.next_code()
@@ -444,11 +441,11 @@ def decode(self) -> str:
                 else:
                     if cW < self.dictlen:
                         baos += self.dict[cW]
-                        p = self.dict[pW] + self.dict[cW][0]
+                        p = self.dict[pW] + self.dict[cW][0:1]
                         self.dict[self.dictlen] = p
                         self.dictlen += 1
                     else:
-                        p = self.dict[pW] + self.dict[pW][0]
+                        p = self.dict[pW] + self.dict[pW][0:1]
                         baos += p
                         self.dict[self.dictlen] = p
                         self.dictlen += 1
@@ -464,7 +461,7 @@ def decode(
         data: bytes,
         decode_parms: Optional[DictionaryObject] = None,
         **kwargs: Any,
-    ) -> str:
+    ) -> bytes:
         """
         Decode an LZW encoded data stream.
 
@@ -476,7 +473,6 @@ def decode(
           decoded data.
         """
         # decode_parms is unused here
-
         return LZWDecode.Decoder(data).decode()
 
 
@@ -651,7 +647,7 @@ def decode(
         return tiff_header + data
 
 
-def decode_stream_data(stream: Any) -> Union[bytes, str]:  # utils.StreamObject
+def decode_stream_data(stream: Any) -> bytes:  # utils.StreamObject
     """
     Decode the stream data based on the specified filters.
 
@@ -678,7 +674,7 @@ def decode_stream_data(stream: Any) -> Union[bytes, str]:  # utils.StreamObject
     decodparms = stream.get(SA.DECODE_PARMS, ({},) * len(filters))
     if not isinstance(decodparms, (list, tuple)):
         decodparms = (decodparms,)
-    data: bytes = b_(stream._data)
+    data: bytes = stream._data
     # If there is not data to decode we should not try to decode the data.
     if data:
         for filter_type, params in zip(filters, decodparms):
@@ -691,7 +687,7 @@ def decode_stream_data(stream: Any) -> Union[bytes, str]:  # utils.StreamObject
             elif filter_type in (FT.RUN_LENGTH_DECODE, FTA.RL):
                 data = RunLengthDecode.decode(data)
             elif filter_type in (FT.LZW_DECODE, FTA.LZW):
-                data = LZWDecode.decode(data, params)  # type: ignore
+                data = LZWDecode.decode(data, params)
             elif filter_type in (FT.ASCII_85_DECODE, FTA.A85):
                 data = ASCII85Decode.decode(data)
             elif filter_type == FT.DCT_DECODE:
diff --git a/pypdf/generic/_base.py b/pypdf/generic/_base.py
index 2d606b4184..4e1cf1bca0 100644
--- a/pypdf/generic/_base.py
+++ b/pypdf/generic/_base.py
@@ -30,18 +30,17 @@
 import re
 from binascii import unhexlify
 from math import log10
+from struct import iter_unpack
 from typing import Any, Callable, ClassVar, Dict, Optional, Sequence, Union, cast
 
 from .._codecs import _pdfdoc_encoding_rev
 from .._protocols import PdfObjectProtocol, PdfWriterProtocol
 from .._utils import (
     StreamType,
-    b_,
     deprecate_no_replacement,
     logger_warning,
     read_non_whitespace,
     read_until_regex,
-    str_,
 )
 from ..errors import STREAM_TRUNCATED_PREMATURELY, PdfReadError, PdfStreamError
 
@@ -308,6 +307,14 @@ def __getitem__(self, key: Any) -> Any:
         # items should be extracted from pointed Object
         return self._get_object_with_check()[key]  # type: ignore
 
+    def __float__(self) -> str:
+        # in this case we are looking for the pointed data
+        return self.get_object().__float__()  # type: ignore
+
+    def __int__(self) -> str:
+        # in this case we are looking for the pointed data
+        return self.get_object().__int__()  # type: ignore
+
     def __str__(self) -> str:
         # in this case we are looking for the pointed data
         return self.get_object().__str__()
@@ -369,10 +376,10 @@ def read_from_stream(stream: StreamType, pdf: Any) -> "IndirectObject":  # PdfRe
 
 class FloatObject(float, PdfObject):
     def __new__(
-        cls, value: Union[str, Any] = "0.0", context: Optional[Any] = None
+        cls, value: Any = "0.0", context: Optional[Any] = None
     ) -> "FloatObject":
         try:
-            value = float(str_(value))
+            value = float(value)
             return float.__new__(cls, value)
         except Exception as e:
             # If this isn't a valid decimal (happens in malformed PDFs)
@@ -599,15 +606,16 @@ def write_to_stream(
             )
         bytearr = self.get_encoded_bytes()
         stream.write(b"(")
-        for c in bytearr:
-            if not chr(c).isalnum() and c != b" ":
+        for c_ in iter_unpack("c", bytearr):
+            c = cast(bytes, c_[0])
+            if not c.isalnum() and c != b" ":
                 # This:
                 #   stream.write(rf"\{c:0>3o}".encode())
                 # gives
                 #   https://github.com/davidhalter/parso/issues/207
-                stream.write(("\\%03o" % c).encode())
+                stream.write(b"\\%03o" % ord(c))
             else:
-                stream.write(b_(chr(c)))
+                stream.write(c)
         stream.write(b")")
 
 
@@ -710,12 +718,13 @@ def read_from_stream(stream: StreamType, pdf: Any) -> "NameObject":  # PdfReader
 
 
 def encode_pdfdocencoding(unicode_string: str) -> bytes:
-    retval = bytearray()
-    for c in unicode_string:
-        try:
-            retval += b_(chr(_pdfdoc_encoding_rev[c]))
-        except KeyError:
-            raise UnicodeEncodeError(
-                "pdfdocencoding", c, -1, -1, "does not exist in translation table"
-            )
-    return bytes(retval)
+    try:
+        return bytes([_pdfdoc_encoding_rev[k] for k in unicode_string])
+    except KeyError:
+        raise UnicodeEncodeError(
+            "pdfdocencoding",
+            unicode_string,
+            -1,
+            -1,
+            "does not exist in translation table",
+        )
diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py
index 87d6886742..399836be5f 100644
--- a/pypdf/generic/_data_structures.py
+++ b/pypdf/generic/_data_structures.py
@@ -52,7 +52,6 @@
 from .._utils import (
     WHITESPACES,
     StreamType,
-    b_,
     deprecate_no_replacement,
     deprecate_with_replacement,
     logger_warning,
@@ -843,7 +842,7 @@ def _reset_node_tree_relationship(child_obj: Any) -> None:
 
 class StreamObject(DictionaryObject):
     def __init__(self) -> None:
-        self._data: Union[bytes, str] = b""
+        self._data: bytes = b""
         self.decoded_self: Optional[DecodedStreamObject] = None
 
     def _clone(
@@ -877,7 +876,7 @@ def _clone(
             pass
         super()._clone(src, pdf_dest, force_duplicate, ignore_fields, visited)
 
-    def get_data(self) -> Union[bytes, str]:
+    def get_data(self) -> bytes:
         return self._data
 
     def set_data(self, data: bytes) -> None:
@@ -885,7 +884,7 @@ def set_data(self, data: bytes) -> None:
 
     def hash_value_data(self) -> bytes:
         data = super().hash_value_data()
-        data += b_(self._data)
+        data += self._data
         return data
 
     def write_to_stream(
@@ -955,7 +954,7 @@ def flate_encode(self, level: int = -1) -> "EncodedStreamObject":
         retval[NameObject(SA.FILTER)] = f
         if params is not None:
             retval[NameObject(SA.DECODE_PARMS)] = params
-        retval._data = FlateDecode.encode(b_(self._data), level)
+        retval._data = FlateDecode.encode(self._data, level)
         return retval
 
     def decode_as_image(self) -> Any:
@@ -993,7 +992,7 @@ def __init__(self) -> None:
         self.decoded_self: Optional[DecodedStreamObject] = None
 
     # This overrides the parent method:
-    def get_data(self) -> Union[bytes, str]:
+    def get_data(self) -> bytes:
         from ..filters import decode_stream_data
 
         if self.decoded_self is not None:
@@ -1003,7 +1002,7 @@ def get_data(self) -> Union[bytes, str]:
             # create decoded object
             decoded = DecodedStreamObject()
 
-            decoded.set_data(b_(decode_stream_data(self)))
+            decoded.set_data(decode_stream_data(self))
             for key, value in list(self.items()):
                 if key not in (SA.LENGTH, SA.FILTER, SA.DECODE_PARMS):
                     decoded[key] = value
@@ -1058,7 +1057,7 @@ def __init__(
         # The inner list has two elements:
         #  Element 0: List
         #  Element 1: str
-        self._operations: List[Tuple[Any, Any]] = []
+        self._operations: List[Tuple[Any, bytes]] = []
 
         # stream may be a StreamObject or an ArrayObject containing
         # multiple StreamObjects to be cat'd together.
@@ -1069,14 +1068,14 @@ def __init__(
             if isinstance(stream, ArrayObject):
                 data = b""
                 for s in stream:
-                    data += b_(s.get_object().get_data())
+                    data += s.get_object().get_data()
                     if len(data) == 0 or data[-1] != b"\n":
                         data += b"\n"
                 super().set_data(bytes(data))
             else:
                 stream_data = stream.get_data()
                 assert stream_data is not None
-                super().set_data(b_(stream_data))
+                super().set_data(stream_data)
             self.forced_encoding = forced_encoding
 
     def clone(
@@ -1132,7 +1131,7 @@ def _clone(
             ignore_fields:
         """
         src_cs = cast("ContentStream", src)
-        super().set_data(b_(src_cs._data))
+        super().set_data(src_cs._data)
         self.pdf = pdf_dest
         self._operations = list(src_cs._operations)
         self.forced_encoding = src_cs.forced_encoding
@@ -1249,10 +1248,10 @@ def get_data(self) -> bytes:
                     for op in operands:
                         op.write_to_stream(new_data)
                         new_data.write(b" ")
-                    new_data.write(b_(operator))
+                    new_data.write(operator)
                 new_data.write(b"\n")
             self._data = new_data.getvalue()
-        return b_(self._data)
+        return self._data
 
     # This overrides the parent method:
     def set_data(self, data: bytes) -> None:
@@ -1262,21 +1261,21 @@ def set_data(self, data: bytes) -> None:
     @property
     def operations(self) -> List[Tuple[Any, Any]]:
         if not self._operations and self._data:
-            self._parse_content_stream(BytesIO(b_(self._data)))
+            self._parse_content_stream(BytesIO(self._data))
             self._data = b""
         return self._operations
 
     @operations.setter
-    def operations(self, operations: List[Tuple[Any, Any]]) -> None:
+    def operations(self, operations: List[Tuple[Any, bytes]]) -> None:
         self._operations = operations
         self._data = b""
 
     def isolate_graphics_state(self) -> None:
         if self._operations:
-            self._operations.insert(0, ([], "q"))
-            self._operations.append(([], "Q"))
+            self._operations.insert(0, ([], b"q"))
+            self._operations.append(([], b"Q"))
         elif self._data:
-            self._data = b"q\n" + b_(self._data) + b"\nQ\n"
+            self._data = b"q\n" + self._data + b"\nQ\n"
 
     # This overrides the parent method:
     def write_to_stream(
diff --git a/pypdf/generic/_utils.py b/pypdf/generic/_utils.py
index fdcdc33399..5d47a8b5aa 100644
--- a/pypdf/generic/_utils.py
+++ b/pypdf/generic/_utils.py
@@ -2,7 +2,7 @@
 from typing import Dict, List, Tuple, Union
 
 from .._codecs import _pdfdoc_encoding
-from .._utils import StreamType, b_, logger_warning, read_non_whitespace
+from .._utils import StreamType, logger_warning, read_non_whitespace
 from ..errors import STREAM_TRUNCATED_PREMATURELY, PdfStreamError
 from ._base import ByteStringObject, TextStringObject
 
@@ -16,7 +16,7 @@ def read_hex_string_from_stream(
     forced_encoding: Union[None, str, List[str], Dict[int, str]] = None,
 ) -> Union["TextStringObject", "ByteStringObject"]:
     stream.read(1)
-    txt = ""
+    arr = []
     x = b""
     while True:
         tok = read_non_whitespace(stream)
@@ -26,13 +26,40 @@ def read_hex_string_from_stream(
             break
         x += tok
         if len(x) == 2:
-            txt += chr(int(x, base=16))
+            arr.append(int(x, base=16))
             x = b""
     if len(x) == 1:
         x += b"0"
-    if len(x) == 2:
-        txt += chr(int(x, base=16))
-    return create_string_object(b_(txt), forced_encoding)
+    if x != b"":
+        arr.append(int(x, base=16))
+    return create_string_object(bytes(arr), forced_encoding)
+
+
+__ESPACE_DICT__ = {
+    b"n": ord(b"\n"),
+    b"r": ord(b"\r"),
+    b"t": ord(b"\t"),
+    b"b": ord(b"\b"),
+    b"f": ord(b"\f"),
+    # b"c": ord(rb"\c"),
+    b"(": ord(b"("),
+    b")": ord(b")"),
+    b"/": ord(b"/"),
+    b"\\": ord(b"\\"),
+    b" ": ord(b" "),
+    b"%": ord(b"%"),
+    b"<": ord(b"<"),
+    b">": ord(b">"),
+    b"[": ord(b"["),
+    b"]": ord(b"]"),
+    b"#": ord(b"#"),
+    b"_": ord(b"_"),
+    b"&": ord(b"&"),
+    b"$": ord(
+        b"$",
+    ),
+}
+__BACKSLASH_CODE__ = 92
 
 
 def read_string_from_stream(
@@ -54,30 +81,12 @@ def read_string_from_stream(
                 break
         elif tok == b"\\":
             tok = stream.read(1)
-            escape_dict = {
-                b"n": b"\n",
-                b"r": b"\r",
-                b"t": b"\t",
-                b"b": b"\b",
-                b"f": b"\f",
-                b"c": rb"\c",
-                b"(": b"(",
-                b")": b")",
-                b"/": b"/",
-                b"\\": b"\\",
-                b" ": b" ",
-                b"%": b"%",
-                b"<": b"<",
-                b">": b">",
-                b"[": b"[",
-                b"]": b"]",
-                b"#": b"#",
-                b"_": b"_",
-                b"&": b"&",
-                b"$": b"$",
-            }
             try:
-                tok = escape_dict[tok]
+                if tok == b"c":
+                    txt.extend([__BACKSLASH_CODE__, 99])
+                else:
+                    txt.append(__ESPACE_DICT__[tok])
+                continue
             except KeyError:
                 if b"0" <= tok <= b"7":
                     # "The number ddd may consist of one, two, or three
@@ -85,6 +94,7 @@ def read_string_from_stream(
                     # Three octal digits shall be used, with leading zeros
                     # as needed, if the next character of the string is also
                     # a digit." (PDF reference 7.3.4.2, p 16)
+                    sav = stream.tell() - 1
                     for _ in range(2):
                         ntok = stream.read(1)
                         if b"0" <= ntok <= b"7":
@@ -92,7 +102,13 @@ def read_string_from_stream(
                         else:
                             stream.seek(-1, 1)  # ntok has to be analyzed
                             break
-                    tok = b_(chr(int(tok, base=8)))
+                    i = int(tok, base=8)
+                    if i > 255:
+                        txt.append(__BACKSLASH_CODE__)
+                        stream.seek(sav)
+                    else:
+                        txt.append(i)
+                    continue
                 elif tok in b"\n\r":
                     # This case is  hit when a backslash followed by a line
                     # break occurs. If it's a multi-char EOL, consume the
@@ -102,12 +118,13 @@ def read_string_from_stream(
                         stream.seek(-1, 1)
                     # Then don't add anything to the actual string, since this
                     # line break was escaped:
-                    tok = b""
+                    continue
                 else:
                     msg = f"Unexpected escaped string: {tok.decode('utf-8','ignore')}"
                     logger_warning(msg, __name__)
-        txt.append(tok)
-    return create_string_object(b"".join(txt), forced_encoding)
+                    txt.append(__BACKSLASH_CODE__)
+        txt.append(ord(tok))
+    return create_string_object(bytes(txt), forced_encoding)
 
 
 def create_string_object(
diff --git a/resources/Seige_of_Vicksburg_Sample_OCR-crazyones-merged.pdf b/resources/Seige_of_Vicksburg_Sample_OCR-crazyones-merged.pdf
index 0e9633ac16c138eeaa90d3cf13e9f7cd6e2c006d..a53f28f0be432c38a1fff33672a2170eeb5f553f 100644
GIT binary patch
delta 1135
zcmZuwJ5B>Z3>Bn7L8-XFHtGEtkL~mzB#PXGI^7lsK}(f3N1%gya3N-+5Rx~UZB|eA
z_<cO@yT$8n@mV)l!^>*(w%S~Mtv1*EGs-~I8J>sv{zoZ`BgRCDn2^;DWAA#+qhOsy
z9_nuY2iNV#)9h}noP_mNYj;O2#kxAbfL+j&#EQMi$qgIN%*aCBkgS9v3tOTYrFns*
zi?DXRqIyy{%_l0Z{g4pthD0$;#wiU+yHv)c!>vmue00SNBRm4^N)hR?Vv~jK{H4mw
ziIFg?VZ<yRz8BZ@ADbme@j^*L)~rq4iG>q^h6P&9s+GXO#zoHdee$2JJpxz|Hl_r{
zY(@&UHcToGu}TFAn%96`U{@PYmtn8OwFX4O-WCKhW)JHKw%|G;shLP7tsbKDqy5qZ
a<FglL!lx)aecSK#?D0nD%jNz0Vg3t(%-ppA

delta 1030
zcmZuvJxYT?6ePw%0ujUuq)B1r`+xfiOGWS}*n0-+bU{S4wc0k`Knt<<44%N7xKSeH
zty_J(Z+2$h%-%1@kIV6k&9|>X;Lhg{J|DdMd<#DhbONh0xtWZ9J{UqF&k&I6)7fzU
zDDp4D(!te>XPgE={aR6UiiW`%ZfqTeiIj@xjiPk$e@q9UWn5rLy=@eWjv@^6B7z!u
zVL|uoObDgBOUxb4d9AVd3tS3uCsM>HdG}UKHtl4X<1DUS`o&!~`3?a}@xUY!MBL9r
zrG8xWxg)aX>X2IbX&<BlEOqa)<(@J@ZY%V&lJi7ngG|dx`K_WtNU9wb+XMdHba$OQ
zssd|$U3tp9Y9QH@x<%?3oi%sdW)_=K1G+e;>d=ezp1ic;>(#79o%KUaWwR|p2uxbk
YRyD8Bjbb46Z8kjJvC(vTJUbbE19m{nQvd(}

diff --git a/tests/test_page.py b/tests/test_page.py
index cb7b6c723f..14e3350cc2 100644
--- a/tests/test_page.py
+++ b/tests/test_page.py
@@ -1131,9 +1131,9 @@ def test_merge_page_resources_smoke_test():
     # use these keys for some "operations", to validate renaming
     # (the operand name doesn't matter)
     contents1 = page1[NO("/Contents")] = ContentStream(None, None)
-    contents1.operations = [(ArrayObject(props1.keys()), "page1-contents")]
+    contents1.operations = [(ArrayObject(props1.keys()), b"page1-contents")]
     contents2 = page2[NO("/Contents")] = ContentStream(None, None)
-    contents2.operations = [(ArrayObject(props2.keys()), "page2-contents")]
+    contents2.operations = [(ArrayObject(props2.keys()), b"page2-contents")]
 
     expected_properties = {
         "/just1": "/just1-value",
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 856bedd863..a4ddff8831 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -109,22 +109,6 @@ def test_mark_location():
     Path("pypdf_pdfLocation.txt").unlink()  # cleanup
 
 
-@pytest.mark.parametrize(
-    ("input_str", "expected"),
-    [
-        ("foo", b"foo"),
-        ("😀", "😀".encode()),
-        ("‰", "‰".encode()),
-        ("▷", "▷".encode()),
-        ("世", "世".encode()),
-        # A multi-character string example with non-latin-1 characters:
-        ("😀😃", "😀😃".encode()),
-    ],
-)
-def test_b(input_str: str, expected: bytes):
-    assert pypdf._utils.b_(input_str) == expected
-
-
 def test_deprecate_no_replacement():
     with pytest.warns(DeprecationWarning) as warn:
         pypdf._utils.deprecate_no_replacement("foo", removed_in="3.0.0")
diff --git a/tests/test_workflows.py b/tests/test_workflows.py
index 93bc0c9e5e..4407b8fd5e 100644
--- a/tests/test_workflows.py
+++ b/tests/test_workflows.py
@@ -391,11 +391,11 @@ def test_merge(tmp_path, url, name):
             {
                 "/Author": "Unknown",
                 "/CreationDate": "Thursday, May 06, 1999 3:56:54 PM",
-                "/Creator": "C:DEBÆł8",
+                "/Creator": r"C:\DEB\6338",
                 "/Keywords": "",
                 "/Producer": "Acrobat PDFWriter 3.02 for Windows",
                 "/Subject": "",
-                "/Title": "C:DEBÆł8-6R.PDF",
+                "/Title": r"C:\DEB\6338-6R.PDF",
             },
         )
     ],

From 0befa15427af68b9f22b3e50bdd0c61ee82de19c Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Thu, 8 Aug 2024 16:17:52 +0200
Subject: [PATCH 10/16] coverage

---
 pypdf/_merger.py        | 2 +-
 pypdf/generic/_utils.py | 6 +-----
 tests/test_generic.py   | 4 ++++
 tests/test_writer.py    | 2 +-
 4 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/pypdf/_merger.py b/pypdf/_merger.py
index a077cca7f4..a52a354e38 100644
--- a/pypdf/_merger.py
+++ b/pypdf/_merger.py
@@ -83,7 +83,7 @@ def __init__(self, pagedata: PageObject, src: PdfReader, id: int) -> None:
 
 # transfered from _utils : as this function is only required here
 # and merger will be soon deprecated
-def str_(b: Any) -> str:
+def str_(b: Any) -> str:  # pragma: no cover
     if isinstance(b, bytes):
         return b.decode("latin-1")
     else:
diff --git a/pypdf/generic/_utils.py b/pypdf/generic/_utils.py
index 5d47a8b5aa..14549a2310 100644
--- a/pypdf/generic/_utils.py
+++ b/pypdf/generic/_utils.py
@@ -41,7 +41,6 @@ def read_hex_string_from_stream(
     b"t": ord(b"\t"),
     b"b": ord(b"\b"),
     b"f": ord(b"\f"),
-    # b"c": ord(rb"\c"),
     b"(": ord(b"("),
     b")": ord(b")"),
     b"/": ord(b"/"),
@@ -82,10 +81,7 @@ def read_string_from_stream(
         elif tok == b"\\":
             tok = stream.read(1)
             try:
-                if tok == b"c":
-                    txt.extend([__BACKSLASH_CODE__, 99])
-                else:
-                    txt.append(__ESPACE_DICT__[tok])
+                txt.append(__ESPACE_DICT__[tok])
                 continue
             except KeyError:
                 if b"0" <= tok <= b"7":
diff --git a/tests/test_generic.py b/tests/test_generic.py
index b1079974ef..2c28730b0a 100644
--- a/tests/test_generic.py
+++ b/tests/test_generic.py
@@ -1227,6 +1227,10 @@ def test_cloning(caplog):
     assert "/Test" in obj21
     assert isinstance(obj21.get("/Test2"), IndirectObject)
 
+    obj30 = TextStringObject("123")
+    obj31 = obj30.clone(writer, ignore_fields=None)
+    assert int(obj31) == 123
+
 
 @pytest.mark.enable_socket()
 def test_append_with_indirectobject_not_pointing(caplog):
diff --git a/tests/test_writer.py b/tests/test_writer.py
index 9dfeffdd89..84d84d0db6 100644
--- a/tests/test_writer.py
+++ b/tests/test_writer.py
@@ -1290,7 +1290,7 @@ def test_attachments():
     to_add = [
         ("foobar.txt", b"foobarcontent"),
         ("foobar2.txt", b"foobarcontent2"),
-        ("foobar2.txt", b"2nd_foobarcontent"),
+        ("foobar2.txt", "2nd_foobarcontent"),
     ]
     for name, content in to_add:
         writer.add_attachment(name, content)

From 97ff7e1d1082bdb8b375365b5042f9b528a3a2c9 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Thu, 8 Aug 2024 17:13:27 +0200
Subject: [PATCH 11/16] revert

---
 pypdf/generic/_base.py | 4 ----
 tests/test_generic.py  | 4 ----
 2 files changed, 8 deletions(-)

diff --git a/pypdf/generic/_base.py b/pypdf/generic/_base.py
index 4e1cf1bca0..309d389cc8 100644
--- a/pypdf/generic/_base.py
+++ b/pypdf/generic/_base.py
@@ -311,10 +311,6 @@ def __float__(self) -> str:
         # in this case we are looking for the pointed data
         return self.get_object().__float__()  # type: ignore
 
-    def __int__(self) -> str:
-        # in this case we are looking for the pointed data
-        return self.get_object().__int__()  # type: ignore
-
     def __str__(self) -> str:
         # in this case we are looking for the pointed data
         return self.get_object().__str__()
diff --git a/tests/test_generic.py b/tests/test_generic.py
index 2c28730b0a..b1079974ef 100644
--- a/tests/test_generic.py
+++ b/tests/test_generic.py
@@ -1227,10 +1227,6 @@ def test_cloning(caplog):
     assert "/Test" in obj21
     assert isinstance(obj21.get("/Test2"), IndirectObject)
 
-    obj30 = TextStringObject("123")
-    obj31 = obj30.clone(writer, ignore_fields=None)
-    assert int(obj31) == 123
-
 
 @pytest.mark.enable_socket()
 def test_append_with_indirectobject_not_pointing(caplog):

From 7adabad4fb39e5e3c584a27c7d1459bd5fdc110f Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Thu, 8 Aug 2024 19:59:11 +0200
Subject: [PATCH 12/16] rewritten for coverage

---
 pypdf/_cmap.py | 26 +++++++++-----------------
 1 file changed, 9 insertions(+), 17 deletions(-)

diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py
index 0859aab308..b6530fa59c 100644
--- a/pypdf/_cmap.py
+++ b/pypdf/_cmap.py
@@ -7,7 +7,6 @@
 from .generic import (
     DecodedStreamObject,
     DictionaryObject,
-    IndirectObject,
     NullObject,
     StreamObject,
 )
@@ -449,34 +448,27 @@ def compute_space_width(
             en: int = cast(int, ft["/LastChar"])
             if st > space_code or en < space_code:
                 raise Exception("Not in range")
-            if w[space_code - st] == 0:
+            if w[space_code - st].get_object() == 0:
                 raise Exception("null width")
-            sp_width = w[space_code - st]
+            sp_width = w[space_code - st].get_object()
         except Exception:
             if "/FontDescriptor" in ft and "/MissingWidth" in cast(
                 DictionaryObject, ft["/FontDescriptor"]
             ):
-                sp_width = ft["/FontDescriptor"]["/MissingWidth"]  # type: ignore
+                sp_width = ft["/FontDescriptor"]["/MissingWidth"].get_object()  # type: ignore
             else:
                 # will consider width of char as avg(width)/2
                 m = 0
                 cpt = 0
-                for x in w:
-                    if x > 0:
-                        m += x
+                for xx in w:
+                    xx = xx.get_object()
+                    if xx > 0:
+                        m += xx
                         cpt += 1
                 sp_width = m / max(1, cpt) / 2
 
-    if isinstance(sp_width, IndirectObject):
-        # According to
-        # 'Table 122 - Entries common to all font descriptors (continued)'
-        # the MissingWidth should be a number, but according to #2286 it can
-        # be an indirect object
-        obj = sp_width.get_object()
-        if obj is None or isinstance(obj, NullObject):
-            return 0.0
-        return obj
-
+    if sp_width is None or isinstance(sp_width, NullObject):
+        sp_width = 0.0
     return sp_width
 
 

From 0dbb948b483e90f125c720df3498bbb7a2cdd11c Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Thu, 8 Aug 2024 21:49:42 +0200
Subject: [PATCH 13/16] coverage cmap

---
 tests/test_cmap.py | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/tests/test_cmap.py b/tests/test_cmap.py
index 9dcfb252d5..69f2931f67 100644
--- a/tests/test_cmap.py
+++ b/tests/test_cmap.py
@@ -1,13 +1,19 @@
 """Test the pypdf_cmap module."""
 from io import BytesIO
+from pathlib import Path
 
 import pytest
 
-from pypdf import PdfReader
+from pypdf import PdfReader, PdfWriter
 from pypdf._cmap import build_char_map
+from pypdf.generic import ArrayObject, NameObject, NullObject
 
 from . import get_data_from_url
 
+TESTS_ROOT = Path(__file__).parent.resolve()
+PROJECT_ROOT = TESTS_ROOT.parent
+RESOURCE_ROOT = PROJECT_ROOT / "resources"
+
 
 @pytest.mark.enable_socket()
 @pytest.mark.slow()
@@ -206,3 +212,13 @@ def test_eten_b5():
     """Issue #2356"""
     reader = PdfReader(BytesIO(get_data_from_url(name="iss2290.pdf")))
     reader.pages[0].extract_text().startswith("1/7 \n富邦新終身壽險")
+
+
+def test_null_missing_width():
+    """Exercise the null /MissingWidth fallback (coverage for 2792)."""
+    writer = PdfWriter(RESOURCE_ROOT / "crazyones.pdf")
+    page = writer.pages[0]
+    ft = page["/Resources"]["/Font"]["/F1"]
+    ft[NameObject("/Widths")] = ArrayObject()
+    ft["/FontDescriptor"][NameObject("/MissingWidth")] = NullObject()
+    page.extract_text()

From 75e2c889c41188c7ec14edb43739ef97a8586827 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Fri, 9 Aug 2024 08:36:46 +0200
Subject: [PATCH 14/16] coverage

---
 tests/test_page.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/tests/test_page.py b/tests/test_page.py
index 14e3350cc2..72df648e45 100644
--- a/tests/test_page.py
+++ b/tests/test_page.py
@@ -1438,3 +1438,12 @@ def test_negative_index():
     src_abs = RESOURCE_ROOT / "git.pdf"
     reader = PdfReader(src_abs)
     assert reader.pages[0] == reader.pages[-1]
+
+
+def test_get_contents_as_bytes():
+    writer = PdfWriter(RESOURCE_ROOT / "crazyones.pdf")
+    co = writer.pages[0]["/Contents"][0]
+    expected = co.get_data()
+    assert writer.pages[0]._get_contents_as_bytes() == expected
+    writer.pages[0][NameObject("/Contents")] = writer.pages[0]["/Contents"][0]
+    assert writer.pages[0]._get_contents_as_bytes() == expected

From de70ff523124bc572a52e9cf2fbb86cf5b2c0994 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Fri, 9 Aug 2024 11:34:12 +0200
Subject: [PATCH 15/16] fix typo

---
 pypdf/generic/_utils.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/pypdf/generic/_utils.py b/pypdf/generic/_utils.py
index 14549a2310..b5ac6632ac 100644
--- a/pypdf/generic/_utils.py
+++ b/pypdf/generic/_utils.py
@@ -54,9 +54,7 @@ def read_hex_string_from_stream(
     b"#": ord(b"#"),
     b"_": ord(b"_"),
     b"&": ord(b"&"),
-    b"$": ord(
-        b"$",
-    ),
+    b"$": ord(b"$"),
 }
 __BACKSLASH_CODE__ = 92
 

From bc65c601afff3287649f5056538499f2f1ebaa14 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Fri, 9 Aug 2024 11:35:05 +0200
Subject: [PATCH 16/16] remove dev code

---
 pypdf/_cmap.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py
index b6530fa59c..d635724d25 100644
--- a/pypdf/_cmap.py
+++ b/pypdf/_cmap.py
@@ -258,7 +258,6 @@ def prepare_cm(ft: DictionaryObject) -> bytes:
     cm: bytes
     if isinstance(tu, StreamObject):
         cm = cast(DecodedStreamObject, ft["/ToUnicode"]).get_data()
-        assert isinstance(cm, bytes)
     elif isinstance(tu, str) and tu.startswith("/Identity"):
         # the full range 0000-FFFF will be processed
         cm = b"beginbfrange\n<0000> <0001> <0000>\nendbfrange"