From 4bd54bd32d525ec41e23960146fbb3b0418a4715 Mon Sep 17 00:00:00 2001
From: Stefan <96178532+stefan6419846@users.noreply.github.com>
Date: Sun, 28 Jul 2024 17:16:57 +0200
Subject: [PATCH 01/26] DEV: Test against Python 3.13 (#2776)

* DEV: Test against Python 3.13

* fix typo

* add missing setup-python

* fix another typo

* update Pillow version

* attempt to update coverage package

* update number of expected coverage files
---
 .github/workflows/github-ci.yaml | 10 +++++-----
 requirements/ci-3.11.txt         |  4 ++--
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/github-ci.yaml b/.github/workflows/github-ci.yaml
index 820ccdcaa5..1eb3d9bd03 100644
--- a/.github/workflows/github-ci.yaml
+++ b/.github/workflows/github-ci.yaml
@@ -57,7 +57,7 @@ jobs:
     runs-on: ubuntu-20.04
     strategy:
       matrix:
-        python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"]
+        python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12", "3.13-dev"]
         use-crypto-lib: ["cryptography"]
         include:
           - python-version: "3.7"
@@ -90,7 +90,7 @@ jobs:
         cache-dependency-path: '**/requirements/ci.txt'
     - name: Setup Python (3.11+)
       uses: actions/setup-python@v5
-      if: matrix.python-version == '3.11' || matrix.python-version == '3.12'
+      if: matrix.python-version == '3.11' || matrix.python-version == '3.12' || matrix.python-version == '3.13-dev'
       with:
         python-version: ${{ matrix.python-version }}
         allow-prereleases: true
@@ -106,7 +106,7 @@ jobs:
     - name: Install requirements (Python 3.11+)
       run: |
         pip install -r requirements/ci-3.11.txt
-      if: matrix.python-version == '3.11' || matrix.python-version == '3.12'
+      if: matrix.python-version == '3.11' || matrix.python-version == '3.12' || matrix.python-version == '3.13-dev'
     - name: Remove pycryptodome and cryptography
       run: |
         pip uninstall pycryptodome cryptography -y
@@ -215,8 +215,8 @@ jobs:
       - name: Check Number of Downloaded Files
         run: |
           downloaded_files_count=$(find \.coverage* -type f | wc -l)
-          if [ $downloaded_files_count -eq 8 ]; then
-            echo "The expected number of files (8) were downloaded."
+          if [ $downloaded_files_count -eq 9 ]; then
+            echo "The expected number of files (9) were downloaded."
           else
-            echo "ERROR: Expected 8 files, but found $downloaded_files_count files."
+            echo "ERROR: Expected 9 files, but found $downloaded_files_count files."
             exit 1
diff --git a/requirements/ci-3.11.txt b/requirements/ci-3.11.txt
index f382fe2b94..2101771181 100644
--- a/requirements/ci-3.11.txt
+++ b/requirements/ci-3.11.txt
@@ -6,7 +6,7 @@
 #
 attrs==23.1.0
     # via flake8-bugbear
-coverage[toml]==7.3.0
+coverage[toml]==7.6.0
     # via
     #   -r requirements/ci.in
     #   pytest-cov
@@ -35,7 +35,7 @@ mypy-extensions==1.0.0
     # via mypy
 packaging==23.1
     # via pytest
-pillow==10.0.1
+pillow==10.4.0
     # via
     #   -r requirements/ci.in
     #   fpdf2

From d4df20d14cb6a2839c1ab141b51e70652fb3d1f1 Mon Sep 17 00:00:00 2001
From: j-t-1 <120829237+j-t-1@users.noreply.github.com>
Date: Wed, 31 Jul 2024 10:46:08 +0100
Subject: [PATCH 02/26] STY: Remove boolean value comparison (#2779)

PEP 8 recommendation.
---
 pypdf/annotations/_markup_annotations.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pypdf/annotations/_markup_annotations.py b/pypdf/annotations/_markup_annotations.py
index 4db8dfdbf0..98a222483b 100644
--- a/pypdf/annotations/_markup_annotations.py
+++ b/pypdf/annotations/_markup_annotations.py
@@ -104,9 +104,9 @@ def __init__(
         self[NameObject("/Rect")] = RectangleObject(rect)
 
         font_str = "font: "
-        if bold is True:
+        if bold:
             font_str = f"{font_str}bold "
-        if italic is True:
+        if italic:
             font_str = f"{font_str}italic "
         font_str = f"{font_str}{font} {font_size}"
         font_str = f"{font_str};text-align:left;color:#{font_color}"

From 3ad9234c2ec08e7cd6a8b2ec962386eda394d76d Mon Sep 17 00:00:00 2001
From: "William G. Gagnon" <wggagnon@poka.io>
Date: Fri, 2 Aug 2024 11:21:53 -0400
Subject: [PATCH 03/26] ROB: Handle images with empty data when processing an
 image from bytes (#2786)

Closes #2783.
---
 CONTRIBUTORS.md                     |  1 +
 pypdf/_xobj_image_helpers.py        |  9 ++++++---
 pypdf/errors.py                     |  4 ++++
 tests/test_xobject_image_helpers.py | 13 +++++++++++--
 4 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 84f0b6ee43..89fec3b14e 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -19,6 +19,7 @@ history and [GitHub's 'Contributors' feature](https://github.com/py-pdf/pypdf/gr
 * [ediamondscience](https://github.com/ediamondscience)
 * [Ermeson, Felipe](https://github.com/FelipeErmeson)
 * [Freitag, François](https://github.com/francoisfreitag)
+* [Gagnon, William G.](https://github.com/williamgagnon)
 * [Górny, Michał](https://github.com/mgorny)
 * [Grillo, Miguel](https://github.com/Ineffable22)
 * [Gutteridge, David H.](https://github.com/dhgutteridge)
diff --git a/pypdf/_xobj_image_helpers.py b/pypdf/_xobj_image_helpers.py
index 45b0c145be..5ae8894fa3 100644
--- a/pypdf/_xobj_image_helpers.py
+++ b/pypdf/_xobj_image_helpers.py
@@ -6,7 +6,7 @@
 
 from ._utils import check_if_whitespace_only, logger_warning
 from .constants import ColorSpaces
-from .errors import PdfReadError
+from .errors import EmptyImageDataError, PdfReadError
 from .generic import (
     ArrayObject,
     DecodedStreamObject,
@@ -148,9 +148,12 @@ def _extended_image_frombytes(
         img = Image.frombytes(mode, size, data)
     except ValueError as exc:
         nb_pix = size[0] * size[1]
-        if len(data) % nb_pix != 0:
+        data_length = len(data)
+        if data_length == 0:
+            raise EmptyImageDataError("Data is 0 bytes, cannot process an image from empty data.") from exc
+        if data_length % nb_pix != 0:
             raise exc
-        k = nb_pix * len(mode) / len(data)
+        k = nb_pix * len(mode) / data_length
         data = b"".join([bytes((x,) * int(k)) for x in data])
         img = Image.frombytes(mode, size, data)
     return img
diff --git a/pypdf/errors.py b/pypdf/errors.py
index c962dec662..ad197ffc11 100644
--- a/pypdf/errors.py
+++ b/pypdf/errors.py
@@ -59,4 +59,8 @@ class EmptyFileError(PdfReadError):
     """Raised when a PDF file is empty or has no content."""
 
 
+class EmptyImageDataError(PyPdfError):
+    """Raised when trying to process an image that has no data."""
+
+
 STREAM_TRUNCATED_PREMATURELY = "Stream has ended unexpectedly"
diff --git a/tests/test_xobject_image_helpers.py b/tests/test_xobject_image_helpers.py
index 63ecebd9b4..39b7131fcd 100644
--- a/tests/test_xobject_image_helpers.py
+++ b/tests/test_xobject_image_helpers.py
@@ -4,8 +4,8 @@
 import pytest
 
 from pypdf import PdfReader
-from pypdf._xobj_image_helpers import _handle_flate
-from pypdf.errors import PdfReadError
+from pypdf._xobj_image_helpers import _extended_image_frombytes, _handle_flate
+from pypdf.errors import EmptyImageDataError, PdfReadError
 from pypdf.generic import ArrayObject, DecodedStreamObject, NameObject, NumberObject
 
 from . import get_data_from_url
@@ -113,3 +113,12 @@ def test_handle_flate__image_mode_1():
             colors=2,
             obj_as_text="dummy",
         )
+
+
+def test_extended_image_frombytes_zero_data():
+    mode = "RGB"
+    size = (1, 1)
+    data = b""
+
+    with pytest.raises(EmptyImageDataError, match="Data is 0 bytes, cannot process an image from empty data."):
+        _extended_image_frombytes(mode, size, data)

From 582557e09a7e658fdcb19f26eb069d87875489f0 Mon Sep 17 00:00:00 2001
From: Diogo Teles Sant'Anna <diogoteles@google.com>
Date: Fri, 2 Aug 2024 15:49:29 -0300
Subject: [PATCH 04/26] SEC: Fix GitHub workflow vulnerable to script injection
 (#2787)

Signed-off-by: Diogo Teles Sant'Anna <diogoteles@google.com>
---
 .github/workflows/release.yaml | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
index 9f782ec080..b1a4fb27f3 100644
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -12,6 +12,9 @@ on:
 permissions:
   contents: write
 
+env:
+  HEAD_COMMIT_MESSAGE: ${{ github.event.head_commit.message }}
+
 jobs:
   build_and_publish:
     name: Publish a new version
@@ -24,7 +27,7 @@ jobs:
       - name: Extract version from commit message
         id: extract_version
         run: |
-          VERSION=$(echo "${{ github.event.head_commit.message }}" | grep -oP '(?<=REL: )\d+\.\d+\.\d+')
+          VERSION=$(echo "$HEAD_COMMIT_MESSAGE" | grep -oP '(?<=REL: )\d+\.\d+\.\d+')
           echo "version=$VERSION" >> $GITHUB_OUTPUT
 
       - name: Extract tag message from commit message
@@ -32,7 +35,7 @@ jobs:
         run: |
           VERSION="${{ steps.extract_version.outputs.version }}"
           delimiter="$(openssl rand -hex 8)"
-          MESSAGE=$(echo "${{ github.event.head_commit.message }}" | sed "0,/REL: $VERSION/s///" )
+          MESSAGE=$(echo "$HEAD_COMMIT_MESSAGE" | sed "0,/REL: $VERSION/s///" )
           echo "message<<${delimiter}" >> $GITHUB_OUTPUT
           echo "$MESSAGE" >> $GITHUB_OUTPUT
           echo "${delimiter}" >> $GITHUB_OUTPUT

From 38f3925502c2971ad587fb616500b6f8b6333d03 Mon Sep 17 00:00:00 2001
From: j-t-1 <120829237+j-t-1@users.noreply.github.com>
Date: Mon, 5 Aug 2024 09:10:47 +0100
Subject: [PATCH 05/26] MAINT: Remove unused paeth_predictor (#2773)

---
 pypdf/_utils.py     | 14 --------------
 tests/test_utils.py | 18 ------------------
 2 files changed, 32 deletions(-)

diff --git a/pypdf/_utils.py b/pypdf/_utils.py
index 38c0d67d7a..6569707b66 100644
--- a/pypdf/_utils.py
+++ b/pypdf/_utils.py
@@ -390,20 +390,6 @@ def ord_(b: Union[int, str, bytes]) -> Union[int, bytes]:
 WHITESPACES_AS_REGEXP = b"[" + WHITESPACES_AS_BYTES + b"]"
 
 
-def paeth_predictor(left: int, up: int, up_left: int) -> int:
-    p = left + up - up_left
-    dist_left = abs(p - left)
-    dist_up = abs(p - up)
-    dist_up_left = abs(p - up_left)
-
-    if dist_left <= dist_up and dist_left <= dist_up_left:
-        return left
-    elif dist_up <= dist_up_left:
-        return up
-    else:
-        return up_left
-
-
 def deprecate(msg: str, stacklevel: int = 3) -> None:
     warnings.warn(msg, DeprecationWarning, stacklevel=stacklevel)
 
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 81fcf9fb47..856bedd863 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -132,24 +132,6 @@ def test_deprecate_no_replacement():
     assert warn[0].message.args[0] == error_msg
 
 
-@pytest.mark.parametrize(
-    ("left", "up", "upleft", "expected"),
-    [
-        (0, 0, 0, 0),
-        (1, 0, 0, 1),
-        (0, 1, 0, 1),
-        (0, 0, 1, 0),
-        (1, 2, 3, 1),
-        (2, 1, 3, 1),
-        (1, 3, 2, 2),
-        (3, 1, 2, 2),
-        (3, 2, 1, 3),
-    ],
-)
-def test_paeth_predictor(left, up, upleft, expected):
-    assert pypdf._utils.paeth_predictor(left, up, upleft) == expected
-
-
 @pytest.mark.parametrize(
     ("dat", "pos", "to_read", "expected", "expected_pos"),
     [

From 09f9b7ed52193bfd9e98bdd018ccaf7cbe821687 Mon Sep 17 00:00:00 2001
From: j-t-1 <120829237+j-t-1@users.noreply.github.com>
Date: Mon, 5 Aug 2024 16:49:29 +0100
Subject: [PATCH 06/26] MAINT: Remove unused AnnotationFlag

---
 pypdf/annotations/_non_markup_annotations.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/pypdf/annotations/_non_markup_annotations.py b/pypdf/annotations/_non_markup_annotations.py
index dcdb3b0ff8..6272cceee6 100644
--- a/pypdf/annotations/_non_markup_annotations.py
+++ b/pypdf/annotations/_non_markup_annotations.py
@@ -1,6 +1,5 @@
 from typing import TYPE_CHECKING, Any, Optional, Tuple, Union
 
-from ..constants import AnnotationFlag
 from ..generic._base import (
     BooleanObject,
     NameObject,
@@ -12,8 +11,6 @@
 from ..generic._rectangle import RectangleObject
 from ._base import AnnotationDictionary
 
-DEFAULT_ANNOTATION_FLAG = AnnotationFlag(0)
-
 
 class Link(AnnotationDictionary):
     def __init__(

From b2d72043ab5221b58138c7d06c181b8cbc88ea8e Mon Sep 17 00:00:00 2001
From: owurman <oded@acidremap.com>
Date: Mon, 5 Aug 2024 12:14:18 -0700
Subject: [PATCH 07/26] BUG: Handle Sequence as an IndirectObject when
 extracting text with layout mode (#2788)

* Handle Sequence as an IndirectObject

The spec allows an int or float to be an IndirectObject as well, but this commit does not address that theoretical possibility.

* Update pypdf/_text_extraction/_layout_mode/_font.py

Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com>

* Address PR comments

-Rename w_1 to w_next_entry
-Utilize ParseError instead of PdfReadError
-Write a test (both positive and negative)

* Handle unlikely case of IndirectObjects for float/int width elements

Also adds a comment to clarify that we don't explicitly handle the IndexError exception. Rather, we let it be raised as an IndexError.

* Yoda condition I removed

* Last commit was a bad patch, confused by non-committed changes

* Use test files from URL rather than resources

* Update tests/test_text_extraction.py

Co-authored-by: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>

* Fix code style warnings in range() call

---------

Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com>
Co-authored-by: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
---
 pypdf/_text_extraction/_layout_mode/_font.py | 26 +++++++++++++-------
 tests/test_text_extraction.py                | 17 +++++++++++++
 2 files changed, 34 insertions(+), 9 deletions(-)

diff --git a/pypdf/_text_extraction/_layout_mode/_font.py b/pypdf/_text_extraction/_layout_mode/_font.py
index a912fddb27..40655b1b22 100644
--- a/pypdf/_text_extraction/_layout_mode/_font.py
+++ b/pypdf/_text_extraction/_layout_mode/_font.py
@@ -1,8 +1,9 @@
 """Font constants and classes for "layout" mode text operations"""
 
 from dataclasses import dataclass, field
-from typing import Any, Dict, Sequence, Union
+from typing import Any, Dict, Sequence, Union, cast
 
+from ...errors import ParseError
 from ...generic import IndirectObject
 from ._font_widths import STANDARD_WIDTHS
 
@@ -58,6 +59,7 @@ def __post_init__(self) -> None:
                 skip_count = 0
                 _w = d_font.get("/W", [])
                 for idx, w_entry in enumerate(_w):
+                    w_entry = w_entry.get_object()
                     if skip_count:
                         skip_count -= 1
                         continue
@@ -66,13 +68,14 @@ def __post_init__(self) -> None:
                         # warning and or use reader's "strict" to force an ex???
                         continue
                     # check for format (1): `int [int int int int ...]`
-                    if isinstance(_w[idx + 1], Sequence):
-                        start_idx, width_list = _w[idx : idx + 2]
+                    w_next_entry = _w[idx + 1].get_object()
+                    if isinstance(w_next_entry, Sequence):
+                        start_idx, width_list = w_entry, w_next_entry
                         self.width_map.update(
                             {
                                 ord_map[_cidx]: _width
                                 for _cidx, _width in zip(
-                                    range(start_idx, start_idx + len(width_list), 1),
+                                    range(cast(int, start_idx), cast(int, start_idx) + len(width_list), 1),
                                     width_list,
                                 )
                                 if _cidx in ord_map
@@ -80,18 +83,23 @@ def __post_init__(self) -> None:
                         )
                         skip_count = 1
                     # check for format (2): `int int int`
-                    if not isinstance(_w[idx + 1], Sequence) and not isinstance(
-                        _w[idx + 2], Sequence
-                    ):
-                        start_idx, stop_idx, const_width = _w[idx : idx + 3]
+                    elif isinstance(w_next_entry, (int, float)) and isinstance(_w[idx + 2].get_object(), (int, float)):
+                        start_idx, stop_idx, const_width = w_entry, w_next_entry, _w[idx + 2].get_object()
                         self.width_map.update(
                             {
                                 ord_map[_cidx]: const_width
-                                for _cidx in range(start_idx, stop_idx + 1, 1)
+                                for _cidx in range(cast(int, start_idx), cast(int, stop_idx + 1), 1)
                                 if _cidx in ord_map
                             }
                         )
                         skip_count = 2
+                    else:
+                        # Note: this doesn't handle the case of out of bounds (reaching the end of the width definitions
+                        # while expecting more elements). This raises an IndexError which is sufficient.
+                        raise ParseError(
+                            f"Invalid font width definition. Next elements: {w_entry}, {w_next_entry}, {_w[idx + 2]}"
+                        )  # pragma: no cover
+
         if not self.width_map and "/BaseFont" in self.font_dictionary:
             for key in STANDARD_WIDTHS:
                 if self.font_dictionary["/BaseFont"].startswith(f"/{key}"):
diff --git a/tests/test_text_extraction.py b/tests/test_text_extraction.py
index 1ffa68a3e6..dcd4e6caeb 100644
--- a/tests/test_text_extraction.py
+++ b/tests/test_text_extraction.py
@@ -10,6 +10,7 @@
 
 from pypdf import PdfReader, mult
 from pypdf._text_extraction import set_custom_rtl
+from pypdf.errors import ParseError
 
 from . import get_data_from_url
 
@@ -156,3 +157,19 @@ def test_layout_mode_type0_font_widths():
         encoding="utf-8"
     )
     assert expected == reader.pages[0].extract_text(extraction_mode="layout")
+
+
+@pytest.mark.enable_socket()
+def test_layout_mode_indirect_sequence_font_widths():
+    # Cover the situation where the sequence for font widths is an IndirectObject
+    # ref https://github.com/py-pdf/pypdf/pull/2788
+    url = "https://github.com/user-attachments/files/16491621/2788_example.pdf"
+    name = "2788_example.pdf"
+    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
+    assert reader.pages[0].extract_text(extraction_mode="layout") == ""
+    url = "https://github.com/user-attachments/files/16491619/2788_example_malformed.pdf"
+    name = "2788_example_malformed.pdf"
+    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
+    with pytest.raises(ParseError) as exc:
+        reader.pages[0].extract_text(extraction_mode="layout")
+    assert str(exc.value).startswith("Invalid font width definition")  # outside the block so it actually runs

From 5abd590740a2718fc69b8477c656ce5515a0ab33 Mon Sep 17 00:00:00 2001
From: j-t-1 <120829237+j-t-1@users.noreply.github.com>
Date: Wed, 7 Aug 2024 12:14:17 +0100
Subject: [PATCH 08/26] STY: Refactor b_ (#2772)

---
 pypdf/_utils.py | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/pypdf/_utils.py b/pypdf/_utils.py
index 6569707b66..5fecb38e7d 100644
--- a/pypdf/_utils.py
+++ b/pypdf/_utils.py
@@ -347,14 +347,11 @@ def b_(s: Union[str, bytes]) -> bytes:
         return bc[s]
     try:
         r = s.encode("latin-1")
-        if len(s) < 2:
-            bc[s] = r
-        return r
-    except Exception:
+    except UnicodeEncodeError:
         r = s.encode("utf-8")
-        if len(s) < 2:
-            bc[s] = r
-        return r
+    if len(s) < 2:
+        bc[s] = r
+    return r
 
 
 def str_(b: Any) -> str:

From 8d30c88a95113fcc37938ae45e8bb03ee5d76442 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Sun, 11 Aug 2024 20:25:34 +0200
Subject: [PATCH 09/26] ENH: compress pdf files merging identical objects

add compress_identical_objects()
discovered in #2728
closes #2794
closes #2768
---
 pypdf/_writer.py       | 233 +++++++++++++++++++++--------------------
 pypdf/generic/_base.py |   3 +
 2 files changed, 122 insertions(+), 114 deletions(-)

diff --git a/pypdf/_writer.py b/pypdf/_writer.py
index 00b9d498c0..93c474db26 100644
--- a/pypdf/_writer.py
+++ b/pypdf/_writer.py
@@ -27,11 +27,11 @@
 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 # POSSIBILITY OF SUCH DAMAGE.
 
-import collections
 import decimal
 import enum
 import hashlib
 import re
+import sys
 import uuid
 from io import BytesIO, FileIO, IOBase
 from pathlib import Path
@@ -40,7 +40,6 @@
     IO,
     Any,
     Callable,
-    Deque,
     Dict,
     Iterable,
     List,
@@ -157,12 +156,17 @@ def __init__(
         clone_from: Union[None, PdfReader, StrByteType, Path] = None,
     ) -> None:
         self._header = b"%PDF-1.3"
-        self._objects: List[PdfObject] = []
+        self._objects: List[Optional[PdfObject]] = []
         """The indirect objects in the PDF."""
 
-        self._idnum_hash: Dict[bytes, IndirectObject] = {}
-        """Maps hash values of indirect objects to their IndirectObject instances."""
+        """Maps hash values of indirect objects to the list of IndirectObjects.
+           This is used for compression
+        """
+        self._idnum_hash: Dict[bytes, Tuple[IndirectObject, List[IndirectObject]]] = {}
 
+        """list of translation already done.
+           dict[id(pdf)][(idnum, generation)]
+        """
         self._id_translated: Dict[int, Dict[int, int]] = {}
 
         # The root of our page tree node.
@@ -371,10 +375,13 @@ def get_object(
         indirect_reference: Union[int, IndirectObject],
     ) -> PdfObject:
         if isinstance(indirect_reference, int):
-            return self._objects[indirect_reference - 1]
-        if indirect_reference.pdf != self:
+            obj = self._objects[indirect_reference - 1]
+        elif indirect_reference.pdf != self:
             raise ValueError("pdf must be self")
-        return self._objects[indirect_reference.idnum - 1]
+        else:
+            obj = self._objects[indirect_reference.idnum - 1]
+        assert obj is not None
+        return obj
 
     def _replace_object(
         self,
@@ -393,6 +400,7 @@ def _replace_object(
             obj = obj.clone(self)
         self._objects[indirect_reference - 1] = obj
         obj.indirect_reference = IndirectObject(indirect_reference, gen, self)
+        assert obj is not None  # obj was just stored and dereferenced above, so it cannot be None
         return self._objects[indirect_reference - 1]
 
     def _add_page(
@@ -1246,10 +1254,10 @@ def write_stream(self, stream: StreamType) -> None:
         if not self._root:
             self._root = self._add_object(self._root_object)
 
-        self._sweep_indirect_references(self._root)
+        # no longer used: self._sweep_indirect_references(self._root)
 
-        object_positions = self._write_pdf_structure(stream)
-        xref_location = self._write_xref_table(stream, object_positions)
+        object_positions, free_objects = self._write_pdf_structure(stream)
+        xref_location = self._write_xref_table(stream, object_positions, free_objects)
         self._write_trailer(stream, xref_location)
 
     def write(self, stream: Union[Path, StrByteType]) -> Tuple[bool, IO[Any]]:
@@ -1282,8 +1290,9 @@ def write(self, stream: Union[Path, StrByteType]) -> Tuple[bool, IO[Any]]:
 
         return my_file, stream
 
-    def _write_pdf_structure(self, stream: StreamType) -> List[int]:
+    def _write_pdf_structure(self, stream: StreamType) -> Tuple[List[int], List[int]]:
         object_positions = []
+        free_objects = []  # will contain list of all free entries
         stream.write(self.pdf_header.encode() + b"\n")
         stream.write(b"%\xE2\xE3\xCF\xD3\n")
 
@@ -1296,15 +1305,26 @@ def _write_pdf_structure(self, stream: StreamType) -> List[int]:
                     obj = self._encryption.encrypt_object(obj, idnum, 0)
                 obj.write_to_stream(stream)
                 stream.write(b"\nendobj\n")
-        return object_positions
-
-    def _write_xref_table(self, stream: StreamType, object_positions: List[int]) -> int:
+            else:
+                object_positions.append(-1)
+                free_objects.append(i + 1)
+        free_objects.append(0)  # add 0 to loop iaw PDF spec
+        return object_positions, free_objects
+
+    def _write_xref_table(
+        self, stream: StreamType, object_positions: List[int], free_objects: List[int]
+    ) -> int:
         xref_location = stream.tell()
         stream.write(b"xref\n")
         stream.write(f"0 {len(self._objects) + 1}\n".encode())
-        stream.write(f"{0:0>10} {65535:0>5} f \n".encode())
+        stream.write(f"{free_objects[0]:0>10} {65535:0>5} f \n".encode())
+        free_idx = 1
         for offset in object_positions:
-            stream.write(f"{offset:0>10} {0:0>5} n \n".encode())
+            if offset > 0:
+                stream.write(f"{offset:0>10} {0:0>5} n \n".encode())
+            else:
+                stream.write(f"{free_objects[free_idx]:0>10} {1:0>5} f \n".encode())
+                free_idx += 1
         return xref_location
 
     def _write_trailer(self, stream: StreamType, xref_location: int) -> None:
@@ -1349,6 +1369,73 @@ def add_metadata(self, infos: Dict[str, Any]) -> None:
         assert isinstance(self._info, DictionaryObject)
         self._info.update(args)
 
+    def compress_identical_objects(self, verbose: Union[int, bool] = -1) -> None:
+        """
+        Scan the writer's objects and merge those that have the same hash.
+        This lets identical objects be shared, e.g. between multiple pages.
+        Recommended to be used just before writing output.
+
+        Args:
+            verbose: provide some progress information (printed to stderr).
+                int : frequency of progress update; disable if negative
+                bool : True => 100 ; False => -1
+        """
+
+        def replace_in_obj(
+            obj: PdfObject, crossref: Dict[IndirectObject, IndirectObject]
+        ) -> None:
+            if isinstance(obj, DictionaryObject):
+                key_val = obj.items()
+            elif isinstance(obj, ArrayObject):
+                key_val = enumerate(obj)  # type: ignore
+            else:
+                return
+            assert isinstance(obj, (DictionaryObject, ArrayObject))
+            for k, v in key_val:
+                if isinstance(v, IndirectObject) and v in crossref:
+                    obj[k] = crossref[v]
+                else:  # if isinstance(v, (DictionaryObject, ArrayObject)):
+                    replace_in_obj(v, crossref)
+
+        # _idnum_hash :dict[hash]=(1st_ind_obj,[other_indir_objs,...])
+        self._idnum_hash = {}
+        if isinstance(verbose, bool):  # bool is a subclass of int: test it first
+            cpt_init = 100 if verbose else -1
+        else:
+            cpt_init = verbose
+        cpt = cpt_init
+        # look for similar objects
+        for idx, obj in enumerate(self._objects):
+            if obj is None:
+                continue
+            assert isinstance(obj.indirect_reference, IndirectObject)
+            h = obj.hash_value()
+            if cpt == 0:
+                print("+", end="", file=sys.stderr)  # noqa: T201
+                cpt = cpt_init
+            cpt -= 1
+            if h in self._idnum_hash:
+                self._idnum_hash[h][1].append(obj.indirect_reference)
+                self._objects[idx] = None
+            else:
+                self._idnum_hash[h] = (obj.indirect_reference, [])
+
+        # generate the dict converting others to 1st
+        cnv = {v[0]: v[1] for v in self._idnum_hash.values() if len(v[1]) > 0}
+        cnv_rev: Dict[IndirectObject, IndirectObject] = {}
+        for k, v in cnv.items():
+            cnv_rev.update(zip(v, (k,) * len(v)))
+        cpt = cpt_init
+
+        # replace reference to merged objects
+        for obj in self._objects:
+            if isinstance(obj, (DictionaryObject, ArrayObject)):
+                if cpt == 0:
+                    print(".", end="", file=sys.stderr)  # noqa: T201
+                    cpt = cpt_init
+                cpt -= 1
+                replace_in_obj(obj, cnv_rev)
+
     def _sweep_indirect_references(
         self,
         root: Union[
@@ -1363,7 +1450,7 @@ def _sweep_indirect_references(
             TextStringObject,
             NullObject,
         ],
-    ) -> None:
+    ) -> None:  # deprecated
         """
         Resolving any circular references to Page objects.
 
@@ -1379,73 +1466,15 @@ def _sweep_indirect_references(
         Args:
             root: The root of the PDF object tree to sweep.
         """
-        stack: Deque[
-            Tuple[
-                Any,
-                Optional[Any],
-                Any,
-                List[PdfObject],
-            ]
-        ] = collections.deque()
-        discovered = []
-        parent = None
-        grant_parents: List[PdfObject] = []
-        key_or_id = None
-
-        # Start from root
-        stack.append((root, parent, key_or_id, grant_parents))
-
-        while len(stack):
-            data, parent, key_or_id, grant_parents = stack.pop()
-
-            # Build stack for a processing depth-first
-            if isinstance(data, (ArrayObject, DictionaryObject)):
-                for key, value in data.items():
-                    stack.append(
-                        (
-                            value,
-                            data,
-                            key,
-                            grant_parents + [parent] if parent is not None else [],
-                        )
-                    )
-            elif isinstance(data, IndirectObject) and data.pdf != self:
-                data = self._resolve_indirect_object(data)
-
-                if str(data) not in discovered:
-                    discovered.append(str(data))
-                    stack.append((data.get_object(), None, None, []))
-
-            # Check if data has a parent and if it is a dict or
-            # an array update the value
-            if isinstance(parent, (DictionaryObject, ArrayObject)):
-                if isinstance(data, StreamObject):
-                    # a dictionary value is a stream; streams must be indirect
-                    # objects, so we need to change this value.
-                    data = self._resolve_indirect_object(self._add_object(data))
-
-                update_hashes = []
-
-                # Data changed and thus the hash value changed
-                if parent[key_or_id] != data:
-                    update_hashes = [parent.hash_value()] + [
-                        grant_parent.hash_value() for grant_parent in grant_parents
-                    ]
-                    parent[key_or_id] = data
-
-                # Update old hash value to new hash value
-                for old_hash in update_hashes:
-                    indirect_reference = self._idnum_hash.pop(old_hash, None)
-
-                    if indirect_reference is not None:
-                        indirect_reference_obj = indirect_reference.get_object()
-
-                        if indirect_reference_obj is not None:
-                            self._idnum_hash[
-                                indirect_reference_obj.hash_value()
-                            ] = indirect_reference
+        deprecate_with_replacement(
+            "_sweep_indirect_references",
+            "no replacement, please report to dev team if this warning is observed",
+            "5.0.0",
+        )
 
-    def _resolve_indirect_object(self, data: IndirectObject) -> IndirectObject:
+    def _resolve_indirect_object(
+        self, data: IndirectObject
+    ) -> IndirectObject:  # deprecated
         """
         Resolves an indirect object to an indirect object in this PDF file.
 
@@ -1470,36 +1499,12 @@ def _resolve_indirect_object(self, data: IndirectObject) -> IndirectObject:
         Raises:
             ValueError: If the input stream is closed.
         """
-        if hasattr(data.pdf, "stream") and data.pdf.stream.closed:
-            raise ValueError(f"I/O operation on closed file: {data.pdf.stream.name}")
-
-        if data.pdf == self:
-            return data
-
-        # Get real object indirect object
-        real_obj = data.pdf.get_object(data)
-
-        if real_obj is None:
-            logger_warning(
-                f"Unable to resolve [{data.__class__.__name__}: {data}], "
-                "returning NullObject instead",
-                __name__,
-            )
-            real_obj = NullObject()
-
-        hash_value = real_obj.hash_value()
-
-        # Check if object is handled
-        if hash_value in self._idnum_hash:
-            return self._idnum_hash[hash_value]
-
-        if data.pdf == self:
-            self._idnum_hash[hash_value] = IndirectObject(data.idnum, 0, self)
-        # This is new object in this pdf
-        else:
-            self._idnum_hash[hash_value] = self._add_object(real_obj)
-
-        return self._idnum_hash[hash_value]
+        deprecate_with_replacement(
+            "_resolve_indirect_object",
+            "no replacement, please report to dev team if this warning is observed",
+            "5.0.0",
+        )
+        return IndirectObject(0, 0, self)
 
     def get_reference(self, obj: PdfObject) -> IndirectObject:
         idnum = self._objects.index(obj) + 1
diff --git a/pypdf/generic/_base.py b/pypdf/generic/_base.py
index 2d606b4184..35ce956cc9 100644
--- a/pypdf/generic/_base.py
+++ b/pypdf/generic/_base.py
@@ -240,6 +240,9 @@ def __init__(self, idnum: int, generation: int, pdf: Any) -> None:  # PdfReader
         self.generation = generation
         self.pdf = pdf
 
+    def __hash__(self) -> int:
+        return hash((self.idnum, self.generation, id(self.pdf)))
+
     def clone(
         self,
         pdf_dest: PdfWriterProtocol,

From 86fd7c713fca5b676098504c565dd09965e9463a Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Sun, 11 Aug 2024 20:28:08 +0200
Subject: [PATCH 10/26] pre-commit error reported

---
 pypdf/_text_extraction/_layout_mode/_font.py | 22 +++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/pypdf/_text_extraction/_layout_mode/_font.py b/pypdf/_text_extraction/_layout_mode/_font.py
index 40655b1b22..1d9617d74a 100644
--- a/pypdf/_text_extraction/_layout_mode/_font.py
+++ b/pypdf/_text_extraction/_layout_mode/_font.py
@@ -44,7 +44,7 @@ def __post_init__(self) -> None:
                 self.font_dictionary["/DescendantFonts"]
             ):
                 while isinstance(d_font, IndirectObject):
-                    d_font = d_font.get_object()  # type: ignore[assignment]
+                    d_font = d_font.get_object()
                 self.font_dictionary["/DescendantFonts"][d_font_idx] = d_font
                 ord_map = {
                     ord(_target): _surrogate
@@ -75,7 +75,11 @@ def __post_init__(self) -> None:
                             {
                                 ord_map[_cidx]: _width
                                 for _cidx, _width in zip(
-                                    range(cast(int, start_idx), cast(int, start_idx) + len(width_list), 1),
+                                    range(
+                                        cast(int, start_idx),
+                                        cast(int, start_idx) + len(width_list),
+                                        1,
+                                    ),
                                     width_list,
                                 )
                                 if _cidx in ord_map
@@ -83,12 +87,20 @@ def __post_init__(self) -> None:
                         )
                         skip_count = 1
                     # check for format (2): `int int int`
-                    elif isinstance(w_next_entry, (int, float)) and isinstance(_w[idx + 2].get_object(), (int, float)):
-                        start_idx, stop_idx, const_width = w_entry, w_next_entry, _w[idx + 2].get_object()
+                    elif isinstance(w_next_entry, (int, float)) and isinstance(
+                        _w[idx + 2].get_object(), (int, float)
+                    ):
+                        start_idx, stop_idx, const_width = (
+                            w_entry,
+                            w_next_entry,
+                            _w[idx + 2].get_object(),
+                        )
                         self.width_map.update(
                             {
                                 ord_map[_cidx]: const_width
-                                for _cidx in range(cast(int, start_idx), cast(int, stop_idx + 1), 1)
+                                for _cidx in range(
+                                    cast(int, start_idx), cast(int, stop_idx + 1), 1
+                                )
                                 if _cidx in ord_map
                             }
                         )

From f91f131c581bc1015232ae68931495249738d747 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Sun, 11 Aug 2024 20:46:22 +0200
Subject: [PATCH 11/26] oups

---
 pypdf/_writer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pypdf/_writer.py b/pypdf/_writer.py
index 93c474db26..7c4cbbd96e 100644
--- a/pypdf/_writer.py
+++ b/pypdf/_writer.py
@@ -400,8 +400,8 @@ def _replace_object(
             obj = obj.clone(self)
         self._objects[indirect_reference - 1] = obj
         obj.indirect_reference = IndirectObject(indirect_reference, gen, self)
-        assert obj is None
-        return self._objects[indirect_reference - 1]
+        assert isinstance(obj, IndirectObject)
+        return obj
 
     def _add_page(
         self,

From db67c0b6eb8a39f975de2decbc9a205c36bb0185 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Sun, 11 Aug 2024 21:39:43 +0200
Subject: [PATCH 12/26] doc

---
 docs/user/file-size.md | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/docs/user/file-size.md b/docs/user/file-size.md
index 0ee72e37e3..4908bda086 100644
--- a/docs/user/file-size.md
+++ b/docs/user/file-size.md
@@ -9,23 +9,24 @@ Some PDF documents contain the same object multiple times. For example, if an
 image appears three times in a PDF it could be embedded three times. Or it can
 be embedded once and referenced twice.
 
-This can be done by reading and writing the file:
+When adding data to a PdfWriter, the data are copied respecting the original format.
+For a example if two pages includes the same image which is duplicated, in the source document, the object will be duplicated in the PdfWriter object
 
-```python
-from pypdf import PdfReader, PdfWriter
+Also when you delete objects in a document, pypdf can not easily identify weither the object is used or not elsewhere or if the user wants to keep then in. When writing the pdf file these objects will be hidden(part of the file but not displayed) within.
 
-reader = PdfReader("big-old-file.pdf")
-writer = PdfWriter()
+in order to reduce the file size a compression process:
+`writer.compress_identical_objects(remove_identical: byte = True, remove_orphans:byte = True, verbose: int = -1)`
 
-for page in reader.pages:
-    writer.add_page(page)
+`remove_identical` enables / disables compression merging identical objects
+`remove_orphans` enables / disables suppression of unused objects
+`verbose` sets the value on how many objects are processed
+the progress status (printed on stderr) of the compression is printed as follow:
+		'+' during initial loop
+		'.' when removing duplicates
+		'\*' when removing hidden objects
 
-if reader.metadata is not None:
-    writer.add_metadata(reader.metadata)
+It is recommended to apply this process just before writing to file/stream
 
-with open("smaller-new-file.pdf", "wb") as fp:
-    writer.write(fp)
-```
 
 It depends on the PDF how well this works, but we have seen an 86% file
 reduction (from 5.7 MB to 0.8 MB) within a real PDF.

From 5bad76c2f2bc0957c95f83cd215df55a2c41d517 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Sun, 11 Aug 2024 22:38:46 +0200
Subject: [PATCH 13/26] WIP on iss2794

---
 pypdf/_writer.py | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/pypdf/_writer.py b/pypdf/_writer.py
index 7c4cbbd96e..4d1e534015 100644
--- a/pypdf/_writer.py
+++ b/pypdf/_writer.py
@@ -62,6 +62,7 @@
     StreamType,
     _get_max_pdf_version_header,
     b_,
+    deprecate,
     deprecate_with_replacement,
     logger_warning,
 )
@@ -400,7 +401,8 @@ def _replace_object(
             obj = obj.clone(self)
         self._objects[indirect_reference - 1] = obj
         obj.indirect_reference = IndirectObject(indirect_reference, gen, self)
-        assert isinstance(obj, IndirectObject)
+
+        assert isinstance(obj, PdfObject)
         return obj
 
     def _add_page(
@@ -1466,10 +1468,8 @@ def _sweep_indirect_references(
         Args:
             root: The root of the PDF object tree to sweep.
         """
-        deprecate_with_replacement(
-            "_sweep_indirect_references",
-            "no replacement, please report to dev team if this warning is observed",
-            "5.0.0",
+        deprecate(
+            "_sweep_indirect_references has been removed, please report to dev team if this warning is observed",
         )
 
     def _resolve_indirect_object(
@@ -1499,10 +1499,8 @@ def _resolve_indirect_object(
         Raises:
             ValueError: If the input stream is closed.
         """
-        deprecate_with_replacement(
-            "_resolve_indirect_object",
-            "no replacement, please report to dev team if this warning is observed",
-            "5.0.0",
+        deprecate(
+            "_resolve_indirect_object has been removed, please report to dev team if this warning is observed",
         )
         return IndirectObject(0, 0, self)
 

From bd4b67297c8f18d3d15016ca4caf158c8fadfed1 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Mon, 12 Aug 2024 00:48:56 +0200
Subject: [PATCH 14/26] add remove_orphans  + test

---
 pypdf/_writer.py     | 50 ++++++++++++++++++++++++++++++++------------
 tests/test_writer.py | 23 ++++++++++++++++++++
 2 files changed, 60 insertions(+), 13 deletions(-)

diff --git a/pypdf/_writer.py b/pypdf/_writer.py
index 4d1e534015..4bd71482a9 100644
--- a/pypdf/_writer.py
+++ b/pypdf/_writer.py
@@ -34,6 +34,7 @@
 import sys
 import uuid
 from io import BytesIO, FileIO, IOBase
+from itertools import compress
 from pathlib import Path
 from types import TracebackType
 from typing import (
@@ -1252,11 +1253,10 @@ def write_stream(self, stream: StreamType) -> None:
                 "It may not be written to correctly.",
                 __name__,
             )
-
-        if not self._root:
-            self._root = self._add_object(self._root_object)
-
-        # no more used : self._sweep_indirect_references(self._root)
+        # no more used :
+        # if not self._root:
+        #   self._root = self._add_object(self._root_object)
+        # self._sweep_indirect_references(self._root)
 
         object_positions, free_objects = self._write_pdf_structure(stream)
         xref_location = self._write_xref_table(stream, object_positions, free_objects)
@@ -1371,16 +1371,22 @@ def add_metadata(self, infos: Dict[str, Any]) -> None:
         assert isinstance(self._info, DictionaryObject)
         self._info.update(args)
 
-    def compress_identical_objects(self, verbose: Union[int, bool] = -1) -> None:
+    def compress_identical_objects(
+        self,
+        remove_identicals: bool = True,
+        remove_orphans: bool = True,
+        verbose: int = -1,
+    ) -> None:
         """
         Parse the Pdf file and merge objects that have same harsh.
         This will make objects common to multiple pages
         Recommended to be used just before writing output
 
         Args:
-            verbose: provide some progress information.
-                int : frequence of progress update; disable if negative
-                bool : True => 100 ; False = -1
+            remove_identicals: remove of identical objects
+            remove_orphans: remove of unreferenced objects
+            verbose: frequence of progress update; <0 => disable
+
         """
 
         def replace_in_obj(
@@ -1394,9 +1400,13 @@ def replace_in_obj(
                 return
             assert isinstance(obj, (DictionaryObject, ArrayObject))
             for k, v in key_val:
-                if isinstance(v, IndirectObject) and v in crossref:
-                    obj[k] = crossref[v]
-                else:  # if isinstance(v, (DictionaryObject, ArrayObject)):
+                if isinstance(v, IndirectObject):
+                    orphans[v.idnum - 1] = False
+                    if v in crossref:
+                        obj[k] = crossref[v]
+                else:
+                    """the filtering on DictionaryObject and ArrayObject only
+                    will be performed within replace_in_obj"""
                     replace_in_obj(v, crossref)
 
         # _idnum_hash :dict[hash]=(1st_ind_obj,[other_indir_objs,...])
@@ -1406,6 +1416,7 @@ def replace_in_obj(
         else:
             cpt_init = 100 if verbose else -1
         cpt = cpt_init
+        orphans = [True] * len(self._objects)
         # look for similar objects
         for idx, obj in enumerate(self._objects):
             if obj is None:
@@ -1416,7 +1427,7 @@ def replace_in_obj(
                 print("+", end="", file=sys.stderr)  # noqa: T201
                 cpt = cpt_init
             cpt -= 1
-            if h in self._idnum_hash:
+            if remove_identicals and h in self._idnum_hash:
                 self._idnum_hash[h][1].append(obj.indirect_reference)
                 self._objects[idx] = None
             else:
@@ -1438,6 +1449,19 @@ def replace_in_obj(
                 cpt -= 1
                 replace_in_obj(obj, cnv_rev)
 
+        # remove orphans (if applicable)
+        orphans[self.root_object.indirect_reference.idnum - 1] = False  # type: ignore
+        try:
+            orphans[self._info.indirect_reference.idnum - 1] = False  # type: ignore
+        except Exception:
+            pass
+        try:
+            orphans[self._ID.indirect_reference.idnum - 1] = False  # type: ignore
+        except Exception:
+            pass
+        for i in compress(range(len(self._objects)), orphans):
+            self._objects[i] = None
+
     def _sweep_indirect_references(
         self,
         root: Union[
diff --git a/tests/test_writer.py b/tests/test_writer.py
index 9dfeffdd89..c06fede086 100644
--- a/tests/test_writer.py
+++ b/tests/test_writer.py
@@ -2300,3 +2300,26 @@ def test_matrix_entry_in_field_annots():
         auto_regenerate=False,
     )
     assert "/Matrix" in writer.pages[0]["/Annots"][5].get_object()["/AP"]["/N"]
+
+
+@pytest.mark.enable_socket()
+def test_compress_identical_objects():
+    """Cf #2728 and #2794"""
+    url = "https://github.com/user-attachments/files/16575458/tt2.pdf"
+    name = "iss2794.pdf"
+    in_bytes = BytesIO(get_data_from_url(url, name=name))
+    writer = PdfWriter(in_bytes)
+    writer.compress_identical_objects(remove_orphans=False, verbose=100)
+    out1 = BytesIO()
+    writer.write(out1)
+    assert 0.5 * len(in_bytes.getvalue()) > len(out1.getvalue())
+    writer.remove_page(
+        1
+    )  # page0 contains fields which keep reference to the deleted page
+    out2 = BytesIO()
+    writer.write(out2)
+    assert len(out1.getvalue()) - 100 < len(out2.getvalue())
+    writer.compress_identical_objects(remove_identicals=False, verbose=100)
+    out3 = BytesIO()
+    writer.write(out3)
+    assert len(out2.getvalue()) > len(out3.getvalue())

From 380233bfac83ad565e72756a889007c96cc37b7d Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Mon, 12 Aug 2024 01:41:27 +0200
Subject: [PATCH 15/26] doc + coverage

---
 docs/user/file-size.md | 11 ++++++-----
 pypdf/_writer.py       |  6 +++---
 tests/test_writer.py   |  2 +-
 3 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/docs/user/file-size.md b/docs/user/file-size.md
index 4908bda086..96ce6fab56 100644
--- a/docs/user/file-size.md
+++ b/docs/user/file-size.md
@@ -12,18 +12,19 @@ be embedded once and referenced twice.
 When adding data to a PdfWriter, the data are copied respecting the original format.
 For a example if two pages includes the same image which is duplicated, in the source document, the object will be duplicated in the PdfWriter object
 
-Also when you delete objects in a document, pypdf can not easily identify weither the object is used or not elsewhere or if the user wants to keep then in. When writing the pdf file these objects will be hidden(part of the file but not displayed) within.
+Also when you delete objects in a document, pypdf can not easily identify weither the objects are used or not elsewhere or if the user wants to keep then in. When writing the pdf file these objects will be hidden(part of the file but not displayed) within.
 
 in order to reduce the file size a compression process:
-`writer.compress_identical_objects(remove_identical: byte = True, remove_orphans:byte = True, verbose: int = -1)`
+`writer.compress_identical_objects(remove_identicals = True, remove_orphans= True, verbose = -1)`
 
 `remove_identical` enables / disables compression merging identical objects
+
 `remove_orphans` enables / disables suppression of unused objects
+
 `verbose` sets the value on how many objects are processed
 the progress status (printed on stderr) of the compression is printed as follow:
-		'+' during initial loop
-		'.' when removing duplicates
-		'\*' when removing hidden objects
+* '+' during initial loop
+* '.' when replacing duplicates
 
 It is recommended to apply this process just before writing to file/stream
 
diff --git a/pypdf/_writer.py b/pypdf/_writer.py
index 4bd71482a9..65a1d96df9 100644
--- a/pypdf/_writer.py
+++ b/pypdf/_writer.py
@@ -1414,7 +1414,7 @@ def replace_in_obj(
         if isinstance(verbose, int):
             cpt_init = verbose
         else:
-            cpt_init = 100 if verbose else -1
+            cpt_init = -1
         cpt = cpt_init
         orphans = [True] * len(self._objects)
         # look for similar objects
@@ -1453,8 +1453,8 @@ def replace_in_obj(
         orphans[self.root_object.indirect_reference.idnum - 1] = False  # type: ignore
         try:
             orphans[self._info.indirect_reference.idnum - 1] = False  # type: ignore
-        except Exception:
-            pass
+        except Exception:  # pragma: no cover
+            pass  # pragma: no cover
         try:
             orphans[self._ID.indirect_reference.idnum - 1] = False  # type: ignore
         except Exception:
diff --git a/tests/test_writer.py b/tests/test_writer.py
index c06fede086..9ecaa25ca8 100644
--- a/tests/test_writer.py
+++ b/tests/test_writer.py
@@ -2319,7 +2319,7 @@ def test_compress_identical_objects():
     out2 = BytesIO()
     writer.write(out2)
     assert len(out1.getvalue()) - 100 < len(out2.getvalue())
-    writer.compress_identical_objects(remove_identicals=False, verbose=100)
+    writer.compress_identical_objects(remove_identicals=False, verbose="fake")
     out3 = BytesIO()
     writer.write(out3)
     assert len(out2.getvalue()) > len(out3.getvalue())

From 219eb13f7eb9c2cd9519e9a69d639250853bd823 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Mon, 12 Aug 2024 10:52:40 +0200
Subject: [PATCH 16/26] MAINT: Drop Python 3.7 support (#2793)

---
 .github/workflows/github-ci.yaml                   | 14 +++++++-------
 pypdf/_page.py                                     |  8 +-------
 pypdf/_protocols.py                                |  8 +-------
 .../_layout_mode/_fixed_width_page.py              |  8 +-------
 pypdf/_xobj_image_helpers.py                       | 13 ++++---------
 pypdf/types.py                                     |  8 +-------
 6 files changed, 15 insertions(+), 44 deletions(-)

diff --git a/.github/workflows/github-ci.yaml b/.github/workflows/github-ci.yaml
index 1eb3d9bd03..d5d9bb4d4d 100644
--- a/.github/workflows/github-ci.yaml
+++ b/.github/workflows/github-ci.yaml
@@ -57,12 +57,12 @@ jobs:
     runs-on: ubuntu-20.04
     strategy:
       matrix:
-        python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12", "3.13-dev"]
+        python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13-dev"]
         use-crypto-lib: ["cryptography"]
         include:
-          - python-version: "3.7"
+          - python-version: "3.8"
             use-crypto-lib: "pycryptodome"
-          - python-version: "3.7"
+          - python-version: "3.8"
             use-crypto-lib: "none"
     steps:
     - name: Update APT packages
@@ -83,7 +83,7 @@ jobs:
         key: cache-downloaded-files
     - name: Setup Python
       uses: actions/setup-python@v5
-      if: matrix.python-version == '3.7' || matrix.python-version == '3.8' || matrix.python-version == '3.9' || matrix.python-version == '3.10'
+      if: matrix.python-version == '3.8' || matrix.python-version == '3.9' || matrix.python-version == '3.10'
       with:
         python-version: ${{ matrix.python-version }}
         cache: 'pip'
@@ -102,7 +102,7 @@ jobs:
     - name: Install requirements (Python 3)
       run: |
         pip install -r requirements/ci.txt
-      if: matrix.python-version == '3.7' || matrix.python-version == '3.8' || matrix.python-version == '3.9' || matrix.python-version == '3.10'
+      if: matrix.python-version == '3.8' || matrix.python-version == '3.9' || matrix.python-version == '3.10'
     - name: Install requirements (Python 3.11+)
       run: |
         pip install -r requirements/ci-3.11.txt
@@ -215,8 +215,8 @@ jobs:
       - name: Check Number of Downloaded Files
         run: |
           downloaded_files_count=$(find \.coverage* -type f | wc -l)
-          if [ $downloaded_files_count -eq 9 ]; then
-            echo "The expected number of files (9) were downloaded."
+          if [ $downloaded_files_count -eq 8 ]; then
+            echo "The expected number of files (8) were downloaded."
           else
             echo "ERROR: Expected 8 files, but found $downloaded_files_count files."
             exit 1
diff --git a/pypdf/_page.py b/pypdf/_page.py
index 63038d9d07..ee1dc7f602 100644
--- a/pypdf/_page.py
+++ b/pypdf/_page.py
@@ -28,7 +28,6 @@
 # POSSIBILITY OF SUCH DAMAGE.
 
 import math
-import sys
 from decimal import Decimal
 from pathlib import Path
 from typing import (
@@ -38,6 +37,7 @@
     Iterable,
     Iterator,
     List,
+    Literal,
     Optional,
     Sequence,
     Set,
@@ -85,12 +85,6 @@
     StreamObject,
 )
 
-if sys.version_info >= (3, 8):
-    from typing import Literal
-else:
-    from typing_extensions import Literal
-
-
 MERGE_CROP_BOX = "cropbox"  # pypdf<=3.4.0 used 'trimbox'
 
 
diff --git a/pypdf/_protocols.py b/pypdf/_protocols.py
index 9f413660bb..b5fa14879c 100644
--- a/pypdf/_protocols.py
+++ b/pypdf/_protocols.py
@@ -2,13 +2,7 @@
 
 from abc import abstractmethod
 from pathlib import Path
-from typing import IO, Any, Dict, List, Optional, Tuple, Union
-
-try:
-    # Python 3.8+: https://peps.python.org/pep-0586
-    from typing import Protocol
-except ImportError:
-    from typing_extensions import Protocol  # type: ignore[assignment]
+from typing import IO, Any, Dict, List, Optional, Protocol, Tuple, Union
 
 from ._utils import StrByteType, StreamType
 
diff --git a/pypdf/_text_extraction/_layout_mode/_fixed_width_page.py b/pypdf/_text_extraction/_layout_mode/_fixed_width_page.py
index 1be500959c..e7af1b2340 100644
--- a/pypdf/_text_extraction/_layout_mode/_fixed_width_page.py
+++ b/pypdf/_text_extraction/_layout_mode/_fixed_width_page.py
@@ -1,10 +1,9 @@
 """Extract PDF text preserving the layout of the source PDF"""
 
-import sys
 from itertools import groupby
 from math import ceil
 from pathlib import Path
-from typing import Any, Dict, Iterator, List, Optional, Tuple
+from typing import Any, Dict, Iterator, List, Literal, Optional, Tuple, TypedDict
 
 from ..._utils import logger_warning
 from .. import LAYOUT_NEW_BT_GROUP_SPACE_WIDTHS
@@ -12,11 +11,6 @@
 from ._text_state_manager import TextStateManager
 from ._text_state_params import TextStateParams
 
-if sys.version_info >= (3, 8):
-    from typing import Literal, TypedDict
-else:
-    from typing_extensions import Literal, TypedDict
-
 
 class BTGroup(TypedDict):
     """
diff --git a/pypdf/_xobj_image_helpers.py b/pypdf/_xobj_image_helpers.py
index 5ae8894fa3..7a3f40d95c 100644
--- a/pypdf/_xobj_image_helpers.py
+++ b/pypdf/_xobj_image_helpers.py
@@ -2,7 +2,7 @@
 
 import sys
 from io import BytesIO
-from typing import Any, List, Tuple, Union, cast
+from typing import Any, List, Literal, Tuple, Union, cast
 
 from ._utils import check_if_whitespace_only, logger_warning
 from .constants import ColorSpaces
@@ -15,13 +15,6 @@
     NullObject,
 )
 
-if sys.version_info[:2] >= (3, 8):
-    from typing import Literal
-else:
-    # PEP 586 introduced typing.Literal with Python 3.8
-    # For older Python versions, the backport typing_extensions is necessary:
-    from typing_extensions import Literal
-
 if sys.version_info[:2] >= (3, 10):
     from typing import TypeAlias
 else:
@@ -150,7 +143,9 @@ def _extended_image_frombytes(
         nb_pix = size[0] * size[1]
         data_length = len(data)
         if data_length == 0:
-            raise EmptyImageDataError("Data is 0 bytes, cannot process an image from empty data.") from exc
+            raise EmptyImageDataError(
+                "Data is 0 bytes, cannot process an image from empty data."
+            ) from exc
         if data_length % nb_pix != 0:
             raise exc
         k = nb_pix * len(mode) / data_length
diff --git a/pypdf/types.py b/pypdf/types.py
index b8fbab92cf..e383dc7b1f 100644
--- a/pypdf/types.py
+++ b/pypdf/types.py
@@ -1,13 +1,7 @@
 """Helpers for working with PDF types."""
 
 import sys
-from typing import List, Union
-
-if sys.version_info[:2] >= (3, 8):
-    # Python 3.8+: https://peps.python.org/pep-0586
-    from typing import Literal
-else:
-    from typing_extensions import Literal
+from typing import List, Literal, Union
 
 if sys.version_info[:2] >= (3, 10):
     # Python 3.10+: https://www.python.org/dev/peps/pep-0484

From 46c89dd8e1e8641a49624f3fbc1865f9c4b41374 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Mon, 12 Aug 2024 10:57:25 +0200
Subject: [PATCH 17/26] MAINT: Remove b_ and str_ (#2792)

Closes #2726. Closes #2791.
---
 pypdf/_cmap.py                                |  30 +++----
 pypdf/_doc_common.py                          |   3 +-
 pypdf/_encryption.py                          |   6 +-
 pypdf/_merger.py                              |  10 ++-
 pypdf/_page.py                                |  16 ++--
 pypdf/_reader.py                              |   5 +-
 pypdf/_utils.py                               |  34 ++------
 pypdf/_writer.py                              |   6 +-
 pypdf/filters.py                              |  53 ++++++++----
 pypdf/generic/_base.py                        |  39 +++++----
 pypdf/generic/_data_structures.py             |  35 ++++----
 pypdf/generic/_utils.py                       |  77 ++++++++++--------
 ..._Vicksburg_Sample_OCR-crazyones-merged.pdf | Bin 217096 -> 217093 bytes
 tests/test_cmap.py                            |  18 +++-
 tests/test_page.py                            |  13 ++-
 tests/test_utils.py                           |  16 ----
 tests/test_workflows.py                       |   4 +-
 tests/test_writer.py                          |   2 +-
 18 files changed, 193 insertions(+), 174 deletions(-)

diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py
index 9a2d10a611..d635724d25 100644
--- a/pypdf/_cmap.py
+++ b/pypdf/_cmap.py
@@ -3,11 +3,10 @@
 from typing import Any, Dict, List, Tuple, Union, cast
 
 from ._codecs import adobe_glyphs, charset_encoding
-from ._utils import b_, logger_error, logger_warning
+from ._utils import logger_error, logger_warning
 from .generic import (
     DecodedStreamObject,
     DictionaryObject,
-    IndirectObject,
     NullObject,
     StreamObject,
 )
@@ -258,7 +257,7 @@ def prepare_cm(ft: DictionaryObject) -> bytes:
     tu = ft["/ToUnicode"]
     cm: bytes
     if isinstance(tu, StreamObject):
-        cm = b_(cast(DecodedStreamObject, ft["/ToUnicode"]).get_data())
+        cm = cast(DecodedStreamObject, ft["/ToUnicode"]).get_data()
     elif isinstance(tu, str) and tu.startswith("/Identity"):
         # the full range 0000-FFFF will be processed
         cm = b"beginbfrange\n<0000> <0001> <0000>\nendbfrange"
@@ -448,34 +447,27 @@ def compute_space_width(
             en: int = cast(int, ft["/LastChar"])
             if st > space_code or en < space_code:
                 raise Exception("Not in range")
-            if w[space_code - st] == 0:
+            if w[space_code - st].get_object() == 0:
                 raise Exception("null width")
-            sp_width = w[space_code - st]
+            sp_width = w[space_code - st].get_object()
         except Exception:
             if "/FontDescriptor" in ft and "/MissingWidth" in cast(
                 DictionaryObject, ft["/FontDescriptor"]
             ):
-                sp_width = ft["/FontDescriptor"]["/MissingWidth"]  # type: ignore
+                sp_width = ft["/FontDescriptor"]["/MissingWidth"].get_object()  # type: ignore
             else:
                 # will consider width of char as avg(width)/2
                 m = 0
                 cpt = 0
-                for x in w:
-                    if x > 0:
-                        m += x
+                for xx in w:
+                    xx = xx.get_object()
+                    if xx > 0:
+                        m += xx
                         cpt += 1
                 sp_width = m / max(1, cpt) / 2
 
-    if isinstance(sp_width, IndirectObject):
-        # According to
-        # 'Table 122 - Entries common to all font descriptors (continued)'
-        # the MissingWidth should be a number, but according to #2286 it can
-        # be an indirect object
-        obj = sp_width.get_object()
-        if obj is None or isinstance(obj, NullObject):
-            return 0.0
-        return obj  # type: ignore
-
+    if sp_width is None or isinstance(sp_width, NullObject):
+        sp_width = 0.0
     return sp_width
 
 
diff --git a/pypdf/_doc_common.py b/pypdf/_doc_common.py
index d4c5c43c3c..ffbdb7882e 100644
--- a/pypdf/_doc_common.py
+++ b/pypdf/_doc_common.py
@@ -49,7 +49,6 @@
 from ._page import PageObject, _VirtualList
 from ._page_labels import index2label as page_index2page_label
 from ._utils import (
-    b_,
     deprecate_with_replacement,
     logger_warning,
     parse_iso8824_date,
@@ -1258,7 +1257,7 @@ def xfa(self) -> Optional[Dict[str, Any]]:
                 if isinstance(f, IndirectObject):
                     field = cast(Optional[EncodedStreamObject], f.get_object())
                     if field:
-                        es = zlib.decompress(b_(field._data))
+                        es = zlib.decompress(field._data)
                         retval[tag] = es
         return retval
 
diff --git a/pypdf/_encryption.py b/pypdf/_encryption.py
index 5ddd8d0efe..e5cdd9324e 100644
--- a/pypdf/_encryption.py
+++ b/pypdf/_encryption.py
@@ -43,7 +43,7 @@
     rc4_encrypt,
 )
 
-from ._utils import b_, logger_warning
+from ._utils import logger_warning
 from .generic import (
     ArrayObject,
     ByteStringObject,
@@ -78,7 +78,7 @@ def encrypt_object(self, obj: PdfObject) -> PdfObject:
         elif isinstance(obj, StreamObject):
             obj2 = StreamObject()
             obj2.update(obj)
-            obj2.set_data(self.stm_crypt.encrypt(b_(obj._data)))
+            obj2.set_data(self.stm_crypt.encrypt(obj._data))
             for key, value in obj.items():  # Dont forget the Stream dict.
                 obj2[key] = self.encrypt_object(value)
             obj = obj2
@@ -96,7 +96,7 @@ def decrypt_object(self, obj: PdfObject) -> PdfObject:
             data = self.str_crypt.decrypt(obj.original_bytes)
             obj = create_string_object(data)
         elif isinstance(obj, StreamObject):
-            obj._data = self.stm_crypt.decrypt(b_(obj._data))
+            obj._data = self.stm_crypt.decrypt(obj._data)
             for key, value in obj.items():  # Dont forget the Stream dict.
                 obj[key] = self.decrypt_object(value)
         elif isinstance(obj, DictionaryObject):
diff --git a/pypdf/_merger.py b/pypdf/_merger.py
index 7176a1adf7..a52a354e38 100644
--- a/pypdf/_merger.py
+++ b/pypdf/_merger.py
@@ -46,7 +46,6 @@
 from ._utils import (
     StrByteType,
     deprecate_with_replacement,
-    str_,
 )
 from ._writer import PdfWriter
 from .constants import GoToActionArguments, TypArguments, TypFitArguments
@@ -82,6 +81,15 @@ def __init__(self, pagedata: PageObject, src: PdfReader, id: int) -> None:
         self.id = id
 
 
+# transferred from _utils, as this function is only required here
+# and the merger will soon be deprecated
+def str_(b: Any) -> str:  # pragma: no cover
+    if isinstance(b, bytes):
+        return b.decode("latin-1")
+    else:
+        return str(b)  # will return b.__str__() if defined
+
+
 class PdfMerger:
     """
     Use :class:`PdfWriter` instead.
diff --git a/pypdf/_page.py b/pypdf/_page.py
index ee1dc7f602..48cdeb149f 100644
--- a/pypdf/_page.py
+++ b/pypdf/_page.py
@@ -846,7 +846,7 @@ def _add_transformation_matrix(
                     FloatObject(e),
                     FloatObject(f),
                 ],
-                " cm",
+                b"cm",
             ],
         )
         return contents
@@ -864,7 +864,7 @@ def _get_contents_as_bytes(self) -> Optional[bytes]:
             if isinstance(obj, list):
                 return b"".join(x.get_object().get_data() for x in obj)
             else:
-                return cast(bytes, cast(EncodedStreamObject, obj).get_data())
+                return cast(EncodedStreamObject, obj).get_data()
         else:
             return None
 
@@ -1057,11 +1057,11 @@ def _merge_page(
                             rect.height,
                         ],
                     ),
-                    "re",
+                    b"re",
                 ),
             )
-            page2content.operations.insert(1, ([], "W"))
-            page2content.operations.insert(2, ([], "n"))
+            page2content.operations.insert(1, ([], b"W"))
+            page2content.operations.insert(2, ([], b"n"))
             if page2transformation is not None:
                 page2content = page2transformation(page2content)
             page2content = PageObject._content_stream_rename(
@@ -1195,11 +1195,11 @@ def _merge_page_writer(
                             rect.height,
                         ],
                     ),
-                    "re",
+                    b"re",
                 ),
             )
-            page2content.operations.insert(1, ([], "W"))
-            page2content.operations.insert(2, ([], "n"))
+            page2content.operations.insert(1, ([], b"W"))
+            page2content.operations.insert(2, ([], b"n"))
             if page2transformation is not None:
                 page2content = page2transformation(page2content)
             page2content = PageObject._content_stream_rename(
diff --git a/pypdf/_reader.py b/pypdf/_reader.py
index aeababa7b7..7c084107c5 100644
--- a/pypdf/_reader.py
+++ b/pypdf/_reader.py
@@ -51,7 +51,6 @@
 from ._utils import (
     StrByteType,
     StreamType,
-    b_,
     logger_warning,
     read_non_whitespace,
     read_previous_line,
@@ -328,7 +327,7 @@ def _get_object_from_stream(
         assert cast(str, obj_stm["/Type"]) == "/ObjStm"
         # /N is the number of indirect objects in the stream
         assert idx < obj_stm["/N"]
-        stream_data = BytesIO(b_(obj_stm.get_data()))
+        stream_data = BytesIO(obj_stm.get_data())
         for i in range(obj_stm["/N"]):  # type: ignore
             read_non_whitespace(stream_data)
             stream_data.seek(-1, 1)
@@ -932,7 +931,7 @@ def _read_pdf15_xref_stream(
         xrefstream = cast(ContentStream, read_object(stream, self))
         assert cast(str, xrefstream["/Type"]) == "/XRef"
         self.cache_indirect_object(generation, idnum, xrefstream)
-        stream_data = BytesIO(b_(xrefstream.get_data()))
+        stream_data = BytesIO(xrefstream.get_data())
         # Index pairs specify the subsections in the dictionary. If
         # none create one subsection that spans everything.
         idx_pairs = xrefstream.get("/Index", [0, xrefstream.get("/Size")])
diff --git a/pypdf/_utils.py b/pypdf/_utils.py
index 5fecb38e7d..94d45cf6d7 100644
--- a/pypdf/_utils.py
+++ b/pypdf/_utils.py
@@ -336,31 +336,6 @@ def mark_location(stream: StreamType) -> None:
     stream.seek(-radius, 1)
 
 
-B_CACHE: Dict[str, bytes] = {}
-
-
-def b_(s: Union[str, bytes]) -> bytes:
-    if isinstance(s, bytes):
-        return s
-    bc = B_CACHE
-    if s in bc:
-        return bc[s]
-    try:
-        r = s.encode("latin-1")
-    except UnicodeEncodeError:
-        r = s.encode("utf-8")
-    if len(s) < 2:
-        bc[s] = r
-    return r
-
-
-def str_(b: Any) -> str:
-    if isinstance(b, bytes):
-        return b.decode("latin-1")
-    else:
-        return str(b)  # will return b.__str__() if defined
-
-
 @overload
 def ord_(b: str) -> int:
     ...
@@ -397,12 +372,17 @@ def deprecation(msg: str) -> None:
 
 def deprecate_with_replacement(old_name: str, new_name: str, removed_in: str) -> None:
     """Raise an exception that a feature will be removed, but has a replacement."""
-    deprecate(f"{old_name} is deprecated and will be removed in pypdf {removed_in}. Use {new_name} instead.", 4)
+    deprecate(
+        f"{old_name} is deprecated and will be removed in pypdf {removed_in}. Use {new_name} instead.",
+        4,
+    )
 
 
 def deprecation_with_replacement(old_name: str, new_name: str, removed_in: str) -> None:
     """Raise an exception that a feature was already removed, but has a replacement."""
-    deprecation(f"{old_name} is deprecated and was removed in pypdf {removed_in}. Use {new_name} instead.")
+    deprecation(
+        f"{old_name} is deprecated and was removed in pypdf {removed_in}. Use {new_name} instead."
+    )
 
 
 def deprecate_no_replacement(name: str, removed_in: str) -> None:
diff --git a/pypdf/_writer.py b/pypdf/_writer.py
index 00b9d498c0..d73c00e3d4 100644
--- a/pypdf/_writer.py
+++ b/pypdf/_writer.py
@@ -62,7 +62,6 @@
     StrByteType,
     StreamType,
     _get_max_pdf_version_header,
-    b_,
     deprecate_with_replacement,
     logger_warning,
 )
@@ -678,9 +677,10 @@ def add_attachment(self, filename: str, data: Union[str, bytes]) -> None:
         # Hello world!
         # endstream
         # endobj
-
+        if isinstance(data, str):
+            data = data.encode("latin-1")
         file_entry = DecodedStreamObject()
-        file_entry.set_data(b_(data))
+        file_entry.set_data(data)
         file_entry.update({NameObject(PA.TYPE): NameObject("/EmbeddedFile")})
 
         # The Filespec entry
diff --git a/pypdf/filters.py b/pypdf/filters.py
index 137e3603a3..43730cc8e9 100644
--- a/pypdf/filters.py
+++ b/pypdf/filters.py
@@ -43,7 +43,7 @@
 
 from ._utils import (
     WHITESPACES_AS_BYTES,
-    b_,
+    deprecate,
     deprecate_with_replacement,
     deprecation_no_replacement,
     logger_warning,
@@ -376,20 +376,18 @@ class LZWDecode:
     """
     Taken from:
 
-    http://www.java2s.com/Open-Source/Java-Document/PDF/PDF-
-    Renderer/com/sun/pdfview/decode/LZWDecode.java.htm
+    http://www.java2s.com/Open-Source/Java-Document/PDF/PDF-Renderer/com/sun/pdfview/decode/LZWDecode.java.htm
     """
 
     class Decoder:
+        STOP = 257
+        CLEARDICT = 256
+
         def __init__(self, data: bytes) -> None:
-            self.STOP = 257
-            self.CLEARDICT = 256
             self.data = data
             self.bytepos = 0
             self.bitpos = 0
-            self.dict = [""] * 4096
-            for i in range(256):
-                self.dict[i] = chr(i)
+            self.dict = [struct.pack("B", i) for i in range(256)] + [b""] * (4096 - 256)
             self.reset_dict()
 
         def reset_dict(self) -> None:
@@ -416,7 +414,7 @@ def next_code(self) -> int:
                     self.bytepos = self.bytepos + 1
             return value
 
-        def decode(self) -> str:
+        def decode(self) -> bytes:
             """
             TIFF 6.0 specification explains in sufficient details the steps to
             implement the LZW encode() and decode() algorithms.
@@ -429,7 +427,7 @@ def decode(self) -> str:
               PdfReadError: If the stop code is missing
             """
             cW = self.CLEARDICT
-            baos = ""
+            baos = b""
             while True:
                 pW = cW
                 cW = self.next_code()
@@ -444,11 +442,11 @@ def decode(self) -> str:
                 else:
                     if cW < self.dictlen:
                         baos += self.dict[cW]
-                        p = self.dict[pW] + self.dict[cW][0]
+                        p = self.dict[pW] + self.dict[cW][0:1]
                         self.dict[self.dictlen] = p
                         self.dictlen += 1
                     else:
-                        p = self.dict[pW] + self.dict[pW][0]
+                        p = self.dict[pW] + self.dict[pW][0:1]
                         baos += p
                         self.dict[self.dictlen] = p
                         self.dictlen += 1
@@ -460,11 +458,11 @@ def decode(self) -> str:
             return baos
 
     @staticmethod
-    def decode(
+    def _decodeb(
         data: bytes,
         decode_parms: Optional[DictionaryObject] = None,
         **kwargs: Any,
-    ) -> str:
+    ) -> bytes:
         """
         Decode an LZW encoded data stream.
 
@@ -476,9 +474,28 @@ def decode(
           decoded data.
         """
         # decode_parms is unused here
-
         return LZWDecode.Decoder(data).decode()
 
+    @staticmethod
+    def decode(
+        data: bytes,
+        decode_parms: Optional[DictionaryObject] = None,
+        **kwargs: Any,
+    ) -> str:  # deprecated
+        """
+        Decode an LZW encoded data stream.
+
+        Args:
+          data: ``bytes`` or ``str`` text to decode.
+          decode_parms: a dictionary of parameter values.
+
+        Returns:
+          decoded data.
+        """
+        # decode_parms is unused here
+        deprecate("LZWDecode.decode will return bytes instead of str in pypdf 6.0.0")
+        return LZWDecode.Decoder(data).decode().decode("latin-1")
+
 
 class ASCII85Decode:
     """Decodes string ASCII85-encoded data into a byte format."""
@@ -651,7 +668,7 @@ def decode(
         return tiff_header + data
 
 
-def decode_stream_data(stream: Any) -> Union[bytes, str]:  # utils.StreamObject
+def decode_stream_data(stream: Any) -> bytes:  # utils.StreamObject
     """
     Decode the stream data based on the specified filters.
 
@@ -678,7 +695,7 @@ def decode_stream_data(stream: Any) -> Union[bytes, str]:  # utils.StreamObject
     decodparms = stream.get(SA.DECODE_PARMS, ({},) * len(filters))
     if not isinstance(decodparms, (list, tuple)):
         decodparms = (decodparms,)
-    data: bytes = b_(stream._data)
+    data: bytes = stream._data
     # If there is not data to decode we should not try to decode the data.
     if data:
         for filter_type, params in zip(filters, decodparms):
@@ -691,7 +708,7 @@ def decode_stream_data(stream: Any) -> Union[bytes, str]:  # utils.StreamObject
             elif filter_type in (FT.RUN_LENGTH_DECODE, FTA.RL):
                 data = RunLengthDecode.decode(data)
             elif filter_type in (FT.LZW_DECODE, FTA.LZW):
-                data = LZWDecode.decode(data, params)  # type: ignore
+                data = LZWDecode._decodeb(data, params)
             elif filter_type in (FT.ASCII_85_DECODE, FTA.A85):
                 data = ASCII85Decode.decode(data)
             elif filter_type == FT.DCT_DECODE:
diff --git a/pypdf/generic/_base.py b/pypdf/generic/_base.py
index 2d606b4184..309d389cc8 100644
--- a/pypdf/generic/_base.py
+++ b/pypdf/generic/_base.py
@@ -30,18 +30,17 @@
 import re
 from binascii import unhexlify
 from math import log10
+from struct import iter_unpack
 from typing import Any, Callable, ClassVar, Dict, Optional, Sequence, Union, cast
 
 from .._codecs import _pdfdoc_encoding_rev
 from .._protocols import PdfObjectProtocol, PdfWriterProtocol
 from .._utils import (
     StreamType,
-    b_,
     deprecate_no_replacement,
     logger_warning,
     read_non_whitespace,
     read_until_regex,
-    str_,
 )
 from ..errors import STREAM_TRUNCATED_PREMATURELY, PdfReadError, PdfStreamError
 
@@ -308,6 +307,10 @@ def __getitem__(self, key: Any) -> Any:
         # items should be extracted from pointed Object
         return self._get_object_with_check()[key]  # type: ignore
 
+    def __float__(self) -> float:
+        # in this case we are looking for the pointed data
+        return self.get_object().__float__()  # type: ignore
+
     def __str__(self) -> str:
         # in this case we are looking for the pointed data
         return self.get_object().__str__()
@@ -369,10 +372,10 @@ def read_from_stream(stream: StreamType, pdf: Any) -> "IndirectObject":  # PdfRe
 
 class FloatObject(float, PdfObject):
     def __new__(
-        cls, value: Union[str, Any] = "0.0", context: Optional[Any] = None
+        cls, value: Any = "0.0", context: Optional[Any] = None
     ) -> "FloatObject":
         try:
-            value = float(str_(value))
+            value = float(value)
             return float.__new__(cls, value)
         except Exception as e:
             # If this isn't a valid decimal (happens in malformed PDFs)
@@ -599,15 +602,16 @@ def write_to_stream(
             )
         bytearr = self.get_encoded_bytes()
         stream.write(b"(")
-        for c in bytearr:
-            if not chr(c).isalnum() and c != b" ":
+        for c_ in iter_unpack("c", bytearr):
+            c = cast(bytes, c_[0])
+            if not c.isalnum() and c != b" ":
                 # This:
                 #   stream.write(rf"\{c:0>3o}".encode())
                 # gives
                 #   https://github.com/davidhalter/parso/issues/207
-                stream.write(("\\%03o" % c).encode())
+                stream.write(b"\\%03o" % ord(c))
             else:
-                stream.write(b_(chr(c)))
+                stream.write(c)
         stream.write(b")")
 
 
@@ -710,12 +714,13 @@ def read_from_stream(stream: StreamType, pdf: Any) -> "NameObject":  # PdfReader
 
 
 def encode_pdfdocencoding(unicode_string: str) -> bytes:
-    retval = bytearray()
-    for c in unicode_string:
-        try:
-            retval += b_(chr(_pdfdoc_encoding_rev[c]))
-        except KeyError:
-            raise UnicodeEncodeError(
-                "pdfdocencoding", c, -1, -1, "does not exist in translation table"
-            )
-    return bytes(retval)
+    try:
+        return bytes([_pdfdoc_encoding_rev[k] for k in unicode_string])
+    except KeyError:
+        raise UnicodeEncodeError(
+            "pdfdocencoding",
+            unicode_string,
+            -1,
+            -1,
+            "does not exist in translation table",
+        )
diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py
index 87d6886742..399836be5f 100644
--- a/pypdf/generic/_data_structures.py
+++ b/pypdf/generic/_data_structures.py
@@ -52,7 +52,6 @@
 from .._utils import (
     WHITESPACES,
     StreamType,
-    b_,
     deprecate_no_replacement,
     deprecate_with_replacement,
     logger_warning,
@@ -843,7 +842,7 @@ def _reset_node_tree_relationship(child_obj: Any) -> None:
 
 class StreamObject(DictionaryObject):
     def __init__(self) -> None:
-        self._data: Union[bytes, str] = b""
+        self._data: bytes = b""
         self.decoded_self: Optional[DecodedStreamObject] = None
 
     def _clone(
@@ -877,7 +876,7 @@ def _clone(
             pass
         super()._clone(src, pdf_dest, force_duplicate, ignore_fields, visited)
 
-    def get_data(self) -> Union[bytes, str]:
+    def get_data(self) -> bytes:
         return self._data
 
     def set_data(self, data: bytes) -> None:
@@ -885,7 +884,7 @@ def set_data(self, data: bytes) -> None:
 
     def hash_value_data(self) -> bytes:
         data = super().hash_value_data()
-        data += b_(self._data)
+        data += self._data
         return data
 
     def write_to_stream(
@@ -955,7 +954,7 @@ def flate_encode(self, level: int = -1) -> "EncodedStreamObject":
         retval[NameObject(SA.FILTER)] = f
         if params is not None:
             retval[NameObject(SA.DECODE_PARMS)] = params
-        retval._data = FlateDecode.encode(b_(self._data), level)
+        retval._data = FlateDecode.encode(self._data, level)
         return retval
 
     def decode_as_image(self) -> Any:
@@ -993,7 +992,7 @@ def __init__(self) -> None:
         self.decoded_self: Optional[DecodedStreamObject] = None
 
     # This overrides the parent method:
-    def get_data(self) -> Union[bytes, str]:
+    def get_data(self) -> bytes:
         from ..filters import decode_stream_data
 
         if self.decoded_self is not None:
@@ -1003,7 +1002,7 @@ def get_data(self) -> Union[bytes, str]:
             # create decoded object
             decoded = DecodedStreamObject()
 
-            decoded.set_data(b_(decode_stream_data(self)))
+            decoded.set_data(decode_stream_data(self))
             for key, value in list(self.items()):
                 if key not in (SA.LENGTH, SA.FILTER, SA.DECODE_PARMS):
                     decoded[key] = value
@@ -1058,7 +1057,7 @@ def __init__(
         # The inner list has two elements:
         #  Element 0: List
         #  Element 1: str
-        self._operations: List[Tuple[Any, Any]] = []
+        self._operations: List[Tuple[Any, bytes]] = []
 
         # stream may be a StreamObject or an ArrayObject containing
         # multiple StreamObjects to be cat'd together.
@@ -1069,14 +1068,14 @@ def __init__(
             if isinstance(stream, ArrayObject):
                 data = b""
                 for s in stream:
-                    data += b_(s.get_object().get_data())
+                    data += s.get_object().get_data()
                     if len(data) == 0 or data[-1] != b"\n":
                         data += b"\n"
                 super().set_data(bytes(data))
             else:
                 stream_data = stream.get_data()
                 assert stream_data is not None
-                super().set_data(b_(stream_data))
+                super().set_data(stream_data)
             self.forced_encoding = forced_encoding
 
     def clone(
@@ -1132,7 +1131,7 @@ def _clone(
             ignore_fields:
         """
         src_cs = cast("ContentStream", src)
-        super().set_data(b_(src_cs._data))
+        super().set_data(src_cs._data)
         self.pdf = pdf_dest
         self._operations = list(src_cs._operations)
         self.forced_encoding = src_cs.forced_encoding
@@ -1249,10 +1248,10 @@ def get_data(self) -> bytes:
                     for op in operands:
                         op.write_to_stream(new_data)
                         new_data.write(b" ")
-                    new_data.write(b_(operator))
+                    new_data.write(operator)
                 new_data.write(b"\n")
             self._data = new_data.getvalue()
-        return b_(self._data)
+        return self._data
 
     # This overrides the parent method:
     def set_data(self, data: bytes) -> None:
@@ -1262,21 +1261,21 @@ def set_data(self, data: bytes) -> None:
     @property
     def operations(self) -> List[Tuple[Any, Any]]:
         if not self._operations and self._data:
-            self._parse_content_stream(BytesIO(b_(self._data)))
+            self._parse_content_stream(BytesIO(self._data))
             self._data = b""
         return self._operations
 
     @operations.setter
-    def operations(self, operations: List[Tuple[Any, Any]]) -> None:
+    def operations(self, operations: List[Tuple[Any, bytes]]) -> None:
         self._operations = operations
         self._data = b""
 
     def isolate_graphics_state(self) -> None:
         if self._operations:
-            self._operations.insert(0, ([], "q"))
-            self._operations.append(([], "Q"))
+            self._operations.insert(0, ([], b"q"))
+            self._operations.append(([], b"Q"))
         elif self._data:
-            self._data = b"q\n" + b_(self._data) + b"\nQ\n"
+            self._data = b"q\n" + self._data + b"\nQ\n"
 
     # This overrides the parent method:
     def write_to_stream(
diff --git a/pypdf/generic/_utils.py b/pypdf/generic/_utils.py
index fdcdc33399..b5ac6632ac 100644
--- a/pypdf/generic/_utils.py
+++ b/pypdf/generic/_utils.py
@@ -2,7 +2,7 @@
 from typing import Dict, List, Tuple, Union
 
 from .._codecs import _pdfdoc_encoding
-from .._utils import StreamType, b_, logger_warning, read_non_whitespace
+from .._utils import StreamType, logger_warning, read_non_whitespace
 from ..errors import STREAM_TRUNCATED_PREMATURELY, PdfStreamError
 from ._base import ByteStringObject, TextStringObject
 
@@ -16,7 +16,7 @@ def read_hex_string_from_stream(
     forced_encoding: Union[None, str, List[str], Dict[int, str]] = None,
 ) -> Union["TextStringObject", "ByteStringObject"]:
     stream.read(1)
-    txt = ""
+    arr = []
     x = b""
     while True:
         tok = read_non_whitespace(stream)
@@ -26,13 +26,37 @@ def read_hex_string_from_stream(
             break
         x += tok
         if len(x) == 2:
-            txt += chr(int(x, base=16))
+            arr.append(int(x, base=16))
             x = b""
     if len(x) == 1:
         x += b"0"
-    if len(x) == 2:
-        txt += chr(int(x, base=16))
-    return create_string_object(b_(txt), forced_encoding)
+    if x != b"":
+        arr.append(int(x, base=16))
+    return create_string_object(bytes(arr), forced_encoding)
+
+
+__ESPACE_DICT__ = {
+    b"n": ord(b"\n"),
+    b"r": ord(b"\r"),
+    b"t": ord(b"\t"),
+    b"b": ord(b"\b"),
+    b"f": ord(b"\f"),
+    b"(": ord(b"("),
+    b")": ord(b")"),
+    b"/": ord(b"/"),
+    b"\\": ord(b"\\"),
+    b" ": ord(b" "),
+    b"%": ord(b"%"),
+    b"<": ord(b"<"),
+    b">": ord(b">"),
+    b"[": ord(b"["),
+    b"]": ord(b"]"),
+    b"#": ord(b"#"),
+    b"_": ord(b"_"),
+    b"&": ord(b"&"),
+    b"$": ord(b"$"),
+}
+__BACKSLASH_CODE__ = 92
 
 
 def read_string_from_stream(
@@ -54,30 +78,9 @@ def read_string_from_stream(
                 break
         elif tok == b"\\":
             tok = stream.read(1)
-            escape_dict = {
-                b"n": b"\n",
-                b"r": b"\r",
-                b"t": b"\t",
-                b"b": b"\b",
-                b"f": b"\f",
-                b"c": rb"\c",
-                b"(": b"(",
-                b")": b")",
-                b"/": b"/",
-                b"\\": b"\\",
-                b" ": b" ",
-                b"%": b"%",
-                b"<": b"<",
-                b">": b">",
-                b"[": b"[",
-                b"]": b"]",
-                b"#": b"#",
-                b"_": b"_",
-                b"&": b"&",
-                b"$": b"$",
-            }
             try:
-                tok = escape_dict[tok]
+                txt.append(__ESPACE_DICT__[tok])
+                continue
             except KeyError:
                 if b"0" <= tok <= b"7":
                     # "The number ddd may consist of one, two, or three
@@ -85,6 +88,7 @@ def read_string_from_stream(
                     # Three octal digits shall be used, with leading zeros
                     # as needed, if the next character of the string is also
                     # a digit." (PDF reference 7.3.4.2, p 16)
+                    sav = stream.tell() - 1
                     for _ in range(2):
                         ntok = stream.read(1)
                         if b"0" <= ntok <= b"7":
@@ -92,7 +96,13 @@ def read_string_from_stream(
                         else:
                             stream.seek(-1, 1)  # ntok has to be analyzed
                             break
-                    tok = b_(chr(int(tok, base=8)))
+                    i = int(tok, base=8)
+                    if i > 255:
+                        txt.append(__BACKSLASH_CODE__)
+                        stream.seek(sav)
+                    else:
+                        txt.append(i)
+                    continue
                 elif tok in b"\n\r":
                     # This case is  hit when a backslash followed by a line
                     # break occurs. If it's a multi-char EOL, consume the
@@ -102,12 +112,13 @@ def read_string_from_stream(
                         stream.seek(-1, 1)
                     # Then don't add anything to the actual string, since this
                     # line break was escaped:
-                    tok = b""
+                    continue
                 else:
                     msg = f"Unexpected escaped string: {tok.decode('utf-8','ignore')}"
                     logger_warning(msg, __name__)
-        txt.append(tok)
-    return create_string_object(b"".join(txt), forced_encoding)
+                    txt.append(__BACKSLASH_CODE__)
+        txt.append(ord(tok))
+    return create_string_object(bytes(txt), forced_encoding)
 
 
 def create_string_object(
diff --git a/resources/Seige_of_Vicksburg_Sample_OCR-crazyones-merged.pdf b/resources/Seige_of_Vicksburg_Sample_OCR-crazyones-merged.pdf
index 0e9633ac16c138eeaa90d3cf13e9f7cd6e2c006d..a53f28f0be432c38a1fff33672a2170eeb5f553f 100644
GIT binary patch
delta 1135
zcmZuwJ5B>Z3>Bn7L8-XFHtGEtkL~mzB#PXGI^7lsK}(f3N1%gya3N-+5Rx~UZB|eA
z_<cO@yT$8n@mV)l!^>*(w%S~Mtv1*EGs-~I8J>sv{zoZ`BgRCDn2^;DWAA#+qhOsy
z9_nuY2iNV#)9h}noP_mNYj;O2#kxAbfL+j&#EQMi$qgIN%*aCBkgS9v3tOTYrFns*
zi?DXRqIyy{%_l0Z{g4pthD0$;#wiU+yHv)c!>vmue00SNBRm4^N)hR?Vv~jK{H4mw
ziIFg?VZ<yRz8BZ@ADbme@j^*L)~rq4iG>q^h6P&9s+GXO#zoHdee$2JJpxz|Hl_r{
zY(@&UHcToGu}TFAn%96`U{@PYmtn8OwFX4O-WCKhW)JHKw%|G;shLP7tsbKDqy5qZ
a<FglL!lx)aecSK#?D0nD%jNz0Vg3t(%-ppA

delta 1030
zcmZuvJxYT?6ePw%0ujUuq)B1r`+xfiOGWS}*n0-+bU{S4wc0k`Knt<<44%N7xKSeH
zty_J(Z+2$h%-%1@kIV6k&9|>X;Lhg{J|DdMd<#DhbONh0xtWZ9J{UqF&k&I6)7fzU
zDDp4D(!te>XPgE={aR6UiiW`%ZfqTeiIj@xjiPk$e@q9UWn5rLy=@eWjv@^6B7z!u
zVL|uoObDgBOUxb4d9AVd3tS3uCsM>HdG}UKHtl4X<1DUS`o&!~`3?a}@xUY!MBL9r
zrG8xWxg)aX>X2IbX&<BlEOqa)<(@J@ZY%V&lJi7ngG|dx`K_WtNU9wb+XMdHba$OQ
zssd|$U3tp9Y9QH@x<%?3oi%sdW)_=K1G+e;>d=ezp1ic;>(#79o%KUaWwR|p2uxbk
YRyD8Bjbb46Z8kjJvC(vTJUbbE19m{nQvd(}

diff --git a/tests/test_cmap.py b/tests/test_cmap.py
index 9dcfb252d5..69f2931f67 100644
--- a/tests/test_cmap.py
+++ b/tests/test_cmap.py
@@ -1,13 +1,19 @@
 """Test the pypdf_cmap module."""
 from io import BytesIO
+from pathlib import Path
 
 import pytest
 
-from pypdf import PdfReader
+from pypdf import PdfReader, PdfWriter
 from pypdf._cmap import build_char_map
+from pypdf.generic import ArrayObject, NameObject, NullObject
 
 from . import get_data_from_url
 
+TESTS_ROOT = Path(__file__).parent.resolve()
+PROJECT_ROOT = TESTS_ROOT.parent
+RESOURCE_ROOT = PROJECT_ROOT / "resources"
+
 
 @pytest.mark.enable_socket()
 @pytest.mark.slow()
@@ -206,3 +212,13 @@ def test_eten_b5():
     """Issue #2356"""
     reader = PdfReader(BytesIO(get_data_from_url(name="iss2290.pdf")))
     reader.pages[0].extract_text().startswith("1/7 \n富邦新終身壽險")
+
+
+def test_null_missing_width():
+    """For coverage of 2792"""
+    writer = PdfWriter(RESOURCE_ROOT / "crazyones.pdf")
+    page = writer.pages[0]
+    ft = page["/Resources"]["/Font"]["/F1"]
+    ft[NameObject("/Widths")] = ArrayObject()
+    ft["/FontDescriptor"][NameObject("/MissingWidth")] = NullObject()
+    page.extract_text()
diff --git a/tests/test_page.py b/tests/test_page.py
index cb7b6c723f..72df648e45 100644
--- a/tests/test_page.py
+++ b/tests/test_page.py
@@ -1131,9 +1131,9 @@ def test_merge_page_resources_smoke_test():
     # use these keys for some "operations", to validate renaming
     # (the operand name doesn't matter)
     contents1 = page1[NO("/Contents")] = ContentStream(None, None)
-    contents1.operations = [(ArrayObject(props1.keys()), "page1-contents")]
+    contents1.operations = [(ArrayObject(props1.keys()), b"page1-contents")]
     contents2 = page2[NO("/Contents")] = ContentStream(None, None)
-    contents2.operations = [(ArrayObject(props2.keys()), "page2-contents")]
+    contents2.operations = [(ArrayObject(props2.keys()), b"page2-contents")]
 
     expected_properties = {
         "/just1": "/just1-value",
@@ -1438,3 +1438,12 @@ def test_negative_index():
     src_abs = RESOURCE_ROOT / "git.pdf"
     reader = PdfReader(src_abs)
     assert reader.pages[0] == reader.pages[-1]
+
+
+def test_get_contents_as_bytes():
+    writer = PdfWriter(RESOURCE_ROOT / "crazyones.pdf")
+    co = writer.pages[0]["/Contents"][0]
+    expected = co.get_data()
+    assert writer.pages[0]._get_contents_as_bytes() == expected
+    writer.pages[0][NameObject("/Contents")] = writer.pages[0]["/Contents"][0]
+    assert writer.pages[0]._get_contents_as_bytes() == expected
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 856bedd863..a4ddff8831 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -109,22 +109,6 @@ def test_mark_location():
     Path("pypdf_pdfLocation.txt").unlink()  # cleanup
 
 
-@pytest.mark.parametrize(
-    ("input_str", "expected"),
-    [
-        ("foo", b"foo"),
-        ("😀", "😀".encode()),
-        ("‰", "‰".encode()),
-        ("▷", "▷".encode()),
-        ("世", "世".encode()),
-        # A multi-character string example with non-latin-1 characters:
-        ("😀😃", "😀😃".encode()),
-    ],
-)
-def test_b(input_str: str, expected: bytes):
-    assert pypdf._utils.b_(input_str) == expected
-
-
 def test_deprecate_no_replacement():
     with pytest.warns(DeprecationWarning) as warn:
         pypdf._utils.deprecate_no_replacement("foo", removed_in="3.0.0")
diff --git a/tests/test_workflows.py b/tests/test_workflows.py
index 93bc0c9e5e..4407b8fd5e 100644
--- a/tests/test_workflows.py
+++ b/tests/test_workflows.py
@@ -391,11 +391,11 @@ def test_merge(tmp_path, url, name):
             {
                 "/Author": "Unknown",
                 "/CreationDate": "Thursday, May 06, 1999 3:56:54 PM",
-                "/Creator": "C:DEBÆł8",
+                "/Creator": r"C:\DEB\6338",
                 "/Keywords": "",
                 "/Producer": "Acrobat PDFWriter 3.02 for Windows",
                 "/Subject": "",
-                "/Title": "C:DEBÆł8-6R.PDF",
+                "/Title": r"C:\DEB\6338-6R.PDF",
             },
         )
     ],
diff --git a/tests/test_writer.py b/tests/test_writer.py
index 9dfeffdd89..84d84d0db6 100644
--- a/tests/test_writer.py
+++ b/tests/test_writer.py
@@ -1290,7 +1290,7 @@ def test_attachments():
     to_add = [
         ("foobar.txt", b"foobarcontent"),
         ("foobar2.txt", b"foobarcontent2"),
-        ("foobar2.txt", b"2nd_foobarcontent"),
+        ("foobar2.txt", "2nd_foobarcontent"),
     ]
     for name, content in to_add:
         writer.add_attachment(name, content)

From 33c0d47748a49a9f22f8a7038b6c73d104cf6068 Mon Sep 17 00:00:00 2001
From: Stefan <96178532+stefan6419846@users.noreply.github.com>
Date: Mon, 12 Aug 2024 11:05:59 +0200
Subject: [PATCH 18/26] improve wording

---
 docs/user/file-size.md | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/docs/user/file-size.md b/docs/user/file-size.md
index 96ce6fab56..8a103d96a3 100644
--- a/docs/user/file-size.md
+++ b/docs/user/file-size.md
@@ -9,25 +9,23 @@ Some PDF documents contain the same object multiple times. For example, if an
 image appears three times in a PDF it could be embedded three times. Or it can
 be embedded once and referenced twice.
 
-When adding data to a PdfWriter, the data are copied respecting the original format.
-For a example if two pages includes the same image which is duplicated, in the source document, the object will be duplicated in the PdfWriter object
+When adding data to a PdfWriter, the data is copied while respecting the original format.
+For example, if two pages include the same image which is duplicated in the source document, the object will be duplicated in the PdfWriter object.
 
-Also when you delete objects in a document, pypdf can not easily identify weither the objects are used or not elsewhere or if the user wants to keep then in. When writing the pdf file these objects will be hidden(part of the file but not displayed) within.
+Additionally, when you delete objects in a document, pypdf cannot easily identify whether the objects are used elsewhere or not or if the user wants to keep them in. When writing the PDF file, these objects will be hidden within (part of the file, but not displayed).
 
-in order to reduce the file size a compression process:
-`writer.compress_identical_objects(remove_identicals = True, remove_orphans= True, verbose = -1)`
+In order to reduce the file size, use a compression call: `writer.compress_identical_objects(remove_identicals=True, remove_orphans=True, verbose=-1)`
 
-`remove_identical` enables / disables compression merging identical objects
+* `remove_identicals` enables/disables compression merging identical objects.
+* `remove_orphans` enables/disables suppression of unused objects.
+* `verbose` sets the value on how many objects are processed.
 
-`remove_orphans` enables / disables suppression of unused objects
+The progress status (printed on stderr) of the compression is printed as follows:
 
-`verbose` sets the value on how many objects are processed
-the progress status (printed on stderr) of the compression is printed as follow:
-* '+' during initial loop
-* '.' when replacing duplicates
-
-It is recommended to apply this process just before writing to file/stream
+* `'+'` during initial loop
+* `'.'` when replacing duplicates
 
+It is recommended to apply this process just before writing to file/stream.
 
 It depends on the PDF how well this works, but we have seen an 86% file
 reduction (from 5.7 MB to 0.8 MB) within a real PDF.

From 661ae6403d0671a7b96580a9d158dd50f8461df6 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Mon, 12 Aug 2024 18:47:12 +0200
Subject: [PATCH 19/26] Update pypdf/_writer.py

Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com>
---
 pypdf/_writer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pypdf/_writer.py b/pypdf/_writer.py
index 65a1d96df9..66b91af071 100644
--- a/pypdf/_writer.py
+++ b/pypdf/_writer.py
@@ -1310,7 +1310,7 @@ def _write_pdf_structure(self, stream: StreamType) -> Tuple[List[int], List[int]
             else:
                 object_positions.append(-1)
                 free_objects.append(i + 1)
-        free_objects.append(0)  # add 0 to loop iaw PDF spec
+        free_objects.append(0)  # add 0 to loop in accordance with PDF spec
         return object_positions, free_objects
 
     def _write_xref_table(

From e51ba8ca5526d49c7cb54413af83d6c637c3be66 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Mon, 12 Aug 2024 19:54:08 +0200
Subject: [PATCH 20/26] from review

---
 pypdf/_writer.py | 30 +++++++-----------------------
 1 file changed, 7 insertions(+), 23 deletions(-)

diff --git a/pypdf/_writer.py b/pypdf/_writer.py
index 66b91af071..d1a33bc813 100644
--- a/pypdf/_writer.py
+++ b/pypdf/_writer.py
@@ -382,7 +382,7 @@ def get_object(
             raise ValueError("pdf must be self")
         else:
             obj = self._objects[indirect_reference.idnum - 1]
-        assert obj is not None
+        assert obj is not None  # clarification for mypy
         return obj
 
     def _replace_object(
@@ -403,7 +403,7 @@ def _replace_object(
         self._objects[indirect_reference - 1] = obj
         obj.indirect_reference = IndirectObject(indirect_reference, gen, self)
 
-        assert isinstance(obj, PdfObject)
+        assert isinstance(obj, PdfObject)  # clarification for mypy
         return obj
 
     def _add_page(
@@ -1253,7 +1253,7 @@ def write_stream(self, stream: StreamType) -> None:
                 "It may not be written to correctly.",
                 __name__,
             )
-        # no more used :
+        # deprecated, to be removed in pypdf 6.0.0:
         # if not self._root:
         #   self._root = self._add_object(self._root_object)
         # self._sweep_indirect_references(self._root)
@@ -1375,18 +1375,15 @@ def compress_identical_objects(
         self,
         remove_identicals: bool = True,
         remove_orphans: bool = True,
-        verbose: int = -1,
     ) -> None:
         """
-        Parse the Pdf file and merge objects that have same harsh.
+        Parse the PDF file and merge objects that have the same hash.
         This will make objects common to multiple pages
         Recommended to be used just before writing output
 
         Args:
             remove_identicals: remove of identical objects
             remove_orphans: remove of unreferenced objects
-            verbose: frequence of progress update; <0 => disable
-
         """
 
         def replace_in_obj(
@@ -1411,11 +1408,6 @@ def replace_in_obj(
 
         # _idnum_hash :dict[hash]=(1st_ind_obj,[other_indir_objs,...])
         self._idnum_hash = {}
-        if isinstance(verbose, int):
-            cpt_init = verbose
-        else:
-            cpt_init = -1
-        cpt = cpt_init
         orphans = [True] * len(self._objects)
         # look for similar objects
         for idx, obj in enumerate(self._objects):
@@ -1423,10 +1415,6 @@ def replace_in_obj(
                 continue
             assert isinstance(obj.indirect_reference, IndirectObject)
             h = obj.hash_value()
-            if cpt == 0:
-                print("+", end="", file=sys.stderr)  # noqa: T201
-                cpt = cpt_init
-            cpt -= 1
             if remove_identicals and h in self._idnum_hash:
                 self._idnum_hash[h][1].append(obj.indirect_reference)
                 self._objects[idx] = None
@@ -1443,21 +1431,17 @@ def replace_in_obj(
         # replace reference to merged objects
         for obj in self._objects:
             if isinstance(obj, (DictionaryObject, ArrayObject)):
-                if cpt == 0:
-                    print(".", end="", file=sys.stderr)  # noqa: T201
-                    cpt = cpt_init
-                cpt -= 1
                 replace_in_obj(obj, cnv_rev)
 
         # remove orphans (if applicable)
         orphans[self.root_object.indirect_reference.idnum - 1] = False  # type: ignore
         try:
             orphans[self._info.indirect_reference.idnum - 1] = False  # type: ignore
-        except Exception:  # pragma: no cover
-            pass  # pragma: no cover
+        except AttributeError:
+            pass
         try:
             orphans[self._ID.indirect_reference.idnum - 1] = False  # type: ignore
-        except Exception:
+        except AttributeError:
             pass
         for i in compress(range(len(self._objects)), orphans):
             self._objects[i] = None

From 8d33a6989e055b6309269c1b379631975c698d64 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Mon, 12 Aug 2024 19:57:04 +0200
Subject: [PATCH 21/26] aftermath from review

---
 tests/test_writer.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/test_writer.py b/tests/test_writer.py
index 9ecaa25ca8..5be9b091f5 100644
--- a/tests/test_writer.py
+++ b/tests/test_writer.py
@@ -2309,17 +2309,18 @@ def test_compress_identical_objects():
     name = "iss2794.pdf"
     in_bytes = BytesIO(get_data_from_url(url, name=name))
     writer = PdfWriter(in_bytes)
-    writer.compress_identical_objects(remove_orphans=False, verbose=100)
+    writer.compress_identical_objects(remove_orphans=False)
     out1 = BytesIO()
     writer.write(out1)
     assert 0.5 * len(in_bytes.getvalue()) > len(out1.getvalue())
     writer.remove_page(
         1
     )  # page0 contains fields which keep reference to the deleted page
+    writer._info = None
     out2 = BytesIO()
     writer.write(out2)
     assert len(out1.getvalue()) - 100 < len(out2.getvalue())
-    writer.compress_identical_objects(remove_identicals=False, verbose="fake")
+    writer.compress_identical_objects(remove_identicals=False)
     out3 = BytesIO()
     writer.write(out3)
     assert len(out2.getvalue()) > len(out3.getvalue())

From ab2f2aeb184e63ec642cc08c7e82830429a7cf08 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Mon, 12 Aug 2024 20:24:11 +0200
Subject: [PATCH 22/26] fix

---
 pypdf/_writer.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pypdf/_writer.py b/pypdf/_writer.py
index 760cdbbb34..63c96a50f1 100644
--- a/pypdf/_writer.py
+++ b/pypdf/_writer.py
@@ -31,7 +31,6 @@
 import enum
 import hashlib
 import re
-import sys
 import uuid
 from io import BytesIO, FileIO, IOBase
 from itertools import compress
@@ -62,6 +61,7 @@
     StrByteType,
     StreamType,
     _get_max_pdf_version_header,
+    deprecate,
     deprecate_with_replacement,
     logger_warning,
 )
@@ -1425,7 +1425,6 @@ def replace_in_obj(
         cnv_rev: Dict[IndirectObject, IndirectObject] = {}
         for k, v in cnv.items():
             cnv_rev.update(zip(v, (k,) * len(v)))
-        cpt = cpt_init
 
         # replace reference to merged objects
         for obj in self._objects:

From a9758ae1736adc51cc9bdc120b11a6d451a17e74 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Mon, 12 Aug 2024 21:08:21 +0200
Subject: [PATCH 23/26] MAINT: Improve test coverage (#2796)

---
 tests/test_writer.py | 10 ++++++++++
 tests/test_xmp.py    | 31 ++++++++++++++++++++++++++++++-
 2 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/tests/test_writer.py b/tests/test_writer.py
index 84d84d0db6..eea1c6a488 100644
--- a/tests/test_writer.py
+++ b/tests/test_writer.py
@@ -2188,6 +2188,10 @@ def test_replace_object():
     reader._replace_object(reader.pages[0].indirect_reference, reader.pages[0])
     pg = PageObject.create_blank_page(writer, 1000, 1000)
     reader._replace_object(reader.pages[0].indirect_reference, pg)
+    pg = PageObject.create_blank_page(None, 1000, 1000)
+    pg[NameObject("/Contents")] = writer.pages[0]["/Contents"]
+    writer._add_object(pg)
+    writer.add_page(pg)
 
 
 def test_mime_jupyter():
@@ -2300,3 +2304,9 @@ def test_matrix_entry_in_field_annots():
         auto_regenerate=False,
     )
     assert "/Matrix" in writer.pages[0]["/Annots"][5].get_object()["/AP"]["/N"]
+
+
+def test_set_need_appearances_writer():
+    """Minimal test for coverage"""
+    writer = PdfWriter()
+    writer.set_need_appearances_writer()
diff --git a/tests/test_xmp.py b/tests/test_xmp.py
index f864a9df9d..6615b93c8e 100644
--- a/tests/test_xmp.py
+++ b/tests/test_xmp.py
@@ -7,7 +7,7 @@
 
 import pypdf.generic
 import pypdf.xmp
-from pypdf import PdfReader
+from pypdf import PdfReader, PdfWriter
 from pypdf.errors import PdfReadError
 
 from . import get_data_from_url
@@ -42,6 +42,35 @@ def test_read_xmp_metadata_samples(src):
     }
 
 
+def test_writer_xmp_metadata_samples():
+    writer = PdfWriter(SAMPLE_ROOT / "020-xmp/output_with_metadata_pymupdf.pdf")
+    xmp = writer.xmp_metadata
+    assert xmp
+    assert xmp.dc_contributor == []
+    assert xmp.dc_creator == ["John Doe"]
+    assert xmp.dc_source == "Martin Thoma"  # attribute node
+    assert xmp.dc_description == {"x-default": "This is a text"}
+    assert xmp.dc_date == [datetime(1990, 4, 28, 0, 0)]
+    assert xmp.dc_title == {"x-default": "Sample PDF with XMP Metadata"}
+    assert xmp.custom_properties == {
+        "Style": "FooBarStyle",
+        "other": "worlds",
+        "⏰": "time",
+    }
+    co = pypdf.generic.ContentStream(None, None)
+    co.set_data(
+        xmp.stream.get_data().replace(
+            b'dc:source="Martin Thoma"', b'dc:source="Pubpub-Zz"'
+        )
+    )
+    writer.xmp_metadata = pypdf.xmp.XmpInformation(co)
+    b = BytesIO()
+    writer.write(b)
+    reader = PdfReader(b)
+    xmp2 = reader.xmp_metadata
+    assert xmp2.dc_source == "Pubpub-Zz"
+
+
 @pytest.mark.parametrize(
     ("src", "has_xmp"),
     [

From 892ffd33ee4b3f3236be8e2ccc497b222e261d93 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Mon, 12 Aug 2024 22:10:40 +0200
Subject: [PATCH 24/26] fix

---
 pypdf/_writer.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/pypdf/_writer.py b/pypdf/_writer.py
index 63c96a50f1..d53fd51dd6 100644
--- a/pypdf/_writer.py
+++ b/pypdf/_writer.py
@@ -1433,10 +1433,9 @@ def replace_in_obj(
 
         # remove orphans (if applicable)
         orphans[self.root_object.indirect_reference.idnum - 1] = False  # type: ignore
-        try:
-            orphans[self._info.indirect_reference.idnum - 1] = False  # type: ignore
-        except AttributeError:
-            pass
+
+        orphans[self._info.indirect_reference.idnum - 1] = False  # type: ignore
+
         try:
             orphans[self._ID.indirect_reference.idnum - 1] = False  # type: ignore
         except AttributeError:

From 8a6f3bd25531fa166bc60517ce11a07d12f9a8b8 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Mon, 12 Aug 2024 22:26:26 +0200
Subject: [PATCH 25/26] Update file-size.md

---
 docs/user/file-size.md | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/docs/user/file-size.md b/docs/user/file-size.md
index 8a103d96a3..26a5aacf86 100644
--- a/docs/user/file-size.md
+++ b/docs/user/file-size.md
@@ -14,16 +14,10 @@ For example, if two pages include the same image which is duplicated in the sour
 
 Additionally, when you delete objects in a document, pypdf cannot easily identify whether the objects are used elsewhere or not or if the user wants to keep them in. When writing the PDF file, these objects will be hidden within (part of the file, but not displayed).
 
-In order to reduce the file size, use a compression call: `writer.compress_identical_objects(remove_identicals=True, remove_orphans=True, verbose=-1)`
+In order to reduce the file size, use a compression call: `writer.compress_identical_objects(remove_identicals=True, remove_orphans=True)`
 
 * `remove_identicals` enables/disables compression merging identical objects.
 * `remove_orphans` enables/disables suppression of unused objects.
-* `verbose` sets the value on how many objects are processed.
-
-The progress status (printed on stderr) of the compression is printed as follows:
-
-* `'+'` during initial loop
-* `'.'` when replacing duplicates
 
 It is recommended to apply this process just before writing to file/stream.
 

From 361a291d68f5cb500158dd714ab9d107809059ef Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Mon, 12 Aug 2024 22:35:03 +0200
Subject: [PATCH 26/26] Update test_writer.py

---
 tests/test_writer.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tests/test_writer.py b/tests/test_writer.py
index c2b283c65d..49fe585385 100644
--- a/tests/test_writer.py
+++ b/tests/test_writer.py
@@ -2320,7 +2320,6 @@ def test_compress_identical_objects():
     writer.remove_page(
         1
     )  # page0 contains fields which keep reference to the deleted page
-    writer._info = None
     out2 = BytesIO()
     writer.write(out2)
     assert len(out1.getvalue()) - 100 < len(out2.getvalue())
@@ -2329,7 +2328,7 @@ def test_compress_identical_objects():
     writer.write(out3)
     assert len(out2.getvalue()) > len(out3.getvalue())
 
-    
+
 def test_set_need_appearances_writer():
     """Minimal test for coverage"""
     writer = PdfWriter()