CodSpeedHQ · coco-speed · Jul 28, 2024 · Jul 31, 2024 · Aug 2, 2024 · Aug 2, 2024
diff --git a/.github/workflows/github-ci.yaml b/.github/workflows/github-ci.yaml
@@ -57,12 +57,12 @@ jobs:
     runs-on: ubuntu-20.04
     strategy:
       matrix:
-        python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"]
+        python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13-dev"]
         use-crypto-lib: ["cryptography"]
         include:
-          - python-version: "3.7"
+          - python-version: "3.8"
             use-crypto-lib: "pycryptodome"
-          - python-version: "3.7"
+          - python-version: "3.8"
             use-crypto-lib: "none"
     steps:
     - name: Update APT packages
@@ -83,14 +83,14 @@ jobs:
         key: cache-downloaded-files
     - name: Setup Python
       uses: actions/setup-python@v5
-      if: matrix.python-version == '3.7' || matrix.python-version == '3.8' || matrix.python-version == '3.9' || matrix.python-version == '3.10'
+      if: matrix.python-version == '3.8' || matrix.python-version == '3.9' || matrix.python-version == '3.10'
       with:
         python-version: ${{ matrix.python-version }}
         cache: 'pip'
         cache-dependency-path: '**/requirements/ci.txt'
     - name: Setup Python (3.11+)
       uses: actions/setup-python@v5
-      if: matrix.python-version == '3.11' || matrix.python-version == '3.12'
+      if: matrix.python-version == '3.11' || matrix.python-version == '3.12' || matrix.python-version == '3.13-dev'
       with:
         python-version: ${{ matrix.python-version }}
         allow-prereleases: true
@@ -102,11 +102,11 @@ jobs:
     - name: Install requirements (Python 3)
       run: |
         pip install -r requirements/ci.txt
-      if: matrix.python-version == '3.7' || matrix.python-version == '3.8' || matrix.python-version == '3.9' || matrix.python-version == '3.10'
+      if: matrix.python-version == '3.8' || matrix.python-version == '3.9' || matrix.python-version == '3.10'
     - name: Install requirements (Python 3.11+)
       run: |
         pip install -r requirements/ci-3.11.txt
-      if: matrix.python-version == '3.11' || matrix.python-version == '3.12'
+      if: matrix.python-version == '3.11' || matrix.python-version == '3.12' || matrix.python-version == '3.13-dev'
     - name: Remove pycryptodome and cryptography
       run: |
         pip uninstall pycryptodome cryptography -y

diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
@@ -12,6 +12,9 @@ on:
 permissions:
   contents: write
 
+env:
+  HEAD_COMMIT_MESSAGE: ${{ github.event.head_commit.message }}
+
 jobs:
   build_and_publish:
     name: Publish a new version
@@ -24,15 +27,15 @@ jobs:
       - name: Extract version from commit message
         id: extract_version
         run: |
-          VERSION=$(echo "${{ github.event.head_commit.message }}" | grep -oP '(?<=REL: )\d+\.\d+\.\d+')
+          VERSION=$(echo "$HEAD_COMMIT_MESSAGE" | grep -oP '(?<=REL: )\d+\.\d+\.\d+')
           echo "version=$VERSION" >> $GITHUB_OUTPUT
 
       - name: Extract tag message from commit message
         id: extract_message
         run: |
           VERSION="${{ steps.extract_version.outputs.version }}"
           delimiter="$(openssl rand -hex 8)"
-          MESSAGE=$(echo "${{ github.event.head_commit.message }}" | sed "0,/REL: $VERSION/s///" )
+          MESSAGE=$(echo "$HEAD_COMMIT_MESSAGE" | sed "0,/REL: $VERSION/s///" )
           echo "message<<${delimiter}" >> $GITHUB_OUTPUT
           echo "$MESSAGE" >> $GITHUB_OUTPUT
           echo "${delimiter}" >> $GITHUB_OUTPUT

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
@@ -19,6 +19,7 @@ history and [GitHub's 'Contributors' feature](https://github.com/py-pdf/pypdf/gr
 * [ediamondscience](https://github.com/ediamondscience)
 * [Ermeson, Felipe](https://github.com/FelipeErmeson)
 * [Freitag, François](https://github.com/francoisfreitag)
+* [Gagnon, William G.](https://github.com/williamgagnon)
 * [Górny, Michał](https://github.com/mgorny)
 * [Grillo, Miguel](https://github.com/Ineffable22)
 * [Gutteridge, David H.](https://github.com/dhgutteridge)

diff --git a/docs/user/file-size.md b/docs/user/file-size.md
@@ -9,23 +9,17 @@ Some PDF documents contain the same object multiple times. For example, if an
 image appears three times in a PDF it could be embedded three times. Or it can
 be embedded once and referenced twice.
 
-This can be done by reading and writing the file:
+When adding data to a PdfWriter, the data is copied while respecting the original format.
+For example, if two pages include the same image and that image is stored twice in the source document, it will also be stored twice in the PdfWriter object.
 
-```python
-from pypdf import PdfReader, PdfWriter
-
-reader = PdfReader("big-old-file.pdf")
-writer = PdfWriter()
+Additionally, when you delete objects in a document, pypdf cannot easily identify whether the objects are used elsewhere or whether the user wants to keep them. When writing the PDF file, these objects remain hidden inside it (part of the file, but not displayed).
 
-for page in reader.pages:
-    writer.add_page(page)
+In order to reduce the file size, use a compression call: `writer.compress_identical_objects(remove_identicals=True, remove_orphans=True)`
 
-if reader.metadata is not None:
-    writer.add_metadata(reader.metadata)
+* `remove_identicals` enables/disables merging of identical objects.
+* `remove_orphans` enables/disables removal of unused objects.
 
-with open("smaller-new-file.pdf", "wb") as fp:
-    writer.write(fp)
-```
+It is recommended to apply this process just before writing to file/stream.
 
 It depends on the PDF how well this works, but we have seen an 86% file
 reduction (from 5.7 MB to 0.8 MB) within a real PDF.

diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py
@@ -3,11 +3,10 @@
 from typing import Any, Dict, List, Tuple, Union, cast
 
 from ._codecs import adobe_glyphs, charset_encoding
-from ._utils import b_, logger_error, logger_warning
+from ._utils import logger_error, logger_warning
 from .generic import (
     DecodedStreamObject,
     DictionaryObject,
-    IndirectObject,
     NullObject,
     StreamObject,
 )
@@ -258,7 +257,7 @@ def prepare_cm(ft: DictionaryObject) -> bytes:
     tu = ft["/ToUnicode"]
     cm: bytes
     if isinstance(tu, StreamObject):
-        cm = b_(cast(DecodedStreamObject, ft["/ToUnicode"]).get_data())
+        cm = cast(DecodedStreamObject, ft["/ToUnicode"]).get_data()
     elif isinstance(tu, str) and tu.startswith("/Identity"):
         # the full range 0000-FFFF will be processed
         cm = b"beginbfrange\n<0000> <0001> <0000>\nendbfrange"
@@ -448,34 +447,27 @@ def compute_space_width(
             en: int = cast(int, ft["/LastChar"])
             if st > space_code or en < space_code:
                 raise Exception("Not in range")
-            if w[space_code - st] == 0:
+            if w[space_code - st].get_object() == 0:
                 raise Exception("null width")
-            sp_width = w[space_code - st]
+            sp_width = w[space_code - st].get_object()
         except Exception:
             if "/FontDescriptor" in ft and "/MissingWidth" in cast(
                 DictionaryObject, ft["/FontDescriptor"]
             ):
-                sp_width = ft["/FontDescriptor"]["/MissingWidth"]  # type: ignore
+                sp_width = ft["/FontDescriptor"]["/MissingWidth"].get_object()  # type: ignore
             else:
                 # will consider width of char as avg(width)/2
                 m = 0
                 cpt = 0
-                for x in w:
-                    if x > 0:
-                        m += x
+                for xx in w:
+                    xx = xx.get_object()
+                    if xx > 0:
+                        m += xx
                         cpt += 1
                 sp_width = m / max(1, cpt) / 2
 
-    if isinstance(sp_width, IndirectObject):
-        # According to
-        # 'Table 122 - Entries common to all font descriptors (continued)'
-        # the MissingWidth should be a number, but according to #2286 it can
-        # be an indirect object
-        obj = sp_width.get_object()
-        if obj is None or isinstance(obj, NullObject):
-            return 0.0
-        return obj  # type: ignore
-
+    if sp_width is None or isinstance(sp_width, NullObject):
+        sp_width = 0.0
     return sp_width
 
 

diff --git a/pypdf/_doc_common.py b/pypdf/_doc_common.py
@@ -49,7 +49,6 @@
 from ._page import PageObject, _VirtualList
 from ._page_labels import index2label as page_index2page_label
 from ._utils import (
-    b_,
     deprecate_with_replacement,
     logger_warning,
     parse_iso8824_date,
@@ -1258,7 +1257,7 @@ def xfa(self) -> Optional[Dict[str, Any]]:
                 if isinstance(f, IndirectObject):
                     field = cast(Optional[EncodedStreamObject], f.get_object())
                     if field:
-                        es = zlib.decompress(b_(field._data))
+                        es = zlib.decompress(field._data)
                         retval[tag] = es
         return retval
 

diff --git a/pypdf/_encryption.py b/pypdf/_encryption.py
@@ -43,7 +43,7 @@
     rc4_encrypt,
 )
 
-from ._utils import b_, logger_warning
+from ._utils import logger_warning
 from .generic import (
     ArrayObject,
     ByteStringObject,
@@ -78,7 +78,7 @@ def encrypt_object(self, obj: PdfObject) -> PdfObject:
         elif isinstance(obj, StreamObject):
             obj2 = StreamObject()
             obj2.update(obj)
-            obj2.set_data(self.stm_crypt.encrypt(b_(obj._data)))
+            obj2.set_data(self.stm_crypt.encrypt(obj._data))
             for key, value in obj.items():  # Dont forget the Stream dict.
                 obj2[key] = self.encrypt_object(value)
             obj = obj2
@@ -96,7 +96,7 @@ def decrypt_object(self, obj: PdfObject) -> PdfObject:
             data = self.str_crypt.decrypt(obj.original_bytes)
             obj = create_string_object(data)
         elif isinstance(obj, StreamObject):
-            obj._data = self.stm_crypt.decrypt(b_(obj._data))
+            obj._data = self.stm_crypt.decrypt(obj._data)
             for key, value in obj.items():  # Dont forget the Stream dict.
                 obj[key] = self.decrypt_object(value)
         elif isinstance(obj, DictionaryObject):

diff --git a/pypdf/_merger.py b/pypdf/_merger.py
@@ -46,7 +46,6 @@
 from ._utils import (
     StrByteType,
     deprecate_with_replacement,
-    str_,
 )
 from ._writer import PdfWriter
 from .constants import GoToActionArguments, TypArguments, TypFitArguments
@@ -82,6 +81,15 @@ def __init__(self, pagedata: PageObject, src: PdfReader, id: int) -> None:
         self.id = id
 
 
+# transferred from _utils, as this function is only required here
+# and the merger will soon be deprecated
+def str_(b: Any) -> str:  # pragma: no cover
+    if isinstance(b, bytes):
+        return b.decode("latin-1")
+    else:
+        return str(b)  # will return b.__str__() if defined
+
+
 class PdfMerger:
     """
     Use :class:`PdfWriter` instead.

diff --git a/pypdf/_page.py b/pypdf/_page.py
@@ -28,7 +28,6 @@
 # POSSIBILITY OF SUCH DAMAGE.
 
 import math
-import sys
 from decimal import Decimal
 from pathlib import Path
 from typing import (
@@ -38,6 +37,7 @@
     Iterable,
     Iterator,
     List,
+    Literal,
     Optional,
     Sequence,
     Set,
@@ -85,12 +85,6 @@
     StreamObject,
 )
 
-if sys.version_info >= (3, 8):
-    from typing import Literal
-else:
-    from typing_extensions import Literal
-
-
 MERGE_CROP_BOX = "cropbox"  # pypdf<=3.4.0 used 'trimbox'
 
 
@@ -852,7 +846,7 @@ def _add_transformation_matrix(
                     FloatObject(e),
                     FloatObject(f),
                 ],
-                " cm",
+                b"cm",
             ],
         )
         return contents
@@ -870,7 +864,7 @@ def _get_contents_as_bytes(self) -> Optional[bytes]:
             if isinstance(obj, list):
                 return b"".join(x.get_object().get_data() for x in obj)
             else:
-                return cast(bytes, cast(EncodedStreamObject, obj).get_data())
+                return cast(EncodedStreamObject, obj).get_data()
         else:
             return None
 
@@ -1063,11 +1057,11 @@ def _merge_page(
                             rect.height,
                         ],
                     ),
-                    "re",
+                    b"re",
                 ),
             )
-            page2content.operations.insert(1, ([], "W"))
-            page2content.operations.insert(2, ([], "n"))
+            page2content.operations.insert(1, ([], b"W"))
+            page2content.operations.insert(2, ([], b"n"))
             if page2transformation is not None:
                 page2content = page2transformation(page2content)
             page2content = PageObject._content_stream_rename(
@@ -1201,11 +1195,11 @@ def _merge_page_writer(
                             rect.height,
                         ],
                     ),
-                    "re",
+                    b"re",
                 ),
             )
-            page2content.operations.insert(1, ([], "W"))
-            page2content.operations.insert(2, ([], "n"))
+            page2content.operations.insert(1, ([], b"W"))
+            page2content.operations.insert(2, ([], b"n"))
             if page2transformation is not None:
                 page2content = page2transformation(page2content)
             page2content = PageObject._content_stream_rename(

diff --git a/pypdf/_protocols.py b/pypdf/_protocols.py
@@ -2,13 +2,7 @@
 
 from abc import abstractmethod
 from pathlib import Path
-from typing import IO, Any, Dict, List, Optional, Tuple, Union
-
-try:
-    # Python 3.8+: https://peps.python.org/pep-0586
-    from typing import Protocol
-except ImportError:
-    from typing_extensions import Protocol  # type: ignore[assignment]
+from typing import IO, Any, Dict, List, Optional, Protocol, Tuple, Union
 
 from ._utils import StrByteType, StreamType
 

diff --git a/pypdf/_reader.py b/pypdf/_reader.py
@@ -51,7 +51,6 @@
 from ._utils import (
     StrByteType,
     StreamType,
-    b_,
     logger_warning,
     read_non_whitespace,
     read_previous_line,
@@ -328,7 +327,7 @@ def _get_object_from_stream(
         assert cast(str, obj_stm["/Type"]) == "/ObjStm"
         # /N is the number of indirect objects in the stream
         assert idx < obj_stm["/N"]
-        stream_data = BytesIO(b_(obj_stm.get_data()))
+        stream_data = BytesIO(obj_stm.get_data())
         for i in range(obj_stm["/N"]):  # type: ignore
             read_non_whitespace(stream_data)
             stream_data.seek(-1, 1)
@@ -932,7 +931,7 @@ def _read_pdf15_xref_stream(
         xrefstream = cast(ContentStream, read_object(stream, self))
         assert cast(str, xrefstream["/Type"]) == "/XRef"
         self.cache_indirect_object(generation, idnum, xrefstream)
-        stream_data = BytesIO(b_(xrefstream.get_data()))
+        stream_data = BytesIO(xrefstream.get_data())
         # Index pairs specify the subsections in the dictionary. If
         # none create one subsection that spans everything.
         idx_pairs = xrefstream.get("/Index", [0, xrefstream.get("/Size")])

diff --git a/pypdf/_text_extraction/_layout_mode/_fixed_width_page.py b/pypdf/_text_extraction/_layout_mode/_fixed_width_page.py
@@ -1,22 +1,16 @@
 """Extract PDF text preserving the layout of the source PDF"""
 
-import sys
 from itertools import groupby
 from math import ceil
 from pathlib import Path
-from typing import Any, Dict, Iterator, List, Optional, Tuple
+from typing import Any, Dict, Iterator, List, Literal, Optional, Tuple, TypedDict
 
 from ..._utils import logger_warning
 from .. import LAYOUT_NEW_BT_GROUP_SPACE_WIDTHS
 from ._font import Font
 from ._text_state_manager import TextStateManager
 from ._text_state_params import TextStateParams
 
-if sys.version_info >= (3, 8):
-    from typing import Literal, TypedDict
-else:
-    from typing_extensions import Literal, TypedDict
-
 
 class BTGroup(TypedDict):
     """