From 99f89494064b46b4d11be02b7716a2d1712ecf8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=C3=A9tous?= Date: Tue, 28 Apr 2020 13:50:45 +0200 Subject: [PATCH] Improve encoding detection of the XML TV file The encoding of the XML TV file can be found either: 1. with a regular expression on the encoding declaration in the XML file 2. or using the chardet.detect() function on the content of the XML file Option 1 is improved by updating the regex: an encoding declaration using single quotes is now detected as well. Option 2 can take several minutes on huge files, so if the file has more than 50,000 characters, the detection is done only on the first 50,000 characters (which should be enough to detect the encoding). --- main.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/main.py b/main.py index 2e1f2be..32a4e1a 100644 --- a/main.py +++ b/main.py @@ -2924,11 +2924,16 @@ def xmltv(): data = f.read() f.close() - match = re.search('<\?xml.*?encoding="(.*?)"',data,flags=(re.I|re.DOTALL)) + match = re.search(r'<\?xml.*?encoding=["\'](.*?)["\']',data,flags=(re.I|re.DOTALL)) if match: encoding = match.group(1) else: - chardet_encoding = chardet.detect(data) + # Improve performance by limiting the detection of the encoding + # to the first 50k characters if the XML file is bigger + if len(data) > 50000: + chardet_encoding = chardet.detect(data[:50000]) + else: + chardet_encoding = chardet.detect(data) encoding = chardet_encoding['encoding'] data = data.decode(encoding) @@ -3013,11 +3018,16 @@ def xmltv(): data = f.read() f.close() - match = re.search('<\?xml.*?encoding="(.*?)"',data,flags=(re.I|re.DOTALL)) + match = re.search(r'<\?xml.*?encoding=["\'](.*?)["\']',data,flags=(re.I|re.DOTALL)) if match: encoding = match.group(1) else: - chardet_encoding = chardet.detect(data) + # Improve performance by limiting the detection of the encoding + # to the first 50k characters if the XML file is bigger + if len(data) > 50000: + 
chardet_encoding = chardet.detect(data[:50000]) + else: + chardet_encoding = chardet.detect(data) encoding = chardet_encoding['encoding'] data = data.decode(encoding)