2.0

author: blackhao <13851610112@163.com> 2025-08-22 02:51:50 -0500
committer: blackhao <13851610112@163.com> 2025-08-22 02:51:50 -0500
commit: 4aab4087dc97906d0b9890035401175cdaab32d4 (patch)
tree: 4e2e9d88a711ec5b1cfa02e8ac72a55183b99123 /.venv/lib/python3.12/site-packages/charset_normalizer/legacy.py
parent: afa8f50d1d21c721dabcb31ad244610946ab65a3 (diff)
1 files changed, 80 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/charset_normalizer/legacy.py b/.venv/lib/python3.12/site-packages/charset_normalizer/legacy.py
new file mode 100644
index 0000000..360a310
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/charset_normalizer/legacy.py
@@ -0,0 +1,80 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
+from warnings import warn
+
+from .api import from_bytes
+from .constant import CHARDET_CORRESPONDENCE, TOO_SMALL_SEQUENCE
+
+# TODO: remove this check when dropping Python 3.7 support
+if TYPE_CHECKING:  # type-checker-only: nothing below is imported or defined at runtime
+    from typing_extensions import TypedDict
+
+    class ResultDict(TypedDict):  # shape of the dict returned by detect()
+        encoding: str | None  # None when detection found no match
+        language: str  # "" when the language is "Unknown" or nothing matched
+        confidence: float | None  # None when detection found no match
+
+
+def detect(
+    byte_str: bytes, should_rename_legacy: bool = False, **kwargs: Any
+) -> ResultDict:
+    """
+    Legacy chardet-style entry point, kept so projects can migrate easily.
+    Guess the encoding of a byte sequence; mostly backward-compatible with chardet.
+    Deprecated, though not planned for removal. Consult the documentation for
+    further information.
+
+    :param byte_str:     The byte sequence to examine (bytes or bytearray).
+    :param should_rename_legacy:  When left as False, a name present in
+                                  CHARDET_CORRESPONDENCE is replaced by its mapped value.
+    :param kwargs:       Ignored; a warning lists any names that were passed.
+    :raises TypeError:   If byte_str is neither bytes nor bytearray.
+    :return: Dict with "encoding", "language" and "confidence" keys; encoding and
+             confidence are None and language is "" when nothing matched.
+    """
+    if kwargs:
+        ignored = ",".join(kwargs)
+        warn(
+            f"charset-normalizer disregard arguments '{ignored}' in legacy function detect()"
+        )
+
+    if not isinstance(byte_str, (bytearray, bytes)):
+        raise TypeError(  # pragma: nocover
+            f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
+        )
+
+    # from_bytes is handed an immutable bytes object, never a bytearray.
+    payload = bytes(byte_str) if isinstance(byte_str, bytearray) else byte_str
+    best_match = from_bytes(payload).best()
+
+    encoding: str | None
+    confidence: float | None
+    if best_match is None:
+        # Nothing matched: no encoding, no language, no confidence.
+        encoding, language, confidence = None, "", None
+    else:
+        encoding = best_match.encoding
+        language = "" if best_match.language == "Unknown" else best_match.language
+        confidence = 1.0 - best_match.chaos
+
+        # Lower a high confidence (>= 0.9) by 0.2 on small byte samples, except
+        # for utf_8/ascii guesses or when bom is anything other than False.
+        # https://github.com/jawah/charset_normalizer/issues/391
+        if (
+            confidence >= 0.9
+            and encoding not in {"utf_8", "ascii"}
+            and best_match.bom is False
+            and len(payload) < TOO_SMALL_SEQUENCE
+        ):
+            confidence -= 0.2
+
+        # The detection/normalization process strips the signature, so restore
+        # the "_sig" suffix here; chardet reports 'utf-8-sig', a valid codec name.
+        if encoding == "utf_8" and best_match.bom:
+            encoding += "_sig"
+
+    if should_rename_legacy is False and encoding in CHARDET_CORRESPONDENCE:
+        encoding = CHARDET_CORRESPONDENCE[encoding]
+
+    return {"encoding": encoding, "language": language, "confidence": confidence}
author	blackhao <13851610112@163.com>	2025-08-22 02:51:50 -0500
committer	blackhao <13851610112@163.com>	2025-08-22 02:51:50 -0500
commit	4aab4087dc97906d0b9890035401175cdaab32d4 (patch)
tree	4e2e9d88a711ec5b1cfa02e8ac72a55183b99123 /.venv/lib/python3.12/site-packages/charset_normalizer/legacy.py
parent	afa8f50d1d21c721dabcb31ad244610946ab65a3 (diff)