AWS-DDNS/Function Source Files/charset_normalizer/api.py

import logging
from os import PathLike
from typing import BinaryIO, List, Optional, Set, Union

from .cd import (
    coherence_ratio,
    encoding_languages,
    mb_encoding_languages,
    merge_coherence_ratios,
)
from .constant import IANA_SUPPORTED, TOO_BIG_SEQUENCE, TOO_SMALL_SEQUENCE, TRACE
from .md import mess_ratio
from .models import CharsetMatch, CharsetMatches
from .utils import (
    any_specified_encoding,
    cut_sequence_chunks,
    iana_name,
    identify_sig_or_bom,
    is_cp_similar,
    is_multi_byte_encoding,
    should_strip_sig_or_bom,
)

# Package-wide logger used by every detection entry point in this module.
# Registering a display name for the custom TRACE level is left disabled
# ("will most likely be controversial"), so TRACE records are rendered with
# logging's default name for an unregistered level.
# logging.addLevelName(TRACE, "TRACE")
logger = logging.getLogger("charset_normalizer")
# Debug handler that from_bytes() attaches to `logger` only for the duration
# of a call made with explain=True; it is never attached at import time.
explain_handler = logging.StreamHandler()
explain_handler.setFormatter(
    logging.Formatter("%(asctime)s | %(levelname)s | %(message)s")
)


def from_bytes(
    sequences: Union[bytes, bytearray],
    steps: int = 5,
    chunk_size: int = 512,
    threshold: float = 0.2,
    cp_isolation: Optional[List[str]] = None,
    cp_exclusion: Optional[List[str]] = None,
    preemptive_behaviour: bool = True,
    explain: bool = False,
    language_threshold: float = 0.1,
    enable_fallback: bool = True,
) -> CharsetMatches:
    """
    Given a raw bytes sequence, return the best possible charsets usable to render str objects.
    If there are no results, it is a strong indicator that the source is binary/not text.
    By default, the process will extract 5 blocks of 512 bytes each to assess the mess and coherence of a
    given sequence, and will give up a particular code page after 20% of measured mess.
    Those criteria are customizable at will.

    The preemptive behaviour DOES NOT replace the traditional detection workflow, it prioritizes a particular
    code page but never takes it for granted. Can improve the performance.

    You may want to focus your attention on some code pages and/or not others, use cp_isolation and
    cp_exclusion for that purpose.

    This function will strip the SIG in the payload/sequence every time except on UTF-16, UTF-32.
    By default the library does not set up any handler other than the NullHandler. If you set the 'explain'
    toggle to True, the logger configuration is altered for the duration of the call: a StreamHandler suitable
    for debugging is attached and the level is lowered to TRACE. Both changes are reverted before this function
    returns *or raises*, and the logger level is restored to exactly what it was before the call.
    Custom logging format and handler can be set manually.

    :param sequences: payload to analyse.
    :param steps: number of chunks probed across the payload.
    :param chunk_size: size (in bytes) of each probed chunk.
    :param threshold: maximum tolerated mess ratio before a code page is given up.
    :param cp_isolation: when set, only those code pages are tested.
    :param cp_exclusion: when set, those code pages are never tested.
    :param preemptive_behaviour: look for an encoding declared inside the payload and test it first.
    :param explain: temporarily emit detailed logs through ``explain_handler``.
    :param language_threshold: threshold forwarded to the coherence (language) detector.
    :param enable_fallback: allow ascii / utf_8 / declared encoding as last-resort matches.
    :raises TypeError: if ``sequences`` is neither ``bytes`` nor ``bytearray``.
    :return: the plausible matches; empty when nothing fits.
    """

    if not isinstance(sequences, (bytearray, bytes)):
        raise TypeError(
            "Expected object of type bytes or bytearray, got: {0}".format(
                type(sequences)
            )
        )

    if explain:
        previous_logger_level: int = logger.level
        logger.addHandler(explain_handler)
        logger.setLevel(TRACE)

    try:
        return _from_bytes_detect(
            sequences,
            steps,
            chunk_size,
            threshold,
            cp_isolation,
            cp_exclusion,
            preemptive_behaviour,
            explain,
            language_threshold,
            enable_fallback,
        )
    finally:
        # Single cleanup point: runs on every return path and also when the
        # detection raises, so the debug handler / TRACE level never leak.
        if explain:
            logger.removeHandler(explain_handler)
            logger.setLevel(previous_logger_level)


def _from_bytes_detect(
    sequences: Union[bytes, bytearray],
    steps: int,
    chunk_size: int,
    threshold: float,
    cp_isolation: Optional[List[str]],
    cp_exclusion: Optional[List[str]],
    preemptive_behaviour: bool,
    explain: bool,
    language_threshold: float,
    enable_fallback: bool,
) -> CharsetMatches:
    """
    Run the actual detection for from_bytes().

    The caller has already validated ``sequences`` and is responsible for setting up
    and tearing down the 'explain' logging configuration; this helper only logs.
    """
    length: int = len(sequences)

    if length == 0:
        logger.debug("Encoding detection on empty bytes, assuming utf_8 intention.")
        return CharsetMatches([CharsetMatch(sequences, "utf_8", 0.0, False, [], "")])

    if cp_isolation is not None:
        logger.log(
            TRACE,
            "cp_isolation is set. use this flag for debugging purpose. "
            "limited list of encoding allowed : %s.",
            ", ".join(cp_isolation),
        )
        cp_isolation = [iana_name(cp, False) for cp in cp_isolation]
    else:
        cp_isolation = []

    if cp_exclusion is not None:
        logger.log(
            TRACE,
            "cp_exclusion is set. use this flag for debugging purpose. "
            "limited list of encoding excluded : %s.",
            ", ".join(cp_exclusion),
        )
        cp_exclusion = [iana_name(cp, False) for cp in cp_exclusion]
    else:
        cp_exclusion = []

    # Payload smaller than the requested sampling plan: probe it as one chunk.
    if length <= (chunk_size * steps):
        logger.log(
            TRACE,
            "override steps (%i) and chunk_size (%i) as content does not fit (%i byte(s) given) parameters.",
            steps,
            chunk_size,
            length,
        )
        steps = 1
        chunk_size = length

    if steps > 1 and length / steps < chunk_size:
        chunk_size = int(length / steps)

    is_too_small_sequence: bool = length < TOO_SMALL_SEQUENCE
    is_too_large_sequence: bool = length >= TOO_BIG_SEQUENCE

    if is_too_small_sequence:
        logger.log(
            TRACE,
            "Trying to detect encoding from a tiny portion of ({}) byte(s).".format(
                length
            ),
        )
    elif is_too_large_sequence:
        logger.log(
            TRACE,
            "Using lazy str decoding because the payload is quite large, ({}) byte(s).".format(
                length
            ),
        )

    # Encodings tried before the generic IANA_SUPPORTED list, in priority order.
    prioritized_encodings: List[str] = []

    specified_encoding: Optional[str] = (
        any_specified_encoding(sequences) if preemptive_behaviour else None
    )

    if specified_encoding is not None:
        prioritized_encodings.append(specified_encoding)
        logger.log(
            TRACE,
            "Detected declarative mark in sequence. Priority +1 given for %s.",
            specified_encoding,
        )

    tested: Set[str] = set()
    tested_but_hard_failure: List[str] = []
    tested_but_soft_failure: List[str] = []

    fallback_ascii: Optional[CharsetMatch] = None
    fallback_u8: Optional[CharsetMatch] = None
    fallback_specified: Optional[CharsetMatch] = None

    results: CharsetMatches = CharsetMatches()

    sig_encoding, sig_payload = identify_sig_or_bom(sequences)

    if sig_encoding is not None:
        prioritized_encodings.append(sig_encoding)
        logger.log(
            TRACE,
            "Detected a SIG or BOM mark on first %i byte(s). Priority +1 given for %s.",
            len(sig_payload),
            sig_encoding,
        )

    prioritized_encodings.append("ascii")

    if "utf_8" not in prioritized_encodings:
        prioritized_encodings.append("utf_8")

    for encoding_iana in prioritized_encodings + IANA_SUPPORTED:
        if cp_isolation and encoding_iana not in cp_isolation:
            continue

        if cp_exclusion and encoding_iana in cp_exclusion:
            continue

        # Prioritized encodings also appear in IANA_SUPPORTED; test each once.
        if encoding_iana in tested:
            continue

        tested.add(encoding_iana)

        decoded_payload: Optional[str] = None
        bom_or_sig_available: bool = sig_encoding == encoding_iana
        strip_sig_or_bom: bool = bom_or_sig_available and should_strip_sig_or_bom(
            encoding_iana
        )

        if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available:
            logger.log(
                TRACE,
                "Encoding %s won't be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.",
                encoding_iana,
            )
            continue
        if encoding_iana in {"utf_7"} and not bom_or_sig_available:
            logger.log(
                TRACE,
                "Encoding %s won't be tested as-is because detection is unreliable without BOM/SIG.",
                encoding_iana,
            )
            continue

        try:
            is_multi_byte_decoder: bool = is_multi_byte_encoding(encoding_iana)
        except (ModuleNotFoundError, ImportError):
            logger.log(
                TRACE,
                "Encoding %s does not provide an IncrementalDecoder",
                encoding_iana,
            )
            continue

        # Hard check: the payload must decode strictly with this code page.
        # For large single-byte payloads only a prefix is decoded here and the
        # result is discarded (decoded_payload stays None, i.e. lazy decoding).
        try:
            if is_too_large_sequence and is_multi_byte_decoder is False:
                str(
                    sequences[: int(50e4)]
                    if strip_sig_or_bom is False
                    else sequences[len(sig_payload) : int(50e4)],
                    encoding=encoding_iana,
                )
            else:
                decoded_payload = str(
                    sequences
                    if strip_sig_or_bom is False
                    else sequences[len(sig_payload) :],
                    encoding=encoding_iana,
                )
        except (UnicodeDecodeError, LookupError) as e:
            if not isinstance(e, LookupError):
                logger.log(
                    TRACE,
                    "Code page %s does not fit given bytes sequence at ALL. %s",
                    encoding_iana,
                    str(e),
                )
            tested_but_hard_failure.append(encoding_iana)
            continue

        # Skip code pages that are near-duplicates of one that already failed
        # the mess probing: they would fail the same way.
        similar_soft_failure_test: bool = False

        for encoding_soft_failed in tested_but_soft_failure:
            if is_cp_similar(encoding_iana, encoding_soft_failed):
                similar_soft_failure_test = True
                break

        if similar_soft_failure_test:
            logger.log(
                TRACE,
                "%s is deemed too similar to code page %s and was consider unsuited already. Continuing!",
                encoding_iana,
                encoding_soft_failed,
            )
            continue

        # Byte offsets at which chunks are sampled (after the SIG/BOM if any).
        r_ = range(
            0 if not bom_or_sig_available else len(sig_payload),
            length,
            int(length / steps),
        )

        multi_byte_bonus: bool = (
            is_multi_byte_decoder
            and decoded_payload is not None
            and len(decoded_payload) < length
        )

        if multi_byte_bonus:
            logger.log(
                TRACE,
                "Code page %s is a multi byte encoding table and it appear that at least one character "
                "was encoded using n-bytes.",
                encoding_iana,
            )

        # Number of over-threshold chunks tolerated before giving up (min. 2).
        max_chunk_gave_up: int = int(len(r_) / 4)

        max_chunk_gave_up = max(max_chunk_gave_up, 2)
        early_stop_count: int = 0
        lazy_str_hard_failure = False

        md_chunks: List[str] = []
        md_ratios = []

        try:
            for chunk in cut_sequence_chunks(
                sequences,
                encoding_iana,
                r_,
                chunk_size,
                bom_or_sig_available,
                strip_sig_or_bom,
                sig_payload,
                is_multi_byte_decoder,
                decoded_payload,
            ):
                md_chunks.append(chunk)

                md_ratios.append(
                    mess_ratio(
                        chunk,
                        threshold,
                        explain is True and 1 <= len(cp_isolation) <= 2,
                    )
                )

                if md_ratios[-1] >= threshold:
                    early_stop_count += 1

                if (early_stop_count >= max_chunk_gave_up) or (
                    bom_or_sig_available and strip_sig_or_bom is False
                ):
                    break
        except (
            UnicodeDecodeError
        ) as e:  # Lazy str loading may have missed something there
            logger.log(
                TRACE,
                "LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
                encoding_iana,
                str(e),
            )
            early_stop_count = max_chunk_gave_up
            lazy_str_hard_failure = True

        # We might want to check the sequence again with the whole content
        # Only if initial MD tests passes
        # NOTE(review): the prefix check above stops at int(50e4) while this
        # one starts at int(50e3); the ranges overlap so nothing is skipped,
        # but the differing constants look unintentional - confirm.
        if (
            not lazy_str_hard_failure
            and is_too_large_sequence
            and not is_multi_byte_decoder
        ):
            try:
                sequences[int(50e3) :].decode(encoding_iana, errors="strict")
            except UnicodeDecodeError as e:
                logger.log(
                    TRACE,
                    "LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %s",
                    encoding_iana,
                    str(e),
                )
                tested_but_hard_failure.append(encoding_iana)
                continue

        mean_mess_ratio: float = sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
        if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up:
            tested_but_soft_failure.append(encoding_iana)
            logger.log(
                TRACE,
                "%s was excluded because of initial chaos probing. Gave up %i time(s). "
                "Computed mean chaos is %f %%.",
                encoding_iana,
                early_stop_count,
                round(mean_mess_ratio * 100, ndigits=3),
            )
            # Preparing those fallbacks in case we got nothing.
            if (
                enable_fallback
                and encoding_iana in ["ascii", "utf_8", specified_encoding]
                and not lazy_str_hard_failure
            ):
                fallback_entry = CharsetMatch(
                    sequences, encoding_iana, threshold, False, [], decoded_payload
                )
                if encoding_iana == specified_encoding:
                    fallback_specified = fallback_entry
                elif encoding_iana == "ascii":
                    fallback_ascii = fallback_entry
                else:
                    fallback_u8 = fallback_entry
            continue

        logger.log(
            TRACE,
            "%s passed initial chaos probing. Mean measured chaos is %f %%",
            encoding_iana,
            round(mean_mess_ratio * 100, ndigits=3),
        )

        if not is_multi_byte_decoder:
            target_languages: List[str] = encoding_languages(encoding_iana)
        else:
            target_languages = mb_encoding_languages(encoding_iana)

        if target_languages:
            logger.log(
                TRACE,
                "{} should target any language(s) of {}".format(
                    encoding_iana, str(target_languages)
                ),
            )

        cd_ratios = []

        # We shall skip the CD when its about ASCII
        # Most of the time its not relevant to run "language-detection" on it.
        if encoding_iana != "ascii":
            for chunk in md_chunks:
                chunk_languages = coherence_ratio(
                    chunk,
                    language_threshold,
                    ",".join(target_languages) if target_languages else None,
                )

                cd_ratios.append(chunk_languages)

        cd_ratios_merged = merge_coherence_ratios(cd_ratios)

        if cd_ratios_merged:
            logger.log(
                TRACE,
                "We detected language {} using {}".format(
                    cd_ratios_merged, encoding_iana
                ),
            )

        results.append(
            CharsetMatch(
                sequences,
                encoding_iana,
                mean_mess_ratio,
                bom_or_sig_available,
                cd_ratios_merged,
                decoded_payload,
            )
        )

        # Short-circuit: a clean match on a prioritized encoding ends the search.
        if (
            encoding_iana in [specified_encoding, "ascii", "utf_8"]
            and mean_mess_ratio < 0.1
        ):
            logger.debug(
                "Encoding detection: %s is most likely the one.", encoding_iana
            )
            return CharsetMatches([results[encoding_iana]])

        # Short-circuit: an encoding announced by a BOM/SIG that passed probing.
        if encoding_iana == sig_encoding:
            logger.debug(
                "Encoding detection: %s is most likely the one as we detected a BOM or SIG within "
                "the beginning of the sequence.",
                encoding_iana,
            )
            return CharsetMatches([results[encoding_iana]])

    if len(results) == 0:
        if fallback_u8 or fallback_ascii or fallback_specified:
            logger.log(
                TRACE,
                "Nothing got out of the detection process. Using ASCII/UTF-8/Specified fallback.",
            )

        # Fallback precedence: declared encoding, then utf_8, then ascii.
        if fallback_specified:
            logger.debug(
                "Encoding detection: %s will be used as a fallback match",
                fallback_specified.encoding,
            )
            results.append(fallback_specified)
        elif fallback_u8 is not None:
            # The former three-way condition (including a fingerprint
            # comparison with the ascii fallback) always reduced to this test.
            logger.debug("Encoding detection: utf_8 will be used as a fallback match")
            results.append(fallback_u8)
        elif fallback_ascii:
            logger.debug("Encoding detection: ascii will be used as a fallback match")
            results.append(fallback_ascii)

    if results:
        logger.debug(
            "Encoding detection: Found %s as plausible (best-candidate) for content. With %i alternatives.",
            results.best().encoding,  # type: ignore
            len(results) - 1,
        )
    else:
        logger.debug("Encoding detection: Unable to determine any suitable charset.")

    return results


def from_fp(
    fp: BinaryIO,
    steps: int = 5,
    chunk_size: int = 512,
    threshold: float = 0.20,
    cp_isolation: Optional[List[str]] = None,
    cp_exclusion: Optional[List[str]] = None,
    preemptive_behaviour: bool = True,
    explain: bool = False,
    language_threshold: float = 0.1,
    enable_fallback: bool = True,
) -> CharsetMatches:
    """
    Detect the charset of an already opened binary file object.

    Everything remaining in ``fp`` is read in one go and handed to from_bytes together
    with the unchanged detection options. The file object is left open for the caller.
    """
    payload = fp.read()
    return from_bytes(
        payload,
        steps=steps,
        chunk_size=chunk_size,
        threshold=threshold,
        cp_isolation=cp_isolation,
        cp_exclusion=cp_exclusion,
        preemptive_behaviour=preemptive_behaviour,
        explain=explain,
        language_threshold=language_threshold,
        enable_fallback=enable_fallback,
    )


def from_path(
    path: Union[str, bytes, PathLike],  # type: ignore[type-arg]
    steps: int = 5,
    chunk_size: int = 512,
    threshold: float = 0.20,
    cp_isolation: Optional[List[str]] = None,
    cp_exclusion: Optional[List[str]] = None,
    preemptive_behaviour: bool = True,
    explain: bool = False,
    language_threshold: float = 0.1,
    enable_fallback: bool = True,
) -> CharsetMatches:
    """
    Detect the charset of the file located at ``path``.

    The file is opened in binary mode, delegated to from_fp with the unchanged
    detection options, and closed again before returning.
    Can raise IOError.
    """
    with open(path, "rb") as binary_file:
        return from_fp(
            binary_file,
            steps=steps,
            chunk_size=chunk_size,
            threshold=threshold,
            cp_isolation=cp_isolation,
            cp_exclusion=cp_exclusion,
            preemptive_behaviour=preemptive_behaviour,
            explain=explain,
            language_threshold=language_threshold,
            enable_fallback=enable_fallback,
        )


def is_binary(
    fp_or_path_or_payload: Union[PathLike, str, BinaryIO, bytes],  # type: ignore[type-arg]
    steps: int = 5,
    chunk_size: int = 512,
    threshold: float = 0.20,
    cp_isolation: Optional[List[str]] = None,
    cp_exclusion: Optional[List[str]] = None,
    preemptive_behaviour: bool = True,
    explain: bool = False,
    language_threshold: float = 0.1,
    enable_fallback: bool = False,
) -> bool:
    """
    Tell whether the given input (path, raw payload or binary file object) looks binary, i.e. not text.

    The input is routed to from_path (str / PathLike), from_bytes (bytes / bytearray) or
    from_fp (anything else) with the same options; it is reported as binary when that
    detection yields no match. Fallback matches are disabled by default here so that
    ASCII-compatible content that is unlikely to be text is judged more strictly.
    """
    # Pick the entry point matching the input kind; the check order matters
    # and mirrors the str/PathLike -> bytes/bytearray -> file object cascade.
    if isinstance(fp_or_path_or_payload, (str, PathLike)):
        detect = from_path
    elif isinstance(fp_or_path_or_payload, (bytes, bytearray)):
        detect = from_bytes
    else:
        detect = from_fp

    guesses = detect(
        fp_or_path_or_payload,
        steps=steps,
        chunk_size=chunk_size,
        threshold=threshold,
        cp_isolation=cp_isolation,
        cp_exclusion=cp_exclusion,
        preemptive_behaviour=preemptive_behaviour,
        explain=explain,
        language_threshold=language_threshold,
        enable_fallback=enable_fallback,
    )

    return not guesses
Inital Commit 6 months ago			`import logging`
			`from os import PathLike`
			`from typing import BinaryIO, List, Optional, Set, Union`

			`from .cd import (`
			`coherence_ratio,`
			`encoding_languages,`
			`mb_encoding_languages,`
			`merge_coherence_ratios,`
			`)`
			`from .constant import IANA_SUPPORTED, TOO_BIG_SEQUENCE, TOO_SMALL_SEQUENCE, TRACE`
			`from .md import mess_ratio`
			`from .models import CharsetMatch, CharsetMatches`
			`from .utils import (`
			`any_specified_encoding,`
			`cut_sequence_chunks,`
			`iana_name,`
			`identify_sig_or_bom,`
			`is_cp_similar,`
			`is_multi_byte_encoding,`
			`should_strip_sig_or_bom,`
			`)`

			`# Will most likely be controversial`
			`# logging.addLevelName(TRACE, "TRACE")`
			`logger = logging.getLogger("charset_normalizer")`
			`explain_handler = logging.StreamHandler()`
			`explain_handler.setFormatter(`
			`logging.Formatter("%(asctime)s \| %(levelname)s \| %(message)s")`
			`)`


			`def from_bytes(`
			`sequences: Union[bytes, bytearray],`
			`steps: int = 5,`
			`chunk_size: int = 512,`
			`threshold: float = 0.2,`
			`cp_isolation: Optional[List[str]] = None,`
			`cp_exclusion: Optional[List[str]] = None,`
			`preemptive_behaviour: bool = True,`
			`explain: bool = False,`
			`language_threshold: float = 0.1,`
			`enable_fallback: bool = True,`
			`) -> CharsetMatches:`
			`"""`
			`Given a raw bytes sequence, return the best possibles charset usable to render str objects.`
			`If there is no results, it is a strong indicator that the source is binary/not text.`
			`By default, the process will extract 5 blocks of 512o each to assess the mess and coherence of a given sequence.`
			`And will give up a particular code page after 20% of measured mess. Those criteria are customizable at will.`

			`The preemptive behavior DOES NOT replace the traditional detection workflow, it prioritize a particular code page`
			`but never take it for granted. Can improve the performance.`

			`You may want to focus your attention to some code page or/and not others, use cp_isolation and cp_exclusion for that`
			`purpose.`

			`This function will strip the SIG in the payload/sequence every time except on UTF-16, UTF-32.`
			`By default the library does not setup any handler other than the NullHandler, if you choose to set the 'explain'`
			`toggle to True it will alter the logger configuration to add a StreamHandler that is suitable for debugging.`
			`Custom logging format and handler can be set manually.`
			`"""`

			`if not isinstance(sequences, (bytearray, bytes)):`
			`raise TypeError(`
			`"Expected object of type bytes or bytearray, got: {0}".format(`
			`type(sequences)`
			`)`
			`)`

			`if explain:`
			`previous_logger_level: int = logger.level`
			`logger.addHandler(explain_handler)`
			`logger.setLevel(TRACE)`

			`length: int = len(sequences)`

			`if length == 0:`
			`logger.debug("Encoding detection on empty bytes, assuming utf_8 intention.")`
			`if explain:`
			`logger.removeHandler(explain_handler)`
			`logger.setLevel(previous_logger_level or logging.WARNING)`
			`return CharsetMatches([CharsetMatch(sequences, "utf_8", 0.0, False, [], "")])`

			`if cp_isolation is not None:`
			`logger.log(`
			`TRACE,`
			`"cp_isolation is set. use this flag for debugging purpose. "`
			`"limited list of encoding allowed : %s.",`
			`", ".join(cp_isolation),`
			`)`
			`cp_isolation = [iana_name(cp, False) for cp in cp_isolation]`
			`else:`
			`cp_isolation = []`

			`if cp_exclusion is not None:`
			`logger.log(`
			`TRACE,`
			`"cp_exclusion is set. use this flag for debugging purpose. "`
			`"limited list of encoding excluded : %s.",`
			`", ".join(cp_exclusion),`
			`)`
			`cp_exclusion = [iana_name(cp, False) for cp in cp_exclusion]`
			`else:`
			`cp_exclusion = []`

			`if length <= (chunk_size * steps):`
			`logger.log(`
			`TRACE,`
			`"override steps (%i) and chunk_size (%i) as content does not fit (%i byte(s) given) parameters.",`
			`steps,`
			`chunk_size,`
			`length,`
			`)`
			`steps = 1`
			`chunk_size = length`

			`if steps > 1 and length / steps < chunk_size:`
			`chunk_size = int(length / steps)`

			`is_too_small_sequence: bool = len(sequences) < TOO_SMALL_SEQUENCE`
			`is_too_large_sequence: bool = len(sequences) >= TOO_BIG_SEQUENCE`

			`if is_too_small_sequence:`
			`logger.log(`
			`TRACE,`
			`"Trying to detect encoding from a tiny portion of ({}) byte(s).".format(`
			`length`
			`),`
			`)`
			`elif is_too_large_sequence:`
			`logger.log(`
			`TRACE,`
			`"Using lazy str decoding because the payload is quite large, ({}) byte(s).".format(`
			`length`
			`),`
			`)`

			`prioritized_encodings: List[str] = []`

			`specified_encoding: Optional[str] = (`
			`any_specified_encoding(sequences) if preemptive_behaviour else None`
			`)`

			`if specified_encoding is not None:`
			`prioritized_encodings.append(specified_encoding)`
			`logger.log(`
			`TRACE,`
			`"Detected declarative mark in sequence. Priority +1 given for %s.",`
			`specified_encoding,`
			`)`

			`tested: Set[str] = set()`
			`tested_but_hard_failure: List[str] = []`
			`tested_but_soft_failure: List[str] = []`

			`fallback_ascii: Optional[CharsetMatch] = None`
			`fallback_u8: Optional[CharsetMatch] = None`
			`fallback_specified: Optional[CharsetMatch] = None`

			`results: CharsetMatches = CharsetMatches()`

			`sig_encoding, sig_payload = identify_sig_or_bom(sequences)`

			`if sig_encoding is not None:`
			`prioritized_encodings.append(sig_encoding)`
			`logger.log(`
			`TRACE,`
			`"Detected a SIG or BOM mark on first %i byte(s). Priority +1 given for %s.",`
			`len(sig_payload),`
			`sig_encoding,`
			`)`

			`prioritized_encodings.append("ascii")`

			`if "utf_8" not in prioritized_encodings:`
			`prioritized_encodings.append("utf_8")`

			`for encoding_iana in prioritized_encodings + IANA_SUPPORTED:`
			`if cp_isolation and encoding_iana not in cp_isolation:`
			`continue`

			`if cp_exclusion and encoding_iana in cp_exclusion:`
			`continue`

			`if encoding_iana in tested:`
			`continue`

			`tested.add(encoding_iana)`

			`decoded_payload: Optional[str] = None`
			`bom_or_sig_available: bool = sig_encoding == encoding_iana`
			`strip_sig_or_bom: bool = bom_or_sig_available and should_strip_sig_or_bom(`
			`encoding_iana`
			`)`

			`if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available:`
			`logger.log(`
			`TRACE,`
			`"Encoding %s won't be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.",`
			`encoding_iana,`
			`)`
			`continue`
			`if encoding_iana in {"utf_7"} and not bom_or_sig_available:`
			`logger.log(`
			`TRACE,`
			`"Encoding %s won't be tested as-is because detection is unreliable without BOM/SIG.",`
			`encoding_iana,`
			`)`
			`continue`

			`try:`
			`is_multi_byte_decoder: bool = is_multi_byte_encoding(encoding_iana)`
			`except (ModuleNotFoundError, ImportError):`
			`logger.log(`
			`TRACE,`
			`"Encoding %s does not provide an IncrementalDecoder",`
			`encoding_iana,`
			`)`
			`continue`

			`try:`
			`if is_too_large_sequence and is_multi_byte_decoder is False:`
			`str(`
			`sequences[: int(50e4)]`
			`if strip_sig_or_bom is False`
			`else sequences[len(sig_payload) : int(50e4)],`
			`encoding=encoding_iana,`
			`)`
			`else:`
			`decoded_payload = str(`
			`sequences`
			`if strip_sig_or_bom is False`
			`else sequences[len(sig_payload) :],`
			`encoding=encoding_iana,`
			`)`
			`except (UnicodeDecodeError, LookupError) as e:`
			`if not isinstance(e, LookupError):`
			`logger.log(`
			`TRACE,`
			`"Code page %s does not fit given bytes sequence at ALL. %s",`
			`encoding_iana,`
			`str(e),`
			`)`
			`tested_but_hard_failure.append(encoding_iana)`
			`continue`

			`similar_soft_failure_test: bool = False`

			`for encoding_soft_failed in tested_but_soft_failure:`
			`if is_cp_similar(encoding_iana, encoding_soft_failed):`
			`similar_soft_failure_test = True`
			`break`

			`if similar_soft_failure_test:`
			`logger.log(`
			`TRACE,`
			`"%s is deemed too similar to code page %s and was consider unsuited already. Continuing!",`
			`encoding_iana,`
			`encoding_soft_failed,`
			`)`
			`continue`

			`r_ = range(`
			`0 if not bom_or_sig_available else len(sig_payload),`
			`length,`
			`int(length / steps),`
			`)`

			`multi_byte_bonus: bool = (`
			`is_multi_byte_decoder`
			`and decoded_payload is not None`
			`and len(decoded_payload) < length`
			`)`

			`if multi_byte_bonus:`
			`logger.log(`
			`TRACE,`
			`"Code page %s is a multi byte encoding table and it appear that at least one character "`
			`"was encoded using n-bytes.",`
			`encoding_iana,`
			`)`

			`max_chunk_gave_up: int = int(len(r_) / 4)`

			`max_chunk_gave_up = max(max_chunk_gave_up, 2)`
			`early_stop_count: int = 0`
			`lazy_str_hard_failure = False`

			`md_chunks: List[str] = []`
			`md_ratios = []`

			`try:`
			`for chunk in cut_sequence_chunks(`
			`sequences,`
			`encoding_iana,`
			`r_,`
			`chunk_size,`
			`bom_or_sig_available,`
			`strip_sig_or_bom,`
			`sig_payload,`
			`is_multi_byte_decoder,`
			`decoded_payload,`
			`):`
			`md_chunks.append(chunk)`

			`md_ratios.append(`
			`mess_ratio(`
			`chunk,`
			`threshold,`
			`explain is True and 1 <= len(cp_isolation) <= 2,`
			`)`
			`)`

			`if md_ratios[-1] >= threshold:`
			`early_stop_count += 1`

			`if (early_stop_count >= max_chunk_gave_up) or (`
			`bom_or_sig_available and strip_sig_or_bom is False`
			`):`
			`break`
			`except (`
			`UnicodeDecodeError`
			`) as e: # Lazy str loading may have missed something there`
			`logger.log(`
			`TRACE,`
			`"LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",`
			`encoding_iana,`
			`str(e),`
			`)`
			`early_stop_count = max_chunk_gave_up`
			`lazy_str_hard_failure = True`

			`# We might want to check the sequence again with the whole content`
			`# Only if initial MD tests passes`
			`if (`
			`not lazy_str_hard_failure`
			`and is_too_large_sequence`
			`and not is_multi_byte_decoder`
			`):`
			`try:`
			`sequences[int(50e3) :].decode(encoding_iana, errors="strict")`
			`except UnicodeDecodeError as e:`
			`logger.log(`
			`TRACE,`
			`"LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %s",`
			`encoding_iana,`
			`str(e),`
			`)`
			`tested_but_hard_failure.append(encoding_iana)`
			`continue`

			`mean_mess_ratio: float = sum(md_ratios) / len(md_ratios) if md_ratios else 0.0`
			`if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up:`
			`tested_but_soft_failure.append(encoding_iana)`
			`logger.log(`
			`TRACE,`
			`"%s was excluded because of initial chaos probing. Gave up %i time(s). "`
			`"Computed mean chaos is %f %%.",`
			`encoding_iana,`
			`early_stop_count,`
			`round(mean_mess_ratio * 100, ndigits=3),`
			`)`
			`# Preparing those fallbacks in case we got nothing.`
			`if (`
			`enable_fallback`
			`and encoding_iana in ["ascii", "utf_8", specified_encoding]`
			`and not lazy_str_hard_failure`
			`):`
			`fallback_entry = CharsetMatch(`
			`sequences, encoding_iana, threshold, False, [], decoded_payload`
			`)`
			`if encoding_iana == specified_encoding:`
			`fallback_specified = fallback_entry`
			`elif encoding_iana == "ascii":`
			`fallback_ascii = fallback_entry`
			`else:`
			`fallback_u8 = fallback_entry`
			`continue`

			`logger.log(`
			`TRACE,`
			`"%s passed initial chaos probing. Mean measured chaos is %f %%",`
			`encoding_iana,`
			`round(mean_mess_ratio * 100, ndigits=3),`
			`)`

			`if not is_multi_byte_decoder:`
			`target_languages: List[str] = encoding_languages(encoding_iana)`
			`else:`
			`target_languages = mb_encoding_languages(encoding_iana)`

			`if target_languages:`
			`logger.log(`
			`TRACE,`
			`"{} should target any language(s) of {}".format(`
			`encoding_iana, str(target_languages)`
			`),`
			`)`

			`cd_ratios = []`

			`# We shall skip the CD when its about ASCII`
			`# Most of the time its not relevant to run "language-detection" on it.`
			`if encoding_iana != "ascii":`
			`for chunk in md_chunks:`
			`chunk_languages = coherence_ratio(`
			`chunk,`
			`language_threshold,`
			`",".join(target_languages) if target_languages else None,`
			`)`

			`cd_ratios.append(chunk_languages)`

			`cd_ratios_merged = merge_coherence_ratios(cd_ratios)`

			`if cd_ratios_merged:`
			`logger.log(`
			`TRACE,`
			`"We detected language {} using {}".format(`
			`cd_ratios_merged, encoding_iana`
			`),`
			`)`

			`results.append(`
			`CharsetMatch(`
			`sequences,`
			`encoding_iana,`
			`mean_mess_ratio,`
			`bom_or_sig_available,`
			`cd_ratios_merged,`
			`decoded_payload,`
			`)`
			`)`

			`if (`
			`encoding_iana in [specified_encoding, "ascii", "utf_8"]`
			`and mean_mess_ratio < 0.1`
			`):`
			`logger.debug(`
			`"Encoding detection: %s is most likely the one.", encoding_iana`
			`)`
			`if explain:`
			`logger.removeHandler(explain_handler)`
			`logger.setLevel(previous_logger_level)`
			`return CharsetMatches([results[encoding_iana]])`

			`if encoding_iana == sig_encoding:`
			`logger.debug(`
			`"Encoding detection: %s is most likely the one as we detected a BOM or SIG within "`
			`"the beginning of the sequence.",`
			`encoding_iana,`
			`)`
			`if explain:`
			`logger.removeHandler(explain_handler)`
			`logger.setLevel(previous_logger_level)`
			`return CharsetMatches([results[encoding_iana]])`

			`if len(results) == 0:`
			`if fallback_u8 or fallback_ascii or fallback_specified:`
			`logger.log(`
			`TRACE,`
			`"Nothing got out of the detection process. Using ASCII/UTF-8/Specified fallback.",`
			`)`

			`if fallback_specified:`
			`logger.debug(`
			`"Encoding detection: %s will be used as a fallback match",`
			`fallback_specified.encoding,`
			`)`
			`results.append(fallback_specified)`
			`elif (`
			`(fallback_u8 and fallback_ascii is None)`
			`or (`
			`fallback_u8`
			`and fallback_ascii`
			`and fallback_u8.fingerprint != fallback_ascii.fingerprint`
			`)`
			`or (fallback_u8 is not None)`
			`):`
			`logger.debug("Encoding detection: utf_8 will be used as a fallback match")`
			`results.append(fallback_u8)`
			`elif fallback_ascii:`
			`logger.debug("Encoding detection: ascii will be used as a fallback match")`
			`results.append(fallback_ascii)`

			`if results:`
			`logger.debug(`
			`"Encoding detection: Found %s as plausible (best-candidate) for content. With %i alternatives.",`
			`results.best().encoding, # type: ignore`
			`len(results) - 1,`
			`)`
			`else:`
			`logger.debug("Encoding detection: Unable to determine any suitable charset.")`

			`if explain:`
			`logger.removeHandler(explain_handler)`
			`logger.setLevel(previous_logger_level)`

			`return results`


			def from_fp(
			    fp: BinaryIO,
			    steps: int = 5,
			    chunk_size: int = 512,
			    threshold: float = 0.20,
			    cp_isolation: Optional[List[str]] = None,
			    cp_exclusion: Optional[List[str]] = None,
			    preemptive_behaviour: bool = True,
			    explain: bool = False,
			    language_threshold: float = 0.1,
			    enable_fallback: bool = True,
			) -> CharsetMatches:
			    """
			    Run the from_bytes detection on the content of an already opened binary stream.

			    Everything returned by a single fp.read() call is handed over to from_bytes, and
			    every other argument is forwarded to it untouched.
			    The stream is never closed here; that stays the responsibility of the caller.
			    """
			    # Read everything the stream has left; from_bytes works on an in-memory payload.
			    payload = fp.read()

			    return from_bytes(
			        payload,
			        steps=steps,
			        chunk_size=chunk_size,
			        threshold=threshold,
			        cp_isolation=cp_isolation,
			        cp_exclusion=cp_exclusion,
			        preemptive_behaviour=preemptive_behaviour,
			        explain=explain,
			        language_threshold=language_threshold,
			        enable_fallback=enable_fallback,
			    )


			def from_path(
			    path: Union[str, bytes, PathLike],  # type: ignore[type-arg]
			    steps: int = 5,
			    chunk_size: int = 512,
			    threshold: float = 0.20,
			    cp_isolation: Optional[List[str]] = None,
			    cp_exclusion: Optional[List[str]] = None,
			    preemptive_behaviour: bool = True,
			    explain: bool = False,
			    language_threshold: float = 0.1,
			    enable_fallback: bool = True,
			) -> CharsetMatches:
			    """
			    Open the file located at the given path in binary mode and run from_fp on it.

			    All the other arguments are forwarded to from_fp as-is.
			    The file is closed before returning, including when the detection raises.
			    Errors raised while opening the file (IOError / OSError) are not caught here.
			    """
			    with open(path, "rb") as binary_stream:
			        matches = from_fp(
			            binary_stream,
			            steps=steps,
			            chunk_size=chunk_size,
			            threshold=threshold,
			            cp_isolation=cp_isolation,
			            cp_exclusion=cp_exclusion,
			            preemptive_behaviour=preemptive_behaviour,
			            explain=explain,
			            language_threshold=language_threshold,
			            enable_fallback=enable_fallback,
			        )

			    return matches


			def is_binary(
			    fp_or_path_or_payload: Union[PathLike, str, BinaryIO, bytes],  # type: ignore[type-arg]
			    steps: int = 5,
			    chunk_size: int = 512,
			    threshold: float = 0.20,
			    cp_isolation: Optional[List[str]] = None,
			    cp_exclusion: Optional[List[str]] = None,
			    preemptive_behaviour: bool = True,
			    explain: bool = False,
			    language_threshold: float = 0.1,
			    enable_fallback: bool = False,
			) -> bool:
			    """
			    Tell whether the given input (path, raw payload or opened binary stream) looks binary,
			    meaning that the detection could not come up with any charset match for it.

			    The input is dispatched on its type: str / PathLike goes through from_path,
			    bytes / bytearray goes through from_bytes, and anything else is handed to from_fp.
			    Same keyword arguments and defaults as those functions, except that enable_fallback
			    defaults to False here so that fallback matches do not hide unlikely-text content.
			    """
			    # Options shared by the three detection entry points.
			    detection_options = {
			        "steps": steps,
			        "chunk_size": chunk_size,
			        "threshold": threshold,
			        "cp_isolation": cp_isolation,
			        "cp_exclusion": cp_exclusion,
			        "preemptive_behaviour": preemptive_behaviour,
			        "explain": explain,
			        "language_threshold": language_threshold,
			        "enable_fallback": enable_fallback,
			    }

			    # Order matters: a bytes object is always a payload here, never a file system path.
			    if isinstance(fp_or_path_or_payload, (str, PathLike)):
			        detector = from_path
			    elif isinstance(fp_or_path_or_payload, (bytes, bytearray)):
			        detector = from_bytes
			    else:
			        detector = from_fp

			    guesses = detector(fp_or_path_or_payload, **detection_options)

			    # No match at all is the signal that the content is not text.
			    return not guesses