# SPDX-FileCopyrightText: 2017 Free Software Foundation Europe e.V. <https://fsfe.org>
# SPDX-FileCopyrightText: 2020 Tuomas Siipola <tuomas@zpl.fi>
# SPDX-FileCopyrightText: 2022 Carmen Bianca Bakker <carmenbianca@fsfe.org>
# SPDX-FileCopyrightText: 2022 Florian Snow <florian@familysnow.net>
# SPDX-FileCopyrightText: 2022 Nico Rikken <nico.rikken@fsfe.org>
# SPDX-FileCopyrightText: 2022 Pietro Albini <pietro.albini@ferrous-systems.com>
# SPDX-FileCopyrightText: 2023 DB Systel GmbH
# SPDX-FileCopyrightText: 2023 Johannes Zarl-Zierl <johannes@zarl-zierl.at>
# SPDX-FileCopyrightText: 2024 Rivos Inc.
# SPDX-FileCopyrightText: 2024 Skyler Grey <sky@a.starrysky.fyi>
# SPDX-FileCopyrightText: © 2020 Liferay, Inc. <https://liferay.com>
# SPDX-FileCopyrightText: 2025 Simon Barth <simon.barth@gmx.de>
#
# SPDX-License-Identifier: GPL-3.0-or-later
"""Utilities related to the extraction of REUSE information out of files."""
import codecs
import contextlib
import importlib
import logging
import os
import platform
import re
import sys
from encodings import aliases, normalize_encoding
from itertools import chain
from types import ModuleType
from typing import BinaryIO, Generator, Literal, NamedTuple, cast
from .comment import _all_style_classes
from .copyright import (
COPYRIGHT_NOTICE_PATTERN,
CopyrightNotice,
ReuseInfo,
SpdxExpression,
)
from .exceptions import NoEncodingModuleError
from .i18n import _
_LOGGER = logging.getLogger(__name__)
_EncodingModulesLiteral = Literal["magic", "charset_normalizer", "chardet"]
_ENCODING_MODULES = list(_EncodingModulesLiteral.__args__) # type: ignore
if _env_encoding_module := os.environ.get("REUSE_ENCODING_MODULE"):
if _env_encoding_module not in _ENCODING_MODULES:
print(
# TRANSLATORS: Do not translate REUSE_ENCODING_MODULE.
_(
"REUSE_ENCODING_MODULE must have a value in {modules}; it has"
" '{env_module}'. Aborting."
).format(modules=_ENCODING_MODULES, env_module=_env_encoding_module)
)
sys.exit(1)
_ENCODING_MODULES = [_env_encoding_module]
_ENCODING_MODULE: ModuleType | None = None
_MAGIC = None
for _module in _ENCODING_MODULES:
if _module == "magic" and platform.system() == "Windows":
continue
try:
_ENCODING_MODULE = importlib.import_module(_module)
break
except ImportError:
continue
else:
raise NoEncodingModuleError(
_(
"No supported module that can detect the encoding of files could be"
" successfully imported. Re-read the installation instructions for"
" the reuse package, or try the following:"
)
+ "\n\n"
+ _(
"- If you are running a Linux distribution, try your equivalent of"
" `apt install file` or `dnf install file`."
)
+ "\n"
+ _(
"- Run ` pipx install reuse[charset-normalizer]`. Replace 'pipx'"
" with 'pip' if you are not using pipx."
)
)
[docs]
def get_encoding_module() -> ModuleType:
"""Get the module used to detect the encodings of files."""
return cast(ModuleType, _ENCODING_MODULE)
def _get_encoding_module_name() -> str | None:
return getattr(get_encoding_module(), "__name__", None)
[docs]
def set_encoding_module(name: _EncodingModulesLiteral) -> ModuleType:
"""Set the module used to detect the encodings of files, and return the
module.
"""
if name not in _ENCODING_MODULES:
raise NoEncodingModuleError(f"'{name}' is not a valid encoding module.")
try:
# pylint: disable=global-statement
global _ENCODING_MODULE
_ENCODING_MODULE = importlib.import_module(name)
return _ENCODING_MODULE
except ImportError as err:
raise NoEncodingModuleError(f"'{name}' could not be imported.") from err
if get_encoding_module().__name__ == "magic":
_MAGIC = get_encoding_module().Magic(mime_encoding=True)
REUSE_IGNORE_START = "REUSE-IgnoreStart"
REUSE_IGNORE_END = "REUSE-IgnoreEnd"
# REUSE-IgnoreStart
SPDX_SNIPPET_INDICATOR = b"SPDX-SnippetBegin"
_START_PATTERN = r"(?:^.*?)"
_END_PATTERN = r"\s*(?:{})*\s*$".format(
"|".join(
set(
chain(
(
re.escape(style.MULTI_LINE.end)
for style in _all_style_classes()
if style.MULTI_LINE.end
),
# These are special endings which do not belong to specific
# comment styles, but which we want to nonetheless strip away
# while parsing.
(
# ex: <tag value="Copyright Jane Doe">
r'"\s*/*>',
r"'\s*/*>",
# ex: [SPDX-License-Identifier: GPL-3.0-or-later] ::
r"\]\s*::",
),
)
)
)
)
_ALL_MATCH_PATTERN = re.compile(
r"^(?P<prefix>.*?)(?:SPDX-\S+:|Copyright|©).*$",
re.MULTILINE,
)
_COPYRIGHT_NOTICE_PATTERN = re.compile(
_START_PATTERN + COPYRIGHT_NOTICE_PATTERN.pattern + _END_PATTERN,
)
_LICENSE_IDENTIFIER_PATTERN = re.compile(
_START_PATTERN
+ r"SPDX-License-Identifier:\s*(?P<value>.*?)"
+ _END_PATTERN,
)
_CONTRIBUTOR_PATTERN = re.compile(
_START_PATTERN + r"SPDX-FileContributor:\s*(?P<value>.*?)" + _END_PATTERN,
)
# The keys match the relevant attributes of ReuseInfo.
_SPDX_TAGS: dict[str, re.Pattern] = {
"spdx_expressions": _LICENSE_IDENTIFIER_PATTERN,
"contributor_lines": _CONTRIBUTOR_PATTERN,
}
_LICENSEREF_PATTERN = re.compile(r"LicenseRef-[a-zA-Z0-9-.]+$")
_NEWLINE_PATTERN = re.compile(r"\r\n?")
_LINE_ENDINGS = ("\r\n", "\r", "\n")
_LINE_ENDINGS_ASCII = tuple(ending.encode("ascii") for ending in _LINE_ENDINGS)
_LINE_ENDINGS_UTF_16_LE = tuple(
ending.encode("utf_16_le") for ending in _LINE_ENDINGS
)
_LINE_ENDINGS_UTF_32_LE = tuple(
ending.encode("utf_32_le") for ending in _LINE_ENDINGS
)
_LINE_ENDING_ENCODINGS_ASCII = set()
for _name in set(chain.from_iterable(aliases.aliases.items())):
with contextlib.suppress(Exception):
if codecs.encode("\r\n", _name) == b"\r\n":
_LINE_ENDING_ENCODINGS_ASCII.add(_name)
_LINE_ENDING_ENCODINGS_ASCII.add("utf_8_sig")
_LINE_ENDING_ENCODINGS_UTF_16_LE = {
"u16",
"utf16",
"utf_16",
"unicodelittleunmarked",
"utf_16le",
"utf_16_le",
}
_LINE_ENDING_ENCODINGS_UTF_32_LE = {
"u32",
"utf32",
"utf_32",
"utf_32le",
"utf_32_le",
}
#: Default chunk size for reading files.
CHUNK_SIZE = 1024 * 64
#: Default line size for reading files.
LINE_SIZE = 1024
#: Default chunk size used to heuristically detect file type, encoding, et
#: cetera.
HEURISTICS_CHUNK_SIZE = 1024 * 2
[docs]
class FilterBlock(NamedTuple):
"""A simple tuple that holds a block of text, and whether that block of text
is in an ignore block.
"""
text: str
in_ignore_block: bool
[docs]
def filter_ignore_block(
text: str, in_ignore_block: bool = False
) -> FilterBlock:
"""Filter out blocks beginning with REUSE_IGNORE_START and ending with
REUSE_IGNORE_END to remove lines that should not be treated as copyright and
licensing information.
Args:
text: The text out of which the ignore blocks must be filtered.
in_ignore_block: Whether the text is already in an ignore block. This is
useful when you parse subsequent chunks of text, and one chunk does
not close the ignore block.
Returns:
A :class:`FilterBlock` tuple that contains the filtered text and a
boolean that signals whether the ignore block is still open.
"""
ignore_start: int | None = None if not in_ignore_block else 0
ignore_end: int | None = None
if REUSE_IGNORE_START in text:
ignore_start = text.index(REUSE_IGNORE_START)
if REUSE_IGNORE_END in text:
ignore_end = text.index(REUSE_IGNORE_END) + len(REUSE_IGNORE_END)
if ignore_start is None:
return FilterBlock(text, False)
if ignore_end is None:
return FilterBlock(text[:ignore_start], True)
if ignore_start < ignore_end:
text_before_block = text[:ignore_start]
text_after_block, in_ignore_block = filter_ignore_block(
text[ignore_end:], False
)
return FilterBlock(
text_before_block + text_after_block, in_ignore_block
)
rest = text[ignore_start + len(REUSE_IGNORE_START) :]
if REUSE_IGNORE_END in rest:
ignore_end = rest.index(REUSE_IGNORE_END) + len(REUSE_IGNORE_END)
text_before_block = text[:ignore_start]
text_after_block, in_ignore_block = filter_ignore_block(
rest[ignore_end:]
)
return FilterBlock(
text_before_block + text_after_block, in_ignore_block
)
return FilterBlock(text[:ignore_start], True)
def _read_chunks(
fp: BinaryIO,
chunk_size: int = CHUNK_SIZE,
line_size: int = LINE_SIZE,
newline: bytes = b"\n",
) -> Generator[bytes, None, None]:
"""Read and yield somewhat equal-sized chunks from (realistically) a file.
The chunks always split at a newline where possible.
An amount of bytes equal to *chunk_size* is always read into the chunk if
*fp* contains that many bytes. An additional *line_size* or lesser amount of
bytes is also read into the chunk, up to the next newline character.
*newline* is the line separator that is (expected to be) used in the input.
"""
newline_len = len(newline)
while True:
chunk = fp.read(chunk_size)
if not chunk:
break
end_chunk_pos = fp.tell()
remainder = fp.read(line_size)
newline_idx = remainder.find(newline)
if newline_idx != -1:
remainder = remainder[: newline_idx + newline_len]
fp.seek(end_chunk_pos + newline_idx + newline_len)
chunk += remainder
yield chunk
def _detect_encoding_magic(chunk: bytes) -> str | None:
result: str = _MAGIC.from_buffer(chunk) # type: ignore[union-attr]
if result == "binary":
return None
if result == "utf-8" and chunk[:3] == b"\xef\xbb\xbf":
result += "-sig"
# Python and magic disagree on what 'le' means. For magic, it means a UTF-16
# block prefixed with a BOM. For Python, that's what 'utf-16' is, and
# 'utf_16_le' is a little endian block _without_ BOM prefix.
elif result == "utf-16le" and chunk[:2] == b"\xff\xfe":
result = "utf-16"
elif result == "utf-32le" and chunk[:4] == b"\xff\xfe\x00\x00":
result = "utf-32"
else:
# This nifty function gets the (in Python) standardised name for an
# encoding. 'iso-8859-1' becomes 'iso8859-1'.
try:
codec_info = codecs.lookup(result)
result = codec_info.name
except LookupError:
# Fallback.
result = "utf-8"
return normalize_encoding(result)
def _detect_encoding_charset_normalizer(chunk: bytes) -> str | None:
matches = get_encoding_module().from_bytes( # type: ignore[union-attr]
chunk,
)
best = matches.best()
if best is not None:
result: str = best.encoding
if result == "utf_8" and best.bom:
result += "_sig"
return result
return None
def _detect_encoding_chardet(chunk: bytes) -> str | None:
dict_ = get_encoding_module().detect(chunk) # type: ignore[union-attr]
result: str | None = dict_.get("encoding")
if result is None:
return None
try:
codec_info = codecs.lookup(result)
result = codec_info.name
except LookupError:
# Fallback.
result = "utf-8"
return normalize_encoding(result)
[docs]
def detect_encoding(chunk: bytes) -> str | None:
"""Find the encoding of the bytes chunk, and return it as normalised name.
See :func:`encodings.normalize_encoding`. If no encoding could be found,
return :const:`None`.
If the chunk is empty or the encoding of the chunk is ASCII, ``'utf_8'`` is
returned.
"""
# If the file is empty, assume UTF-8.
if not chunk:
return "utf_8"
result: str | None = None
if _get_encoding_module_name() == "magic":
result = _detect_encoding_magic(chunk)
elif _get_encoding_module_name() == "charset_normalizer":
result = _detect_encoding_charset_normalizer(chunk)
elif _get_encoding_module_name() == "chardet":
result = _detect_encoding_chardet(chunk)
else:
# This code should technically never be reached.
raise NoEncodingModuleError()
if result in ["ascii", "us_ascii"]:
result = "utf_8"
return result
[docs]
def detect_newline(chunk: bytes, encoding: str = "ascii") -> str:
"""Return one of ``'\\n'``, ``'\\r'`` or ``'\\r\\n'`` depending on the line
endings used in *chunk*. Return :const:`os.linesep` if there are no line
endings.
"""
line_endings: tuple[bytes, ...] | None = None
encoding = normalize_encoding(encoding.lower())
# This step is part optimalisation, part dealing with BOMs.
if encoding in _LINE_ENDING_ENCODINGS_ASCII:
line_endings = _LINE_ENDINGS_ASCII
elif encoding in _LINE_ENDING_ENCODINGS_UTF_16_LE:
line_endings = _LINE_ENDINGS_UTF_16_LE
elif encoding in _LINE_ENDING_ENCODINGS_UTF_32_LE:
line_endings = _LINE_ENDINGS_UTF_32_LE
if line_endings is not None:
for line_ending_bytes in line_endings:
if line_ending_bytes in chunk:
return line_ending_bytes.decode(encoding)
else:
for line_ending_str in _LINE_ENDINGS:
if line_ending_str.encode(encoding) in chunk:
return line_ending_str
return os.linesep
[docs]
def reuse_info_of_file(
fp: BinaryIO,
chunk_size: int = CHUNK_SIZE,
line_size: int = LINE_SIZE,
) -> ReuseInfo:
"""Read from *fp* to extract REUSE information. It is read in chunks of
*chunk_size*, additionally reading up to *line_size* until the next newline.
This function decodes the binary data into UTF-8 and removes REUSE ignore
blocks before attempting to extract the REUSE information.
"""
position = fp.tell()
heuristics_chunk = fp.read(HEURISTICS_CHUNK_SIZE)
fp.seek(position) # Reset position.
encoding = detect_encoding(heuristics_chunk)
filename = getattr(fp, "name", None)
if encoding is None:
if filename:
_LOGGER.info(
_(
"'{path}' was detected as a binary file; not searching its"
" contents for REUSE information."
).format(path=filename)
)
return ReuseInfo()
newline = detect_newline(heuristics_chunk, encoding=encoding)
if filename:
_LOGGER.debug(
_(
"extracting REUSE information from '{path}'"
" (encoding {encoding}, encoding module {module},"
" newline {newline})"
).format(
path=filename,
encoding=repr(encoding),
module=repr(_get_encoding_module_name()),
newline=repr(newline),
)
)
in_ignore_block = False
reuse_infos: list[ReuseInfo] = []
for chunk in _read_chunks(
fp,
chunk_size=chunk_size,
line_size=line_size,
newline=newline.encode(encoding),
):
text = chunk.decode(encoding, errors="replace")
text = _NEWLINE_PATTERN.sub("\n", text)
text, in_ignore_block = filter_ignore_block(text, in_ignore_block)
reuse_infos.append(extract_reuse_info(text))
return ReuseInfo().union(*reuse_infos)
[docs]
def contains_reuse_info(text: str) -> bool:
"""The text contains REUSE info."""
return bool(extract_reuse_info(filter_ignore_block(text).text))
# REUSE-IgnoreEnd