# SPDX-FileCopyrightText: 2017 Free Software Foundation Europe e.V. <https://fsfe.org>
# SPDX-FileCopyrightText: 2022 Florian Snow <florian@familysnow.net>
# SPDX-FileCopyrightText: 2022 Pietro Albini <pietro.albini@ferrous-systems.com>
# SPDX-FileCopyrightText: 2023 DB Systel GmbH
# SPDX-FileCopyrightText: 2023 Carmen Bianca BAKKER <carmenbianca@fsfe.org>
# SPDX-FileCopyrightText: 2024 Kerry McAdams <github@klmcadams>
# SPDX-FileCopyrightText: 2024 Sebastien Morais <github@SMoraisAnsys>
# SPDX-FileCopyrightText: 2025 Simon Barth <simon.barth@gmx.de>
#
# SPDX-License-Identifier: GPL-3.0-or-later
"""Module that contains reports about files and projects for linting."""
import bdb
import contextlib
import datetime
import logging
import random
from collections import defaultdict
from collections.abc import Collection, Generator
from concurrent.futures import ProcessPoolExecutor
from functools import cached_property
from hashlib import md5
from io import StringIO
from os import cpu_count
from pathlib import Path, PurePath
from typing import Any, Final, NamedTuple, Optional, Protocol, cast
from uuid import uuid4
from . import __REUSE_version__, __version__
from ._util import (
_add_plus_to_identifier,
_checksum,
_strip_plus_from_identifier,
)
from .copyright import SpdxExpression
from .extract import _LICENSEREF_PATTERN
from .global_licensing import ReuseDep5
from .i18n import _
from .project import Project, ReuseInfo
from .types import StrPath
# Module-level logger; used by _process_error to report per-file failures.
_LOGGER = logging.getLogger(__name__)
#: Value emitted under the ``lint_version`` key of
#: :meth:`ProjectReport.to_dict_lint`.
LINT_VERSION = "1.0"
# Number of CPUs as reported by os.cpu_count(), falling back to 1 when that
# returns None. Used for the default of the *multiprocessing* parameters and
# for sizing work chunks.
_CPU_COUNT: Final[int] = cpu_count() or 1
#: This variable exists to be able to override parallelisation. If set to
#: :const:`False`, file reports are generated sequentially, even when a caller
#: asks for multiprocessing.
ENABLE_PARALLEL = True
# REUSE-IgnoreStart
class _MultiprocessingContainer:
    """Callable that produces a :class:`_MultiprocessingResult` for one file.

    The container stores everything needed to call
    :meth:`FileReport.generate` so that it can be handed to a worker (see
    :func:`_generate_file_reports`).
    """

    def __init__(
        self, project: Project, do_checksum: bool, add_license_concluded: bool
    ):
        self.do_checksum = do_checksum
        self.add_license_concluded = add_license_concluded
        # Filled in lazily by __call__ when a dep5 file has to be re-parsed.
        self.reuse_dep5: ReuseDep5 | None = None

        if isinstance(project.global_licensing, ReuseDep5):
            # Remember that a dep5_copyright was (or was not) set prior.
            self.has_dep5 = bool(project.global_licensing)
            self.project = self._without_global_licensing(project)
        else:
            self.has_dep5 = False
            self.project = project

    @staticmethod
    def _without_global_licensing(project: Project) -> Project:
        """Return a copy of *project* whose global licensing is unset."""
        # TODO: We create a copy of the project in the following
        # song-and-dance because the debian Copyright object cannot be
        # pickled.
        clone = Project(
            project.root,
            vcs_strategy=project.vcs_strategy,
            license_map=project.license_map,
            licenses=project.licenses.copy(),
            # TODO: adjust this method/class to account for REUSE.toml as
            # well. Unset dep5_copyright
            global_licensing=None,
            include_submodules=project.include_submodules,
            include_meson_subprojects=project.include_meson_subprojects,
        )
        clone.licenses_without_extension = project.licenses_without_extension
        return clone

    def __call__(self, file_: StrPath) -> "_MultiprocessingResult":
        """Generate the report for *file_*.

        Any :class:`Exception` raised by :meth:`FileReport.generate` is
        captured in the result instead of being propagated.
        """
        # By remembering that we've parsed the .reuse/dep5, we only parse it
        # once (the first time) inside of each process.
        dep5_pending = self.has_dep5 and not self.reuse_dep5
        if dep5_pending:
            # Best effort: a failure to parse leaves the project without
            # global licensing.
            with contextlib.suppress(Exception):
                self.reuse_dep5 = ReuseDep5.from_file(
                    self.project.root / ".reuse/dep5"
                )
                self.project.global_licensing = self.reuse_dep5

        # pylint: disable=broad-except
        try:
            report = FileReport.generate(
                self.project,
                file_,
                do_checksum=self.do_checksum,
                add_license_concluded=self.add_license_concluded,
            )
        except Exception as exc:
            return _MultiprocessingResult(file_, None, exc)
        return _MultiprocessingResult(file_, report, None)
class _MultiprocessingResult(NamedTuple):
    """Outcome of a single :class:`_MultiprocessingContainer` call.

    As produced by the container, either *report* is set and *error* is
    :const:`None`, or *report* is :const:`None` and *error* holds the
    exception that prevented the report from being generated.
    """

    # The file that was processed.
    path: StrPath
    # The generated report, or None on failure.
    report: Optional["FileReport"]
    # The captured exception, or None on success.
    error: Optional[Exception]
def _generate_file_reports(
    project: Project,
    do_checksum: bool = True,
    subset_files: Collection[StrPath] | None = None,
    multiprocessing: bool = _CPU_COUNT > 1,
    add_license_concluded: bool = False,
) -> Generator[_MultiprocessingResult, None, None]:
    """Yield one :class:`_MultiprocessingResult` per file of *project*.

    Args:
        project: The project whose files are inspected.
        do_checksum: Forwarded to :meth:`FileReport.generate`.
        subset_files: If not :const:`None`, only the files selected by
            ``project.subset_files(subset_files)`` are processed; otherwise
            ``project.all_files()`` is used.
        multiprocessing: Whether to distribute the work over a process pool.
            Only honoured while :data:`ENABLE_PARALLEL` is true.
        add_license_concluded: Forwarded to :meth:`FileReport.generate`.
    """
    worker = _MultiprocessingContainer(
        project, do_checksum, add_license_concluded
    )
    if subset_files is None:
        candidates = project.all_files()
    else:
        candidates = project.subset_files(subset_files)

    if not (multiprocessing and ENABLE_PARALLEL):
        yield from map(worker, candidates)
        return

    # The pool needs a sized collection to derive a chunk size from.
    unique_files = frozenset(candidates)
    chunk = max(1, int(len(unique_files) / _CPU_COUNT / 4))
    with ProcessPoolExecutor() as pool:
        yield from pool.map(worker, unique_files, chunksize=chunk)
def _process_error(error: Exception, path: StrPath) -> None:
    """Log *error*, which was captured while processing *path*.

    :class:`bdb.BdbQuit` and :class:`KeyboardInterrupt` are re-raised instead
    of logged. :class:`OSError` and :class:`UnicodeError` are logged as read
    failures; everything else is logged as an unexpected parsing error.
    """
    # Facilitate better debugging by being able to quit the program.
    if isinstance(error, (bdb.BdbQuit, KeyboardInterrupt)):
        raise error

    if isinstance(error, (OSError, UnicodeError)):
        template = _("Could not read '{path}'")
    else:
        template = _("Unexpected error occurred while parsing '{path}'")
    _LOGGER.error(template.format(path=path), exc_info=error)
class ProjectReportSubsetProtocol(Protocol):
    """Structural type for the part of :class:`ProjectReport` that is also
    provided by :class:`ProjectSubsetReport`.
    """

    # Set to the project root by the generate() classmethods in this module.
    path: StrPath
    # Paths for which no file report could be generated.
    read_errors: set[Path]
    # One report per successfully processed file.
    file_reports: set["FileReport"]

    @property
    def missing_licenses(self) -> dict[str, set[Path]]:
        """Mapping of each license identifier that has no file in the
        LICENSES/ directory to the files that refer to it.
        """

    @property
    def invalid_spdx_expressions(self) -> dict[Path, set[str]]:
        """Mapping of file path to the invalid expressions found in it."""

    @property
    def files_without_licenses(self) -> set[Path]:
        """Paths of files for which no licensing information was found."""

    @property
    def files_without_copyright(self) -> set[Path]:
        """Paths of files for which no copyright information was found."""

    @property
    def is_compliant(self) -> bool:
        """Whether the report subset is compliant with the REUSE Spec."""
class ProjectReport:
    """Object that holds linting report about the project.

    Instances are normally created through :meth:`generate`. Most derived
    views (:attr:`used_licenses`, :attr:`missing_licenses`, ...) are cached on
    first access, so they should only be read once :attr:`file_reports` and
    :attr:`licenses` are fully populated.
    """

    def __init__(self, do_checksum: bool = True):
        self.path: StrPath = ""
        self.licenses: dict[str, Path] = {}
        self.read_errors: set[Path] = set()
        self.file_reports: set[FileReport] = set()
        self.licenses_without_extension: dict[str, Path] = {}
        self.do_checksum = do_checksum
        self._license_map: dict[str, dict] = {}

    def to_dict_lint(self) -> dict[str, Any]:
        """Collects and formats data relevant to linting from report and returns
        it as a dictionary.

        The three version keys come first, followed by the remaining keys in
        alphabetical order, with ``recommendations`` last. The entries of
        ``files`` are ordered by report name so that the output does not
        depend on set iteration order.

        Returns:
            Dictionary containing data from the ProjectReport object.
        """
        number_of_files = len(self.file_reports)
        return {
            "lint_version": LINT_VERSION,
            "reuse_spec_version": __REUSE_version__,
            "reuse_tool_version": __version__,
            "files": [
                file_report.to_dict_lint()
                for file_report in sorted(
                    self.file_reports, key=lambda report: report.name
                )
            ],
            "non_compliant": {
                "bad_licenses": sorted(self.bad_licenses),
                "deprecated_licenses": sorted(
                    str(file) for file in self.deprecated_licenses
                ),
                "licenses_without_extension": sorted(
                    self.licenses_without_extension
                ),
                "missing_licenses": sorted(self.missing_licenses),
                "unused_licenses": sorted(
                    str(file) for file in self.unused_licenses
                ),
                "read_errors": sorted(str(file) for file in self.read_errors),
                "missing_copyright_info": sorted(
                    str(file) for file in self.files_without_copyright
                ),
                "missing_licensing_info": sorted(
                    str(file) for file in self.files_without_licenses
                ),
            },
            "summary": {
                "used_licenses": sorted(self.used_licenses),
                "files_total": number_of_files,
                "files_with_copyright_info": number_of_files
                - len(self.files_without_copyright),
                "files_with_licensing_info": number_of_files
                - len(self.files_without_licenses),
                "compliant": self.is_compliant,
            },
            "recommendations": self.recommendations,
        }

    def bill_of_materials(
        self,
        creator_person: str | None = None,
        creator_organization: str | None = None,
    ) -> str:
        """Generate a bill of materials from the project.

        See https://spdx.org/specifications.

        Args:
            creator_person: Value for the ``Creator: Person:`` tag, rendered
                through ``format_creator``.
            creator_organization: Value for the ``Creator: Organization:``
                tag, rendered through ``format_creator``.

        Returns:
            The document in SPDX tag-value form.

        Raises:
            OSError: If the text of a ``LicenseRef-`` license cannot be read.
        """
        out = StringIO()

        # Write mandatory tags
        out.write("SPDXVersion: SPDX-2.1\n")
        out.write("DataLicense: CC0-1.0\n")
        out.write("SPDXID: SPDXRef-DOCUMENT\n")

        out.write(f"DocumentName: {Path(self.path).resolve().name}\n")
        # TODO: Generate UUID from git revision maybe
        # TODO: Fix the URL
        out.write(
            f"DocumentNamespace: http://spdx.org/spdxdocs/spdx-v2.1-{uuid4()}\n"
        )

        # Author
        out.write(f"Creator: Person: {format_creator(creator_person)}\n")
        out.write(
            f"Creator: Organization: {format_creator(creator_organization)}\n"
        )
        out.write(f"Creator: Tool: reuse-{__version__}\n")

        now = datetime.datetime.now(tz=datetime.timezone.utc)
        out.write(f"Created: {now.strftime('%Y-%m-%dT%H:%M:%SZ')}\n")
        out.write(
            "CreatorComment: <text>This document was created automatically"
            " using available reuse information consistent with"
            " REUSE.</text>\n"
        )

        # Sort by name so that the document layout is stable.
        reports = sorted(self.file_reports, key=lambda x: x.name)

        for report in reports:
            out.write(
                "Relationship: SPDXRef-DOCUMENT DESCRIBES"
                f" {report.spdx_id}\n"
            )

        for report in reports:
            out.write("\n")
            out.write(f"FileName: {report.name}\n")
            out.write(f"SPDXID: {report.spdx_id}\n")
            out.write(f"FileChecksum: SHA1: {report.chk_sum}\n")
            out.write(f"LicenseConcluded: {report.license_concluded}\n")

            for lic in sorted(report.licenses_in_file):
                out.write(f"LicenseInfoInFile: {lic}\n")
            if report.copyright:
                out.write(
                    f"FileCopyrightText: <text>{report.copyright}</text>\n"
                )
            else:
                out.write("FileCopyrightText: NONE\n")

        # Licenses
        for lic, path in sorted(self.licenses.items()):
            # Only identifiers matching the LicenseRef pattern get their text
            # embedded.
            if _LICENSEREF_PATTERN.match(lic):
                out.write("\n")
                out.write(f"LicenseID: {lic}\n")
                out.write("LicenseName: NOASSERTION\n")

                with (Path(self.path) / path).open(encoding="utf-8") as fp:
                    out.write(f"ExtractedText: <text>{fp.read()}</text>\n")

        return out.getvalue()

    @classmethod
    def generate(
        cls,
        project: Project,
        do_checksum: bool = True,
        multiprocessing: bool = _CPU_COUNT > 1,
        add_license_concluded: bool = False,
    ) -> "ProjectReport":
        """Generate a :class:`ProjectReport` from a :class:`Project`.

        Files whose report could not be generated are logged and recorded in
        :attr:`read_errors` instead of :attr:`file_reports`.

        Args:
            project: The :class:`Project` to lint.
            do_checksum: Generate a checksum of every file. If this is
                :const:`False`, generate a random checksum for every file.
            multiprocessing: Whether to use multiprocessing.
            add_license_concluded: Whether to aggregate all found SPDX
                expressions into a concluded license.
        """
        project_report = cls(do_checksum=do_checksum)
        project_report.path = project.root
        project_report.licenses = project.licenses
        project_report._license_map = project.license_map
        project_report.licenses_without_extension = (
            project.licenses_without_extension
        )

        results = _generate_file_reports(
            project,
            do_checksum=do_checksum,
            multiprocessing=multiprocessing,
            add_license_concluded=add_license_concluded,
        )
        for result in results:
            if result.error:
                _process_error(result.error, result.path)
                project_report.read_errors.add(Path(result.path))
                continue
            file_report = cast(FileReport, result.report)
            project_report.file_reports.add(file_report)

        return project_report

    @cached_property
    def used_licenses(self) -> set[str]:
        """Set of license identifiers that are found in file reports."""
        return {
            lic
            for file_report in self.file_reports
            for lic in file_report.licenses_in_file
        }

    @cached_property
    def bad_licenses(self) -> dict[str, Path]:
        """Licenses in LICENSES/ which are not valid SPDX licenses."""
        return {
            lic: path
            for lic, path in self.licenses.items()
            if lic not in self._license_map
        }

    @cached_property
    def deprecated_licenses(self) -> set[str]:
        """Licenses whose SPDX License identifier has been deprecated."""
        return {
            lic
            for lic in self.licenses
            if lic in self._license_map
            and self._license_map[lic]["isDeprecatedLicenseId"]
        }

    @cached_property
    def unused_licenses(self) -> set[str]:
        """Set of license identifiers that are not found in any file report.

        A license also counts as used when only its ``+`` variant appears in
        a file.
        """
        return {
            lic
            for lic in self.licenses
            if not any(
                identifier in self.used_licenses
                for identifier in (lic, _add_plus_to_identifier(lic))
            )
        }

    @cached_property
    def missing_licenses(self) -> dict[str, set[Path]]:
        """Files which refer to a license which do not exist in the LICENSES/
        directory.
        """
        result: dict[str, set[Path]] = defaultdict(set)
        for file_report in self.file_reports:
            for missing_license in file_report.missing_licenses:
                result[missing_license].add(file_report.path)
        return result

    @cached_property
    def invalid_spdx_expressions(self) -> dict[Path, set[str]]:
        """Invalid expressions by file."""
        return {
            file_report.path: file_report.invalid_spdx_expressions
            for file_report in self.file_reports
            if file_report.invalid_spdx_expressions
        }

    @cached_property
    def files_without_licenses(self) -> set[Path]:
        """Set of paths that have no licensing information."""
        return {
            file_report.path
            for file_report in self.file_reports
            if not file_report.licenses_in_file
        }

    @cached_property
    def files_without_copyright(self) -> set[Path]:
        """Set of paths that have no copyright information."""
        return {
            file_report.path
            for file_report in self.file_reports
            if not file_report.copyright
        }

    @cached_property
    def is_compliant(self) -> bool:
        """Whether the report is compliant with the REUSE Spec.

        The report is compliant only when every problem collection checked
        below is empty.
        """
        return not any(
            (
                self.missing_licenses,
                self.unused_licenses,
                self.bad_licenses,
                self.deprecated_licenses,
                self.licenses_without_extension,
                self.read_errors,
                self.invalid_spdx_expressions,
                self.files_without_copyright,
                self.files_without_licenses,
            )
        )

    @property
    def recommendations(self) -> list[str]:
        """Generate help for next steps based on found REUSE issues.

        Returns:
            One message per category of problem that is present; an empty
            list when none of the checked categories has a problem.
        """
        recommendations = []

        # These items should be ordered in the same way as in the summary.
        if self.bad_licenses:
            recommendations.append(
                _(
                    "Fix bad licenses: At least one license in the LICENSES"
                    " directory and/or provided by 'SPDX-License-Identifier'"
                    " tags is invalid. They are either not valid SPDX License"
                    " Identifiers or do not start with 'LicenseRef-'. FAQ about"
                    " custom licenses:"
                    " https://reuse.software/faq/#custom-license"
                )
            )
        if self.deprecated_licenses:
            recommendations.append(
                _(
                    "Fix deprecated licenses: At least one of the licenses in"
                    " the LICENSES directory and/or provided by an"
                    " 'SPDX-License-Identifier' tag or in '.reuse/dep5' has"
                    " been deprecated by SPDX. The current list and their"
                    " respective recommended new identifiers can be found"
                    " here: <https://spdx.org/licenses/#deprecated>"
                )
            )
        if self.licenses_without_extension:
            recommendations.append(
                _(
                    "Fix licenses without file extension: At least one license"
                    " text file in the 'LICENSES' directory does not have a"
                    " '.txt' file extension. Please rename the file(s)"
                    " accordingly."
                )
            )
        if self.missing_licenses:
            recommendations.append(
                _(
                    "Fix missing licenses: For at least one of the license"
                    " identifiers provided by the 'SPDX-License-Identifier'"
                    " tags, there is no corresponding license text file in the"
                    " 'LICENSES' directory. For SPDX license identifiers, you"
                    " can simply run 'reuse download --all' to get any missing"
                    " ones. For custom licenses (starting with 'LicenseRef-'),"
                    " you need to add these files yourself."
                )
            )
        if self.unused_licenses:
            recommendations.append(
                _(
                    "Fix unused licenses: At least one of the license text"
                    " files in 'LICENSES' is not referenced by any file, e.g."
                    " by an 'SPDX-License-Identifier' tag. Please make sure"
                    " that you either tag the accordingly licensed files"
                    " properly, or delete the unused license text if you are"
                    " sure that no file or code snippet is licensed as such."
                )
            )
        if self.read_errors:
            recommendations.append(
                _(
                    "Fix read errors: At least one of the files in your"
                    " directory cannot be read by the tool. Please check the"
                    " file permissions. You will find the affected files at the"
                    " top of the output as part of the logged error messages."
                )
            )
        if self.invalid_spdx_expressions:
            # NOTE(review): "cannot be parse" (sic) is kept verbatim because
            # the string is passed to _(), presumably as a translation lookup
            # key — confirm before correcting the typo.
            recommendations.append(
                _(
                    "Fix invalid SPDX License Expressions: In one or more files"
                    " there are SPDX License Expressions which cannot be"
                    " parse. Check whether the value that follows"
                    " 'SPDX-License-Identifier:' is correct. If the detected"
                    " expression is not meant to be valid, put it between"
                    " 'REUSE-IgnoreStart' and 'REUSE-IgnoreEnd' comments."
                )
            )
        if self.files_without_copyright or self.files_without_licenses:
            recommendations.append(
                _(
                    "Fix missing copyright/licensing information: For one or"
                    " more files, the tool cannot find copyright and/or"
                    " licensing information. You typically do this by adding"
                    " 'SPDX-FileCopyrightText' and 'SPDX-License-Identifier'"
                    " tags to each file. The tutorial explains additional ways"
                    " to do this: <https://reuse.software/tutorial/>"
                )
            )

        return recommendations
class ProjectSubsetReport:
    """Like a :class:`ProjectReport`, but for a subset of the files using a
    subset of features.
    """

    def __init__(self) -> None:
        self.path: StrPath = ""
        self.read_errors: set[Path] = set()
        self.file_reports: set[FileReport] = set()

    @classmethod
    def generate(
        cls,
        project: Project,
        subset_files: Collection[StrPath],
        multiprocessing: bool = _CPU_COUNT > 1,
    ) -> "ProjectSubsetReport":
        """Generate a :class:`ProjectSubsetReport` from a :class:`Project`.

        File reports are generated without real checksums and without a
        concluded license. Files whose report could not be generated are
        logged and recorded in :attr:`read_errors`.

        Args:
            project: The :class:`Project` to lint.
            subset_files: Only lint the files in this list.
            multiprocessing: Whether to use multiprocessing.
        """
        subset_report = cls()
        subset_report.path = project.root
        results = _generate_file_reports(
            project,
            do_checksum=False,
            subset_files=subset_files,
            multiprocessing=multiprocessing,
            add_license_concluded=False,
        )
        for result in results:
            if result.error:
                _process_error(result.error, result.path)
                subset_report.read_errors.add(Path(result.path))
                continue
            file_report = cast(FileReport, result.report)
            subset_report.file_reports.add(file_report)

        return subset_report

    @property
    def missing_licenses(self) -> dict[str, set[Path]]:
        """Files which refer to a license which do not exist in the LICENSES/
        directory.
        """
        result: dict[str, set[Path]] = defaultdict(set)
        for file_report in self.file_reports:
            for missing_license in file_report.missing_licenses:
                result[missing_license].add(file_report.path)
        return result

    @property
    def invalid_spdx_expressions(self) -> dict[Path, set[str]]:
        """Invalid expressions by file."""
        return {
            file_report.path: file_report.invalid_spdx_expressions
            for file_report in self.file_reports
            if file_report.invalid_spdx_expressions
        }

    @property
    def files_without_licenses(self) -> set[Path]:
        """Set of paths that have no licensing information."""
        return {
            file_report.path
            for file_report in self.file_reports
            if not file_report.licenses_in_file
        }

    @property
    def files_without_copyright(self) -> set[Path]:
        """Set of paths that have no copyright information."""
        return {
            file_report.path
            for file_report in self.file_reports
            if not file_report.copyright
        }

    @property
    def is_compliant(self) -> bool:
        """Whether the report subset is compliant with the REUSE Spec.

        The subset is compliant only when every problem collection checked
        below is empty. Invalid SPDX expressions count as a problem here, as
        they do in :attr:`ProjectReport.is_compliant`.
        """
        return not any(
            (
                self.missing_licenses,
                self.invalid_spdx_expressions,
                self.files_without_copyright,
                self.files_without_licenses,
                self.read_errors,
            )
        )
class FileReport:  # pylint: disable=too-many-instance-attributes
    """Object that holds a linting report about a single file.

    Instances are normally created through :meth:`generate`, which fills in
    the attributes initialised here.
    """

    def __init__(self, name: str, path: StrPath, do_checksum: bool = True):
        # Name used in reports; generate() sets it to "./<relative path>".
        self.name = name
        self.path = Path(path)
        self.do_checksum = do_checksum

        # Source of the licensing and copyright information.
        self.reuse_infos: list[ReuseInfo] = []
        self.spdx_id: str | None = None
        self.chk_sum: str | None = None
        # Every license identifier found, in order; may contain duplicates.
        self.licenses_in_file: list[str] = []
        self.license_concluded: str = ""
        # Copyright notices joined by newlines; empty if there are none.
        self.copyright: str = ""
        # Identifiers used by the file that are not in project.licenses.
        self.missing_licenses: set[str] = set()
        self.invalid_spdx_expressions: set[str] = set()

    def to_dict_lint(self) -> dict[str, Any]:
        """Turn the report into a json-like dictionary with exclusively
        information relevant for linting.

        Returns:
            A dictionary with the POSIX form of :attr:`name` under ``path``,
            plus one entry per copyright notice (``copyrights``) and per SPDX
            expression (``spdx_expressions``), each annotated with the source
            it came from.
        """
        return {
            "path": PurePath(self.name).as_posix(),
            "copyrights": [
                {
                    "value": str(line),
                    "source": reuse_info.source_path,
                    "source_type": (
                        reuse_info.source_type.value
                        if reuse_info.source_type
                        else None
                    ),
                }
                for reuse_info in self.reuse_infos
                for line in reuse_info.copyright_notices
            ],
            "spdx_expressions": [
                {
                    "value": str(expression),
                    "is_valid": expression.is_valid,
                    "source": reuse_info.source_path,
                    "source_type": (
                        reuse_info.source_type.value
                        if reuse_info.source_type
                        else None
                    ),
                }
                for reuse_info in self.reuse_infos
                for expression in reuse_info.spdx_expressions
            ],
        }

    @classmethod
    def generate(
        cls,
        project: Project,
        path: StrPath,
        do_checksum: bool = True,
        add_license_concluded: bool = False,
    ) -> "FileReport":
        """Generate a FileReport from a path in a Project.

        Args:
            project: The project that *path* belongs to.
            path: The file to report on.
            do_checksum: Compute the real checksum of the file. If
                :const:`False`, a random 160-bit value is used instead.
            add_license_concluded: Whether to combine the SPDX expressions
                found into a concluded license. If :const:`False`, the
                concluded license is ``NOASSERTION``.

        Raises:
            OSError: If *path* is not a file.
        """
        # pylint: disable=too-many-branches
        path = Path(path)
        if not path.is_file():
            raise OSError(f"{path} is not a file")

        relative = project.relative_from_root(path)
        report = cls(f"./{relative}", path, do_checksum=do_checksum)

        # Checksum and ID
        if report.do_checksum:
            report.chk_sum = _checksum(path)
        else:
            # This path avoids a lot of heavy computation, which is handy for
            # scenarios where you only need a unique hash, not a consistent
            # hash.
            report.chk_sum = f"{random.getrandbits(160):040x}"
        # MD5 is only used to derive an identifier here, not for security.
        # Declaring that keeps it usable on builds that restrict MD5; the
        # resulting digest is unchanged.
        spdx_id = md5(usedforsecurity=False)
        spdx_id.update(report.name.encode("utf-8"))
        spdx_id.update(report.chk_sum.encode("utf-8"))
        report.spdx_id = f"SPDXRef-{spdx_id.hexdigest()}"

        reuse_infos = project.reuse_info_of(path)
        for reuse_info in reuse_infos:
            for expression in reuse_info.spdx_expressions:
                if not expression.is_valid:
                    report.invalid_spdx_expressions.add(str(expression))
                for identifier in expression.licenses:
                    # A license expression akin to Apache-1.0+ should register
                    # correctly if LICENSES/Apache-1.0.txt exists.
                    identifiers = {identifier}
                    if (
                        stripped_identifier := _strip_plus_from_identifier(
                            identifier
                        )
                    ) != identifier:
                        identifiers.add(stripped_identifier)
                    # Missing license
                    if not identifiers.intersection(project.licenses):
                        report.missing_licenses.add(identifier)

                    # Add license to report.
                    report.licenses_in_file.append(identifier)

        if not add_license_concluded:
            report.license_concluded = "NOASSERTION"
        elif not any(reuse_info.spdx_expressions for reuse_info in reuse_infos):
            report.license_concluded = "NONE"
        elif report.invalid_spdx_expressions:
            report.license_concluded = "NOASSERTION"
        else:
            # Merge all the license expressions together, wrapping them in
            # parentheses to make sure an expression doesn't spill into another
            # one. The extra parentheses will be removed by the roundtrip
            # through parse() -> simplify() -> render().
            report.license_concluded = str(
                SpdxExpression.combine(
                    [
                        expression
                        for reuse_info in reuse_infos
                        for expression in reuse_info.spdx_expressions
                    ]
                ).simplify()
            )

        # Copyright text
        report.copyright = "\n".join(
            map(
                str,
                sorted(
                    line
                    for reuse_info in reuse_infos
                    for line in reuse_info.copyright_notices
                ),
            )
        )
        # Source of licensing and copyright info
        report.reuse_infos = reuse_infos
        return report

    def __hash__(self) -> int:
        # Hash on name + checksum once the checksum is known. Equality is not
        # overridden in this class, so it remains the inherited comparison.
        if self.chk_sum is not None:
            return hash(self.name + self.chk_sum)
        return super().__hash__()
# REUSE-IgnoreEnd