# Source code for reuse.report

# SPDX-FileCopyrightText: 2017 Free Software Foundation Europe e.V. <https://fsfe.org>
# SPDX-FileCopyrightText: 2022 Florian Snow <florian@familysnow.net>
# SPDX-FileCopyrightText: 2022 Pietro Albini <pietro.albini@ferrous-systems.com>
# SPDX-FileCopyrightText: 2023 DB Systel GmbH
# SPDX-FileCopyrightText: 2023 Carmen Bianca BAKKER <carmenbianca@fsfe.org>
# SPDX-FileCopyrightText: 2024 Kerry McAdams <github@klmcadams>
# SPDX-FileCopyrightText: 2024 Sebastien Morais <github@SMoraisAnsys>
# SPDX-FileCopyrightText: 2025 Simon Barth <simon.barth@gmx.de>
#
# SPDX-License-Identifier: GPL-3.0-or-later

"""Module that contains reports about files and projects for linting."""

import bdb
import contextlib
import datetime
import logging
import random
from collections import defaultdict
from collections.abc import Collection, Generator
from concurrent.futures import ProcessPoolExecutor
from functools import cached_property
from hashlib import md5
from io import StringIO
from os import cpu_count
from pathlib import Path, PurePath
from typing import Any, Final, NamedTuple, Optional, Protocol, cast
from uuid import uuid4

from . import __REUSE_version__, __version__
from ._util import (
    _add_plus_to_identifier,
    _checksum,
    _strip_plus_from_identifier,
)
from .copyright import SpdxExpression
from .extract import _LICENSEREF_PATTERN
from .global_licensing import ReuseDep5
from .i18n import _
from .project import Project, ReuseInfo
from .types import StrPath

# Module-level logger, named after this module.
_LOGGER = logging.getLogger(__name__)

# Version of the lint output format. It is emitted under the "lint_version"
# key by ProjectReport.to_dict_lint().
LINT_VERSION = "1.0"

# Number of CPUs as reported by os.cpu_count(), falling back to 1 when that
# call does not return a usable (truthy) value.
_CPU_COUNT: Final[int] = cpu_count() or 1
#: This variable exists to be able to override parallelisation. If set to
#: :const:`False`, :func:`_generate_file_reports` never uses a process pool,
#: regardless of its *multiprocessing* argument.
ENABLE_PARALLEL = True

# REUSE-IgnoreStart


class _MultiprocessingContainer:
    """Callable that stores what is needed to build a :class:`FileReport`.

    An instance keeps the project and the report options, so that it can be
    passed to :func:`map` (or to a process pool's ``map``) and be called with
    nothing but a file path.
    """

    def __init__(
        self, project: Project, do_checksum: bool, add_license_concluded: bool
    ):
        self.do_checksum = do_checksum
        self.add_license_concluded = add_license_concluded
        # Filled in lazily by __call__.
        self.reuse_dep5: ReuseDep5 | None = None

        if not isinstance(project.global_licensing, ReuseDep5):
            # Nothing special to do; the project is used as-is.
            self.has_dep5 = False
            self.project = project
            return

        # Remember that a dep5_copyright was (or was not) set prior.
        self.has_dep5 = bool(project.global_licensing)
        # TODO: We create a copy of the project in the following
        # song-and-dance because the debian Copyright object cannot be
        # pickled.
        clone = Project(
            project.root,
            vcs_strategy=project.vcs_strategy,
            license_map=project.license_map,
            licenses=project.licenses.copy(),
            # TODO: adjust this method/class to account for REUSE.toml as
            # well. Unset dep5_copyright
            global_licensing=None,
            include_submodules=project.include_submodules,
            include_meson_subprojects=project.include_meson_subprojects,
        )
        clone.licenses_without_extension = project.licenses_without_extension
        self.project = clone

    def __call__(self, file_: StrPath) -> "_MultiprocessingResult":
        """Generate the report for *file_*.

        Any :class:`Exception` raised while generating the report is caught
        and returned inside the result instead of being propagated.
        """
        # By remembering that we've parsed the .reuse/dep5, we only parse it
        # once (the first time) inside of each process.
        needs_dep5 = self.has_dep5 and not self.reuse_dep5
        if needs_dep5:
            # Best effort: a failure to parse is silently ignored here.
            with contextlib.suppress(Exception):
                self.reuse_dep5 = ReuseDep5.from_file(
                    self.project.root / ".reuse/dep5"
                )
                self.project.global_licensing = self.reuse_dep5

        # pylint: disable=broad-except
        try:
            report = FileReport.generate(
                self.project,
                file_,
                do_checksum=self.do_checksum,
                add_license_concluded=self.add_license_concluded,
            )
        except Exception as exc:
            return _MultiprocessingResult(file_, None, exc)
        return _MultiprocessingResult(file_, report, None)


class _MultiprocessingResult(NamedTuple):
    """Result of calling a :class:`_MultiprocessingContainer` on one file.

    Either *report* or *error* is set: the container returns the generated
    report with ``error=None``, or ``report=None`` together with the
    exception that was raised.
    """

    # The path the container was called with.
    path: StrPath
    # The generated report, or None when generating it raised an exception.
    report: Optional["FileReport"]
    # The exception raised while generating the report, or None on success.
    error: Exception | None


def _generate_file_reports(
    project: Project,
    do_checksum: bool = True,
    subset_files: Collection[StrPath] | None = None,
    multiprocessing: bool = _CPU_COUNT > 1,
    add_license_concluded: bool = False,
) -> Generator[_MultiprocessingResult, None, None]:
    """Yield a :class:`_MultiprocessingResult` for every file in the project,
    filtered by *subset_files*.

    Args:
        project: The project whose files are reported on.
        do_checksum: Passed on to :meth:`FileReport.generate`.
        subset_files: If not :const:`None`, only files selected by
            ``project.subset_files(subset_files)`` are processed; otherwise
            ``project.all_files()`` is used.
        multiprocessing: Whether to use a process pool. Only honoured while
            :data:`ENABLE_PARALLEL` is true.
        add_license_concluded: Passed on to :meth:`FileReport.generate`.
    """
    worker = _MultiprocessingContainer(
        project, do_checksum, add_license_concluded
    )

    if subset_files is None:
        candidates = project.all_files()
    else:
        candidates = project.subset_files(subset_files)

    # Sequential path: feed the files to the worker one by one.
    if not (multiprocessing and ENABLE_PARALLEL):
        yield from map(worker, candidates)
        return

    # Parallel path: the files are collected up front (as a frozenset, so
    # duplicates collapse) because the chunk size depends on their number.
    unique_files = frozenset(candidates)
    chunk = max(1, int(len(unique_files) / _CPU_COUNT / 4))
    with ProcessPoolExecutor() as pool:
        yield from pool.map(worker, unique_files, chunksize=chunk)


def _process_error(error: Exception, path: StrPath) -> None:
    """Log *error*, which occurred while handling the file at *path*.

    Raises:
        bdb.BdbQuit, KeyboardInterrupt: *error* itself is re-raised when it is
            one of these, instead of being logged.
    """
    # Facilitate better debugging by being able to quit the program.
    if isinstance(error, (bdb.BdbQuit, KeyboardInterrupt)):
        raise error

    # I/O and decoding problems get a "could not read" message; anything else
    # is reported as unexpected.
    if isinstance(error, (OSError, UnicodeError)):
        message = _("Could not read '{path}'").format(path=path)
    else:
        message = _("Unexpected error occurred while parsing '{path}'").format(
            path=path
        )
    _LOGGER.error(message, exc_info=error)


class ProjectReportSubsetProtocol(Protocol):
    """:class:`Protocol` describing the part of :class:`ProjectReport` that
    is also implemented by :class:`ProjectSubsetReport`.
    """

    path: StrPath
    read_errors: set[Path]
    file_reports: set["FileReport"]

    @property
    def missing_licenses(self) -> dict[str, set[Path]]:
        """Referenced licenses that do not exist in the LICENSES/ directory,
        mapped to the files that refer to them.
        """

    @property
    def invalid_spdx_expressions(self) -> dict[Path, set[str]]:
        """Invalid expressions, keyed by the file they were found in."""

    @property
    def files_without_licenses(self) -> set[Path]:
        """Paths of the files that have no licensing information."""

    @property
    def files_without_copyright(self) -> set[Path]:
        """Paths of the files that have no copyright information."""

    @property
    def is_compliant(self) -> bool:
        """Whether the report subset is compliant with the REUSE Spec."""
class ProjectReport:
    """Linting report about a whole project.

    Instances are normally created with :meth:`generate`. Most aggregate
    properties are derived from :attr:`file_reports` and :attr:`licenses` on
    first access and cached afterwards.
    """

    def __init__(self, do_checksum: bool = True):
        self.do_checksum = do_checksum
        self.path: StrPath = ""
        self.licenses: dict[str, Path] = {}
        self.licenses_without_extension: dict[str, Path] = {}
        self._license_map: dict[str, dict] = {}
        self.read_errors: set[Path] = set()
        self.file_reports: set[FileReport] = set()

    def to_dict_lint(self) -> dict[str, Any]:
        """Collect the data relevant to linting in a dictionary.

        The three version keys come first, followed by the remaining keys in
        alphabetical order (``files``, ``non_compliant``, ``summary``), and
        ``recommendations`` comes last.

        Returns:
            Dictionary containing data from the ProjectReport object.
        """
        non_compliant: dict[str, Any] = {
            "bad_licenses": sorted(self.bad_licenses),
            "deprecated_licenses": sorted(
                map(str, self.deprecated_licenses)
            ),
            "licenses_without_extension": sorted(
                self.licenses_without_extension
            ),
            "missing_licenses": sorted(self.missing_licenses),
            "unused_licenses": sorted(map(str, self.unused_licenses)),
            "read_errors": sorted(map(str, self.read_errors)),
            "missing_copyright_info": sorted(
                map(str, self.files_without_copyright)
            ),
            "missing_licensing_info": sorted(
                map(str, self.files_without_licenses)
            ),
        }

        files = [entry.to_dict_lint() for entry in self.file_reports]

        total = len(self.file_reports)
        summary: dict[str, Any] = {
            "used_licenses": sorted(self.used_licenses),
            "files_total": total,
            "files_with_copyright_info": total
            - len(self.files_without_copyright),
            "files_with_licensing_info": total
            - len(self.files_without_licenses),
            "compliant": self.is_compliant,
        }

        # Written out directly in the final key order.
        return {
            "lint_version": LINT_VERSION,
            "reuse_spec_version": __REUSE_version__,
            "reuse_tool_version": __version__,
            "files": files,
            "non_compliant": non_compliant,
            "summary": summary,
            "recommendations": self.recommendations,
        }

    def bill_of_materials(
        self,
        creator_person: str | None = None,
        creator_organization: str | None = None,
    ) -> str:
        """Generate a bill of materials from the project.

        See https://spdx.org/specifications.

        Args:
            creator_person: Value for the ``Creator: Person`` tag; rendered
                through :func:`format_creator`.
            creator_organization: Value for the ``Creator: Organization``
                tag; rendered through :func:`format_creator`.

        Returns:
            The complete document as a single string.
        """
        root = Path(self.path)
        out = StringIO()

        # Write mandatory tags
        out.writelines(
            (
                "SPDXVersion: SPDX-2.1\n",
                "DataLicense: CC0-1.0\n",
                "SPDXID: SPDXRef-DOCUMENT\n",
                f"DocumentName: {root.resolve().name}\n",
            )
        )
        # TODO: Generate UUID from git revision maybe
        # TODO: Fix the URL
        namespace = f"http://spdx.org/spdxdocs/spdx-v2.1-{uuid4()}"
        out.write(f"DocumentNamespace: {namespace}\n")

        # Author
        person = format_creator(creator_person)
        organization = format_creator(creator_organization)
        out.write(f"Creator: Person: {person}\n")
        out.write(f"Creator: Organization: {organization}\n")
        out.write(f"Creator: Tool: reuse-{__version__}\n")

        timestamp = datetime.datetime.now(tz=datetime.timezone.utc).strftime(
            "%Y-%m-%dT%H:%M:%SZ"
        )
        out.write(f"Created: {timestamp}\n")
        out.write(
            "CreatorComment: <text>This document was created automatically"
            " using available reuse information consistent with"
            " REUSE.</text>\n"
        )

        # All relationships first, then one section per file, both sorted by
        # file name.
        ordered = sorted(self.file_reports, key=lambda item: item.name)
        out.writelines(
            f"Relationship: SPDXRef-DOCUMENT DESCRIBES {entry.spdx_id}\n"
            for entry in ordered
        )

        for entry in ordered:
            out.writelines(
                (
                    "\n",
                    f"FileName: {entry.name}\n",
                    f"SPDXID: {entry.spdx_id}\n",
                    f"FileChecksum: SHA1: {entry.chk_sum}\n",
                    f"LicenseConcluded: {entry.license_concluded}\n",
                )
            )
            out.writelines(
                f"LicenseInfoInFile: {lic}\n"
                for lic in sorted(entry.licenses_in_file)
            )
            if entry.copyright:
                out.write(
                    f"FileCopyrightText: <text>{entry.copyright}</text>\n"
                )
            else:
                out.write("FileCopyrightText: NONE\n")

        # Licenses
        for lic, lic_path in sorted(self.licenses.items()):
            # Only identifiers matching the LicenseRef pattern get a section.
            if not _LICENSEREF_PATTERN.match(lic):
                continue
            out.write("\n")
            out.write(f"LicenseID: {lic}\n")
            out.write("LicenseName: NOASSERTION\n")
            with (root / lic_path).open(encoding="utf-8") as handle:
                out.write(f"ExtractedText: <text>{handle.read()}</text>\n")

        return out.getvalue()

    @classmethod
    def generate(
        cls,
        project: Project,
        do_checksum: bool = True,
        multiprocessing: bool = _CPU_COUNT > 1,
        add_license_concluded: bool = False,
    ) -> "ProjectReport":
        """Generate a :class:`ProjectReport` from a :class:`Project`.

        Files whose report could not be generated are logged and recorded in
        :attr:`read_errors` instead of :attr:`file_reports`.

        Args:
            project: The :class:`Project` to lint.
            do_checksum: Generate a checksum of every file. If this is
                :const:`False`, generate a random checksum for every file.
            multiprocessing: Whether to use multiprocessing.
            add_license_concluded: Whether to aggregate all found SPDX
                expressions into a concluded license.
        """
        report = cls(do_checksum=do_checksum)
        report.path = project.root
        report.licenses = project.licenses
        report._license_map = project.license_map
        report.licenses_without_extension = project.licenses_without_extension

        outcomes = _generate_file_reports(
            project,
            do_checksum=do_checksum,
            multiprocessing=multiprocessing,
            add_license_concluded=add_license_concluded,
        )
        for outcome in outcomes:
            if outcome.error:
                _process_error(outcome.error, outcome.path)
                report.read_errors.add(Path(outcome.path))
            else:
                report.file_reports.add(cast(FileReport, outcome.report))

        return report

    @cached_property
    def used_licenses(self) -> set[str]:
        """Set of license identifiers that are found in file reports."""
        found: set[str] = set()
        for file_report in self.file_reports:
            found.update(file_report.licenses_in_file)
        return found

    @cached_property
    def bad_licenses(self) -> dict[str, Path]:
        """Licenses in LICENSES/ which are not valid SPDX licenses, i.e. whose
        identifier is not in the license map.
        """
        known = self._license_map
        return {
            identifier: location
            for identifier, location in self.licenses.items()
            if identifier not in known
        }

    @cached_property
    def deprecated_licenses(self) -> set[str]:
        """Licenses whose SPDX License identifier has been deprecated."""
        deprecated: set[str] = set()
        for identifier in self.licenses:
            if (
                identifier in self._license_map
                and self._license_map[identifier]["isDeprecatedLicenseId"]
            ):
                deprecated.add(identifier)
        return deprecated

    @cached_property
    def unused_licenses(self) -> set[str]:
        """Set of license identifiers that are not found in any file report.

        A license also counts as used when only its "or later" variant (the
        identifier as returned by ``_add_plus_to_identifier``) is used.
        """
        return {
            identifier
            for identifier in self.licenses
            if identifier not in self.used_licenses
            and _add_plus_to_identifier(identifier) not in self.used_licenses
        }

    @cached_property
    def missing_licenses(self) -> dict[str, set[Path]]:
        """Files which refer to a license which do not exist in the LICENSES/
        directory, grouped by the missing license.
        """
        by_license: defaultdict[str, set[Path]] = defaultdict(set)
        for file_report in self.file_reports:
            for identifier in file_report.missing_licenses:
                by_license[identifier].add(file_report.path)
        return by_license

    @cached_property
    def invalid_spdx_expressions(self) -> dict[Path, set[str]]:
        """Invalid expressions by file."""
        by_file: dict[Path, set[str]] = {}
        for file_report in self.file_reports:
            if file_report.invalid_spdx_expressions:
                by_file[file_report.path] = (
                    file_report.invalid_spdx_expressions
                )
        return by_file

    @cached_property
    def files_without_licenses(self) -> set[Path]:
        """Set of paths that have no licensing information."""
        unlicensed: set[Path] = set()
        for file_report in self.file_reports:
            if not file_report.licenses_in_file:
                unlicensed.add(file_report.path)
        return unlicensed

    @cached_property
    def files_without_copyright(self) -> set[Path]:
        """Set of paths that have no copyright information."""
        uncredited: set[Path] = set()
        for file_report in self.file_reports:
            if not file_report.copyright:
                uncredited.add(file_report.path)
        return uncredited

    @cached_property
    def is_compliant(self) -> bool:
        """Whether the report is compliant with the REUSE Spec."""
        # Every one of these collections must be empty.
        problems = (
            self.missing_licenses,
            self.unused_licenses,
            self.bad_licenses,
            self.deprecated_licenses,
            self.licenses_without_extension,
            self.read_errors,
            self.invalid_spdx_expressions,
            self.files_without_copyright,
            self.files_without_licenses,
        )
        return not any(problems)

    @property
    def recommendations(self) -> list[str]:
        """Generate help for next steps based on found REUSE issues"""
        advice: list[str] = []

        # These items should be ordered in the same way as in the summary.
        if self.bad_licenses:
            advice.append(
                _(
                    "Fix bad licenses: At least one license in the LICENSES"
                    " directory and/or provided by 'SPDX-License-Identifier'"
                    " tags is invalid. They are either not valid SPDX License"
                    " Identifiers or do not start with 'LicenseRef-'. FAQ about"
                    " custom licenses:"
                    " https://reuse.software/faq/#custom-license"
                )
            )
        if self.deprecated_licenses:
            advice.append(
                _(
                    "Fix deprecated licenses: At least one of the licenses in"
                    " the LICENSES directory and/or provided by an"
                    " 'SPDX-License-Identifier' tag or in '.reuse/dep5' has"
                    " been deprecated by SPDX. The current list and their"
                    " respective recommended new identifiers can be found"
                    " here: <https://spdx.org/licenses/#deprecated>"
                )
            )
        if self.licenses_without_extension:
            advice.append(
                _(
                    "Fix licenses without file extension: At least one license"
                    " text file in the 'LICENSES' directory does not have a"
                    " '.txt' file extension. Please rename the file(s)"
                    " accordingly."
                )
            )
        if self.missing_licenses:
            advice.append(
                _(
                    "Fix missing licenses: For at least one of the license"
                    " identifiers provided by the 'SPDX-License-Identifier'"
                    " tags, there is no corresponding license text file in the"
                    " 'LICENSES' directory. For SPDX license identifiers, you"
                    " can simply run 'reuse download --all' to get any missing"
                    " ones. For custom licenses (starting with 'LicenseRef-'),"
                    " you need to add these files yourself."
                )
            )
        if self.unused_licenses:
            advice.append(
                _(
                    "Fix unused licenses: At least one of the license text"
                    " files in 'LICENSES' is not referenced by any file, e.g."
                    " by an 'SPDX-License-Identifier' tag. Please make sure"
                    " that you either tag the accordingly licensed files"
                    " properly, or delete the unused license text if you are"
                    " sure that no file or code snippet is licensed as such."
                )
            )
        if self.read_errors:
            advice.append(
                _(
                    "Fix read errors: At least one of the files in your"
                    " directory cannot be read by the tool. Please check the"
                    " file permissions. You will find the affected files at the"
                    " top of the output as part of the logged error messages."
                )
            )
        if self.invalid_spdx_expressions:
            # NOTE(review): "cannot be parse" below looks like a typo for
            # "cannot be parsed". The text is a translation message id, so it
            # is left untouched here; fix it together with the catalogues.
            advice.append(
                _(
                    "Fix invalid SPDX License Expressions: In one or more files"
                    " there are SPDX License Expressions which cannot be"
                    " parse. Check whether the value that follows"
                    " 'SPDX-License-Identifier:' is correct. If the detected"
                    " expression is not meant to be valid, put it between"
                    " 'REUSE-IgnoreStart' and 'REUSE-IgnoreEnd' comments."
                )
            )
        if self.files_without_copyright or self.files_without_licenses:
            advice.append(
                _(
                    "Fix missing copyright/licensing information: For one or"
                    " more files, the tool cannot find copyright and/or"
                    " licensing information. You typically do this by adding"
                    " 'SPDX-FileCopyrightText' and 'SPDX-License-Identifier'"
                    " tags to each file. The tutorial explains additional ways"
                    " to do this: <https://reuse.software/tutorial/>"
                )
            )

        return advice
class ProjectSubsetReport:
    """Like a :class:`ProjectReport`, but for a subset of the files using a
    subset of features.

    Unlike on :class:`ProjectReport`, the aggregate properties here are plain
    properties and are recomputed on every access.
    """

    def __init__(self) -> None:
        self.path: StrPath = ""
        self.file_reports: set[FileReport] = set()
        self.read_errors: set[Path] = set()

    @classmethod
    def generate(
        cls,
        project: Project,
        subset_files: Collection[StrPath],
        multiprocessing: bool = _CPU_COUNT > 1,
    ) -> "ProjectSubsetReport":
        """Generate a :class:`ProjectSubsetReport` from a :class:`Project`.

        File reports are generated without real checksums and without a
        concluded license. Files whose report could not be generated are
        logged and recorded in :attr:`read_errors`.

        Args:
            project: The :class:`Project` to lint.
            subset_files: Only lint the files in this list.
            multiprocessing: Whether to use multiprocessing.
        """
        report = cls()
        report.path = project.root

        outcomes = _generate_file_reports(
            project,
            do_checksum=False,
            subset_files=subset_files,
            multiprocessing=multiprocessing,
            add_license_concluded=False,
        )
        for outcome in outcomes:
            if outcome.error:
                _process_error(outcome.error, outcome.path)
                report.read_errors.add(Path(outcome.path))
            else:
                report.file_reports.add(cast(FileReport, outcome.report))

        return report

    @property
    def missing_licenses(self) -> dict[str, set[Path]]:
        """Files which refer to a license which do not exist in the LICENSES/
        directory, grouped by the missing license.
        """
        by_license: defaultdict[str, set[Path]] = defaultdict(set)
        for file_report in self.file_reports:
            for identifier in file_report.missing_licenses:
                by_license[identifier].add(file_report.path)
        return by_license

    @property
    def invalid_spdx_expressions(self) -> dict[Path, set[str]]:
        """Invalid expressions by file."""
        by_file: dict[Path, set[str]] = {}
        for file_report in self.file_reports:
            if file_report.invalid_spdx_expressions:
                by_file[file_report.path] = (
                    file_report.invalid_spdx_expressions
                )
        return by_file

    @property
    def files_without_licenses(self) -> set[Path]:
        """Set of paths that have no licensing information."""
        unlicensed: set[Path] = set()
        for file_report in self.file_reports:
            if not file_report.licenses_in_file:
                unlicensed.add(file_report.path)
        return unlicensed

    @property
    def files_without_copyright(self) -> set[Path]:
        """Set of paths that have no copyright information."""
        uncredited: set[Path] = set()
        for file_report in self.file_reports:
            if not file_report.copyright:
                uncredited.add(file_report.path)
        return uncredited

    @property
    def is_compliant(self) -> bool:
        """Whether the report subset is compliant with the REUSE Spec."""
        # NOTE(review): unlike ProjectReport.is_compliant, this does not take
        # invalid_spdx_expressions into account -- confirm this is intended.
        problems = (
            self.missing_licenses,
            self.files_without_copyright,
            self.files_without_licenses,
            self.read_errors,
        )
        return not any(problems)
class FileReport:  # pylint: disable=too-many-instance-attributes
    """Object that holds a linting report about a single file."""

    def __init__(self, name: str, path: StrPath, do_checksum: bool = True):
        # Name of the file as used in the report; generate() sets it to the
        # path relative to the project root, prefixed with "./".
        self.name = name
        self.path = Path(path)
        self.do_checksum = do_checksum

        # Source of the licensing and copyright information.
        self.reuse_infos: list[ReuseInfo] = []
        # "SPDXRef-<hex digest>", derived from name and checksum.
        self.spdx_id: str | None = None
        # Checksum of the file, or a random stand-in if do_checksum is false.
        self.chk_sum: str | None = None
        # Every license identifier found for the file (may contain
        # duplicates).
        self.licenses_in_file: list[str] = []
        self.license_concluded: str = ""
        # All copyright notices, sorted and joined by newlines.
        self.copyright: str = ""
        # Identifiers for which the project has no matching license.
        self.missing_licenses: set[str] = set()
        self.invalid_spdx_expressions: set[str] = set()

    def to_dict_lint(self) -> dict[str, Any]:
        """Turn the report into a json-like dictionary with exclusively
        information relevant for linting.

        Returns:
            A dictionary with the keys ``path`` (POSIX form of the report
            name), ``copyrights`` and ``spdx_expressions``. The latter two
            hold one entry per notice/expression, annotated with the source
            it was found in.
        """
        return {
            "path": PurePath(self.name).as_posix(),
            "copyrights": [
                {
                    "value": str(line),
                    "source": reuse_info.source_path,
                    "source_type": (
                        reuse_info.source_type.value
                        if reuse_info.source_type
                        else None
                    ),
                }
                for reuse_info in self.reuse_infos
                for line in reuse_info.copyright_notices
            ],
            "spdx_expressions": [
                {
                    "value": str(expression),
                    "is_valid": expression.is_valid,
                    "source": reuse_info.source_path,
                    "source_type": (
                        reuse_info.source_type.value
                        if reuse_info.source_type
                        else None
                    ),
                }
                for reuse_info in self.reuse_infos
                for expression in reuse_info.spdx_expressions
            ],
        }

    @classmethod
    def generate(
        cls,
        project: Project,
        path: StrPath,
        do_checksum: bool = True,
        add_license_concluded: bool = False,
    ) -> "FileReport":
        """Generate a FileReport from a path in a Project.

        Args:
            project: The project that *path* belongs to.
            path: The file to report on.
            do_checksum: Compute the real checksum of the file. If
                :const:`False`, a random value is used instead.
            add_license_concluded: Combine all valid SPDX expressions into
                :attr:`license_concluded`. If :const:`False`, the concluded
                license is always ``NOASSERTION``.

        Raises:
            OSError: If *path* is not a file.
        """
        # pylint: disable=too-many-branches
        path = Path(path)
        if not path.is_file():
            raise OSError(f"{path} is not a file")

        relative = project.relative_from_root(path)
        report = cls(f"./{relative}", path, do_checksum=do_checksum)

        # Checksum and ID
        if report.do_checksum:
            report.chk_sum = _checksum(path)
        else:
            # This path avoids a lot of heavy computation, which is handy for
            # scenarios where you only need a unique hash, not a consistent
            # hash.
            report.chk_sum = f"{random.getrandbits(160):040x}"
        # MD5 only serves to derive an identifier here, not to protect
        # anything. Saying so keeps this working on Python builds that
        # restrict MD5 (e.g. FIPS mode), where a plain md5() raises
        # ValueError. The resulting digest is unchanged.
        spdx_id = md5(usedforsecurity=False)
        spdx_id.update(report.name.encode("utf-8"))
        spdx_id.update(report.chk_sum.encode("utf-8"))
        report.spdx_id = f"SPDXRef-{spdx_id.hexdigest()}"

        reuse_infos = project.reuse_info_of(path)
        for reuse_info in reuse_infos:
            for expression in reuse_info.spdx_expressions:
                if not expression.is_valid:
                    report.invalid_spdx_expressions.add(str(expression))
                for identifier in expression.licenses:
                    # A license expression akin to Apache-1.0+ should register
                    # correctly if LICENSES/Apache-1.0.txt exists.
                    identifiers = {identifier}
                    if (
                        plus_identifier := _strip_plus_from_identifier(
                            identifier
                        )
                    ) != identifier:
                        identifiers.add(plus_identifier)
                    # Missing license
                    if not identifiers.intersection(project.licenses):
                        report.missing_licenses.add(identifier)

                    # Add license to report.
                    report.licenses_in_file.append(identifier)

        if not add_license_concluded:
            report.license_concluded = "NOASSERTION"
        elif not any(reuse_info.spdx_expressions for reuse_info in reuse_infos):
            report.license_concluded = "NONE"
        elif report.invalid_spdx_expressions:
            report.license_concluded = "NOASSERTION"
        else:
            # Merge all the license expressions together, wrapping them in
            # parentheses to make sure an expression doesn't spill into another
            # one. The extra parentheses will be removed by the roundtrip
            # through parse() -> simplify() -> render().
            report.license_concluded = str(
                SpdxExpression.combine(
                    [
                        expression
                        for reuse_info in reuse_infos
                        for expression in reuse_info.spdx_expressions
                    ]
                ).simplify()
            )

        # Copyright text
        report.copyright = "\n".join(
            map(
                str,
                sorted(
                    line
                    for reuse_info in reuse_infos
                    for line in reuse_info.copyright_notices
                ),
            )
        )
        # Source of licensing and copyright info
        report.reuse_infos = reuse_infos
        return report

    def __hash__(self) -> int:
        # Reports with a checksum hash on name + checksum; without one, the
        # default object hash is used.
        if self.chk_sum is not None:
            return hash(self.name + self.chk_sum)
        return super().__hash__()
[docs] def format_creator(creator: str | None) -> str: """Render the creator field based on the provided flag""" if creator is None: return "Anonymous ()" if "(" in creator and creator.endswith(")"): # The creator field already contains an email address return creator return creator + " ()"
# REUSE-IgnoreEnd