summaryrefslogtreecommitdiffstats
path: root/meta/lib/oe/license_finder.py
blob: 16f5d7c94cb6f08f5460b143d3b50a9593553fd4 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
#
# Copyright OpenEmbedded Contributors
#
# SPDX-License-Identifier: GPL-2.0-only
#

import fnmatch
import hashlib
import logging
import os
import re

import bb
import bb.utils

logger = logging.getLogger("BitBake.OE.LicenseFinder")

def _load_hash_csv(d):
    """
    Build a {checksum: license name} dictionary from every
    files/license-hashes.csv found below the directories listed in BBPATH.

    The directories are visited in BBPATH order, so when a checksum is listed
    more than once the entry that is read last wins.
    """
    import csv

    columns = ['md5sum', 'license']
    known_hashes = {}

    for layer_dir in d.getVar('BBPATH').split(':'):
        candidate = os.path.join(layer_dir, 'files', 'license-hashes.csv')
        if not os.path.isfile(candidate):
            continue
        # The column names are passed explicitly, so the first row of the
        # file is read as data rather than as a header
        with open(candidate, newline='') as handle:
            for entry in csv.DictReader(handle, delimiter=',', fieldnames=columns):
                known_hashes[entry['md5sum']] = entry['license']

    return known_hashes


def _crunch_known_licenses(d):
    """
    Calculate the MD5 checksums for the original and "crunched" versions of all
    known licenses.

    The known licenses are the entries of COMMON_LICENSE_DIR and of every
    directory listed (whitespace separated) in LICENSE_PATH. Returns a
    dictionary mapping each checksum to the file name of the license it was
    calculated from.
    """
    md5sums = {}

    lic_dirs = [d.getVar('COMMON_LICENSE_DIR')] + (d.getVar('LICENSE_PATH') or "").split()
    for lic_dir in lic_dirs:
        # os.listdir() returns the entries in arbitrary order. Sort them so
        # that, when two files produce the same checksum, the name that ends
        # up in the table does not depend on the filesystem.
        for fn in sorted(os.listdir(lic_dir)):
            path = os.path.join(lic_dir, fn)
            # Hash the exact contents
            md5sums[bb.utils.md5_file(path)] = fn
            # Also hash a "crunched" version. _crunch_license() returns None
            # when the text cannot be hashed; storing that as a key would let
            # any other unhashable file be looked up as this license.
            crunched_md5 = _crunch_license(path)
            if crunched_md5 is not None:
                md5sums[crunched_md5] = fn

    return md5sums


def _crunch_license(licfile):
    '''
    Return the md5sum of a license file after its non-material text has been
    removed and the remaining text has been normalised.

    Lines that look like boilerplate (copyright statements, titles, URLs and
    the like) are dropped, comment symbols are removed, and whitespace, quote
    characters and brackets are unified. This works well for licenses that
    contain a copyright statement, and also copes with license texts that
    have been reformatted slightly with no material difference.

    Returns the hex digest, or None if the resulting text cannot be encoded
    as UTF-8.
    '''

    import oe.utils

    # Note: these are carefully constructed!
    # A line that matches any one of these patterns is left out of the text
    # that gets hashed.
    boilerplate_res = (
        # Copyright statement
        re.compile(r'^ *[#\*]* *(Modified work |MIT LICENSED )?Copyright ?(\([cC]\))? .*$'),
        # Disclaimer
        re.compile(r'^ *\*? ?All [Rr]ights [Rr]eserved\.$'),
        # Line ending in an email address
        re.compile(r'^.*<[\w\.-]*@[\w\.\-]*>$'),
        # Comment header / separator
        re.compile(r'^(\/\**!?)? ?[\-=\*]* ?(\*\/)?$'),
        # Tag
        re.compile(r'^ *@?\(?([Ll]icense|MIT)\)?$'),
        # URL
        re.compile(r'^ *[#\*]* *https?:\/\/[\w\.\/\-]+$'),
        # License title
        re.compile(r'^#*\(? *(This is )?([Tt]he )?.{0,15} ?[Ll]icen[sc]e( \(.{1,10}\))?\)?[:\.]? ?#*$'),
        # License statement
        re.compile(r'^((This (project|software)|.{1,10}) is( free software)? (released|licen[sc]ed)|(Released|Licen[cs]ed)) under the .{1,10} [Ll]icen[sc]e:?$'),
    )

    # Comment symbols are deleted outright
    comment_symbols = str.maketrans('', '', '*#')
    # Smart quotes, double quotes and backticks become single quotes and
    # curly brackets become square brackets
    unify_chars = str.maketrans({
        u"\u2018": "'",
        u"\u2019": "'",
        u"\u201c": "'",
        u"\u201d": "'",
        '"': "'",
        '`': "'",
        '{': '[',
        '}': ']',
    })

    kept = []
    with open(licfile, 'r', errors='surrogateescape') as f:
        for raw in f:
            # Drop opening statements
            if any(rx.match(raw) for rx in boilerplate_res):
                continue
            text = raw.translate(comment_symbols)
            # Unify spelling
            text = text.replace('sub-license', 'sublicense')
            # Squash spaces
            text = oe.utils.squashspaces(text.strip())
            text = text.translate(unify_chars)
            if text:
                kept.append(text)

    hasher = hashlib.md5()
    try:
        hasher.update(' '.join(kept).encode('utf-8'))
    except UnicodeEncodeError:
        return None
    return hasher.hexdigest()


def find_license_files(srctree, first_only=False):
    """
    Search srctree for files that look like they could be licenses.

    A file is a candidate when its name matches one of the license name
    patterns and does not end in one of the skipped extensions. Returns a
    list of paths, each one built by joining the file name onto the directory
    it was found in. If first_only is True, only the first file found is
    returned (in a one element list).
    """
    licspecs = ['*LICEN[CS]E*', 'COPYING*', '*[Ll]icense*', 'LEGAL*', '[Ll]egal*', '*GPL*', 'README.lic*', 'COPYRIGHT*', '[Cc]opyright*', 'e[dp]l-v10']
    skip_extensions = (".html", ".js", ".json", ".svg", ".ts", ".go", ".sh")
    licfiles = []
    for root, dirs, files in os.walk(srctree):
        # os.walk() lists subdirectories in arbitrary order. Sorting the list
        # in place makes it descend in a fixed order, so the result (and in
        # particular the file picked by first_only) is reproducible.
        dirs.sort()
        # Sort files so that LICENSE is before LICENSE.subcomponent, which is
        # meaningful if first_only is set.
        for fn in sorted(files):
            if fn.endswith(skip_extensions):
                continue
            # Each file is visited once, so testing all patterns in a single
            # step means no duplicate check is needed on the result list
            if any(fnmatch.fnmatch(fn, spec) for spec in licspecs):
                licfiles.append(os.path.join(root, fn))
                if first_only:
                    return licfiles

    return licfiles


def match_licenses(licfiles, srctree, d, extra_hashes={}):
    """
    Identify the license of each file in licfiles from its checksum.

    The checksum table is assembled from _load_hash_csv(d), then
    _crunch_known_licenses(d), then extra_hashes, with later sources
    overriding earlier ones. extra_hashes is only read, never modified.

    Each file is looked up by the MD5 of its exact contents first and, if
    that is not known, by the MD5 of its "crunched" text. Files that match
    neither are reported as 'Unknown' and a hint on how to register them is
    logged.

    Returns a list with one (license, path relative to srctree, md5 of the
    exact contents) tuple per file, in sorted order of licfiles.
    """
    md5sums = {}
    md5sums.update(_load_hash_csv(d))
    md5sums.update(_crunch_known_licenses(d))
    md5sums.update(extra_hashes)

    licenses = []
    for licfile in sorted(licfiles):
        resolved_licfile = d.expand(licfile)
        md5value = bb.utils.md5_file(resolved_licfile)
        license_name = md5sums.get(md5value)
        if not license_name:
            crunched_md5 = _crunch_license(resolved_licfile)
            # _crunch_license() returns None when the text cannot be hashed.
            # Never use that as a lookup key: the table may hold a None key
            # for an unrelated file, which would be a false match.
            if crunched_md5 is not None:
                license_name = md5sums.get(crunched_md5)
        if not license_name:
            license_name = 'Unknown'
            # The path in the hint is relative to the parent of srctree
            logger.info("Please add the following line for '%s' to a 'license-hashes.csv' "
                        "and replace `Unknown` with the license:\n"
                        "%s,Unknown",
                        os.path.relpath(licfile, srctree + "/.."), md5value)

        licenses.append((license_name, os.path.relpath(licfile, srctree), md5value))

    return licenses


def find_licenses(srctree, d, first_only=False, extra_hashes={}):
    """
    Find the files in srctree that look like licenses and identify them.

    The files returned by find_license_files(srctree, first_only) are handed
    to match_licenses() together with srctree, d and extra_hashes, and the
    list that match_licenses() produces is returned unchanged.
    """
    # FIXME should we grab at least one source file with a license header and add that too?
    return match_licenses(find_license_files(srctree, first_only), srctree, d, extra_hashes)