# cruft -- lintian check script -*- perl -*- # # based on debhelper check, # Copyright © 1999 Joey Hess # Copyright © 2000 Sean 'Shaleh' Perry # Copyright © 2002 Josip Rodin # Copyright © 2007 Russ Allbery # Copyright © 2013-2018 Bastien ROUCARIÈS # Copyright © 2017-2020 Chris Lamb # Copyright © 2020-2021 Felix Lechner # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, you can find it on the World Wide # Web at http://www.gnu.org/copyleft/gpl.html, or write to the Free # Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, # MA 02110-1301, USA. package Lintian::Check::Cruft; use v5.20; use warnings; use utf8; use Const::Fast; use File::Basename qw(basename); use List::SomeUtils qw(any none first_value); use List::UtilsBy qw(max_by); use Path::Tiny; use Unicode::UTF8 qw(encode_utf8); use Lintian::Relation; use Lintian::Util qw(normalize_pkg_path); use Moo; use namespace::clean; with 'Lintian::Check'; # Half of the size used in the "sliding window" for detecting bad # licenses like GFDL with invariant sections. # NB: Keep in sync cruft-gfdl-fp-sliding-win/pre_build. # not less than 8192 for source missing const my $LARGE_BLOCK_SIZE => 16_384; const my $SMALL_BLOCK_SIZE => 8_192; # very long line lengths const my $VERY_LONG_LINE_LENGTH => 512; const my $SAFE_LINE_LENGTH => 256; const my $EMPTY => q{}; const my $ASTERISK => q{*}; const my $DOLLAR => q{$}; const my $DOT => q{.}; const my $DOUBLE_DOT => q{..}; const my $LICENSE_CHECK_DATA_FIELDS => 5; const my $ITEM_NOT_FOUND => -1; const my $SKIP_HTML => -1; # prebuilt-file or forbidden file type has RFC_WHITELIST => ( is => 'rw', lazy => 1, default => sub { my ($self) = @_; return $self->profile->load_data( 'cruft/rfc-whitelist', qr/\s*\~\~\s*/, sub { return qr/$_[0]/xms; }); }); # get browserified regexp has BROWSERIFY_REGEX => ( is => 'rw', lazy => 1, default => sub { my ($self) = @_; return $self->profile->load_data( 'cruft/browserify-regex', qr/\s*\~\~\s*/, sub { return qr/$_[1]/xms; }); }); my %NVIDIA_LICENSE = ( keywords => [qw{license intellectual retain property}], sentences =>[ 'retain all intellectual property and proprietary rights in and to this software and related documentation' ]); my %NON_FREE_LICENSES = ( # first field is tag # second field is a list of keywords in lower case # third field are lower case sentences to match the license. Notes that space are normalized before and formatting removed # fourth field is a regex to use to match the license, use lower case and [ ] for space. # 5th field is a function to call if the field 2th to 5th match. # (see dispatch table %LICENSE_CHECK_DISPATCH_TABLE # json license 'license-problem-json-evil' => { keywords => [qw{software evil good}], sentences => ['software shall be used for good'], regex => qr{software [ ] shall [ ] be [ ] used [ ] for [ ] good [ ]? ,? [ ]? not [ ] evil}msx }, # non free RFC old version 'license-problem-non-free-RFC' => { keywords => [qw{document purpose translate language}], sentences => ['this document itself may not be modified in any way'], regex => qr/this [ ] document [ ] itself [ ] may [ ] not [ ] be [ ] modified [ ] in [ ] any [ ] way [ ]?, [ ]? such [ ] as [ ] by [ ] removing [ ] the [ ] copyright [ ] notice [ ] or [ ] references [ ] to [ ] .{0,256} [ ]? except [ ] as [ ] needed [ ] for [ ] the [ ] purpose [ ] of [ ] developing [ ] .{0,128} [ ]? in [ ] which [ ] case [ ] the [ ] procedures [ ] for [ ] copyrights [ ] defined [ ] in [ ] the [ ] .{0,128} [ ]? process [ ] must [ ] be [ ] followed[ ]?,[ ]? or [ ] as [ ] required [ ] to [ ] translate [ ] it [ ] into [ ] languages [ ]/msx, callsub => 'rfc_whitelist_filename' }, 'license-problem-non-free-RFC-BCP78' => { keywords => [qw{license document bcp restriction}], sentences => ['bcp 78'], regex => qr{this [ ] document [ ] is [ ] subject [ ] to [ ] (?:the [ ] rights [ ]?, [ ] licenses [ ] and [ ]restrictions [ ] contained [ ] in [ ])? bcp [ ] 78}msx, callsub => 'rfc_whitelist_filename' }, # check GFDL block - The ".{0,1024}"-part in the regex # will contain the "no invariants etc." part if # it is a good use of the license. We include it # here to ensure that we do not emit a false positive # if the "redeeming" part is in the next block # keyword document is here in order to benefit for other license keyword and a shortcut for documentation 'license-problem-gfdl-invariants' => { keywords => [qw{license document gnu copy documentation}], sentences => ['gnu free documentation license'], regex => qr/(?'rawcontextbefore'(?:(?:(?!a [ ] copy [ ] of [ ] the [ ] license [ ] is).){1024}|\A(?:(?!a [ ] copy [ ] of [ ] the [ ] license [ ] is).){0,1024}|(?:[ ] copy [ ] of [ ] the [ ] license [ ] is.{0,1024}?))) gnu [ ] free [ ] documentation [ ] license (?'rawgfdlsections'(?:(?!gnu [ ] free [ ] documentation [ ] license).){0,1024}?) (?:a [ ] copy [ ] of [ ] the [ ] license [ ] is|this [ ] document [ ] is [ ] distributed)/msx, callsub => 'check_gfdl_license_problem' }, # php license 'license-problem-php-license' => { keywords => [qw{www.php.net group\@php.net phpfoo conjunction php}], sentences => ['this product includes php'], regex => qr{php [ ] license [ ]?[,;][ ]? version [ ] 3(?:\.\d+)?}msx, callsub => 'php_source_whitelist' }, 'license-problem-bad-php-license' => { keywords => [qw{www.php.net add-on conjunction}], sentences => ['this product includes php'], regex => qr{php [ ] license [ ]?[,;][ ]? version [ ] 2(?:\.\d+)?}msx, callsub => 'php_source_whitelist' }, # cc by nc sa note that " is replaced by [ ] 'license-problem-cc-by-nc-sa' => { keywords => [qw{license by-nc-sa creativecommons.org}], sentences => [ '://creativecommons.org/licenses/by-nc-sa', 'under attribution-noncommercial' ], regex => qr{(?:license [ ] rdf:[^=:]+=[ ]* (?:ht|f)tps?://(?:[^/.]\.)??creativecommons\.org/licenses/by-nc-sa/\d+(?:\.\d+)?(?:/[[:alpha:]]+)?/? [ ]* >|available [ ] under [ ] attribution-noncommercial)}msx }, # not really a license but warn it: visual c++ generated file 'source-contains-autogenerated-visual-c++-file' => { keywords => [qw{microsoft visual generated}], sentences => ['microsoft visual c++ generated'], regex => qr{microsoft [ ] visual [ ] c[+][+] [ ] generated (?![ ] by [ ] freeze\.py)}msx }, # not really a license but warn about it: gperf generated file 'source-contains-autogenerated-gperf-data' => { keywords => [qw{code produced gperf version}], sentences => ['code produced by gperf version'], regex => qr{code [ ] produced [ ] by [ ] gperf [ ] version [ ] \d+\.\d+}msx }, # warn about copy of ieee-data 'source-contains-data-from-ieee-data-oui-db' => { keywords => [qw{struck scitex racore}], sentences => ['dr. b. struck'], regex => qr{dr. [ ] b. [ ] struck}msx }, # warn about unicode license for utf for convert utf 'license-problem-convert-utf-code' => { keywords => [qw{fall-through bytestowrite utf-8}], sentences => ['the fall-through switches in utf-8 reading'], regex => qr{the [ ] fall-through [ ] switches [ ] in [ ] utf-8 [ ] reading [ ] code [ ] save}msx }); # get usual data about admissible/not admissible GFDL invariant part of license has GFDL_FRAGMENTS => ( is => 'rw', lazy => 1, default => sub { my ($self) = @_; return $self->profile->load_data( 'cruft/gfdl-license-fragments-checks', qr/\s*\~\~\s*/, sub { my ($gfdlsectionsregex,$secondpart) = @_; # allow empty parameters $gfdlsectionsregex //= $EMPTY; # trim both ends $gfdlsectionsregex =~ s/^\s+|\s+$//g; $secondpart //= $EMPTY; my ($acceptonlyinfile,$applytag) = split(/\s*\~\~\s*/, $secondpart, 2); $acceptonlyinfile //= $EMPTY; $applytag //= $EMPTY; # trim both ends $acceptonlyinfile =~ s/^\s+|\s+$//g; $applytag =~ s/^\s+|\s+$//g; # empty first field is everything if (length($gfdlsectionsregex) == 0) { $gfdlsectionsregex = $DOT . $ASTERISK; } # empty regname is none if (length($acceptonlyinfile) == 0) { $acceptonlyinfile = $DOT . $ASTERISK; } my %ret = ( 'gfdlsectionsregex' => qr/$gfdlsectionsregex/xis, 'acceptonlyinfile' => qr/$acceptonlyinfile/xs, ); unless ($applytag eq $EMPTY) { $ret{'tag'} = $applytag; } return \%ret; }); }); # Directory checks. These regexes match a directory that shouldn't be in the # source package and associate it with a tag (minus the leading # source-contains or debian-adds). Note that only one of these regexes # should trigger for any single directory. my @directory_checks = ( [qr{^(.+/)?CVS/?$} => 'cvs-control-dir'], [qr{^(.+/)?\.svn/?$} => 'svn-control-dir'], [qr{^(.+/)?\.bzr/?$} => 'bzr-control-dir'], [qr{^(.+/)?\{arch\}/?$} => 'arch-control-dir'], [qr{^(.+/)?\.arch-ids/?$} => 'arch-control-dir'], [qr{^(.+/)?,,.+/?$} => 'arch-control-dir'], [qr{^(.+/)?\.git/?$} => 'git-control-dir'], [qr{^(.+/)?\.hg/?$} => 'hg-control-dir'], [qr{^(.+/)?\.be/?$} => 'bts-control-dir'], [qr{^(.+/)?\.ditrack/?$} => 'bts-control-dir'], # Special case (can only be triggered for diffs) [qr{^(.+/)?\.pc/?$} => 'quilt-control-dir'], ); # File checks. These regexes match files that shouldn't be in the source # package and associate them with a tag (minus the leading source-contains or # debian-adds). Note that only one of these regexes should trigger for any # given file. my @file_checks = ( [qr{^(.+/)?svn-commit\.(.+\.)?tmp$} => 'svn-commit-file'], [qr{^(.+/)?svk-commit.+\.tmp$} => 'svk-commit-file'], [qr{^(.+/)?\.arch-inventory$} => 'arch-inventory-file'], [qr{^(.+/)?\.hgtags$} => 'hg-tags-file'], [qr{^(.+/)?\.\#(.+?)\.\d+(\.\d+)*$} => 'cvs-conflict-copy'], [qr{^(.+/)?(.+?)\.(r[1-9]\d*)$} => 'svn-conflict-file'], [qr{\.(orig|rej)$} => 'patch-failure-file'], [qr{((^|/)[^/]+\.swp|~)$} => 'editor-backup-file'], ); sub visit_patched_files { my ($self, $item) = @_; return unless $item->is_file; # license string in debian/changelog are probably just change # Ignore these strings in d/README.{Debian,source}. If they # appear there it is probably just "file XXX got removed # because of license Y". $self->full_text_check($item) unless $item->name eq 'debian/changelog' && $item->name eq 'debian/README.Debian' && $item->name eq 'debian/README.source'; # prebuilt-file or forbidden file type $self->hint('source-contains-prebuilt-wasm-binary', $item->name) if $item->file_info =~ m{^WebAssembly \s \(wasm\) \s binary \s module}x; $self->hint('source-contains-prebuilt-windows-binary', $item->name) if $item->file_info =~ m{\b(?:PE(?:32|64)|(?:MS-DOS|COM)\s executable)\b}x; $self->hint('source-contains-prebuilt-silverlight-object', $item->name) if $item->file_info =~ m{^Zip \s archive \s data}x && $item->name =~ m{(?i)\.xac$}x; if ($item->file_info =~ m{^python \s \d(\.\d+)? \s byte-compiled}x) { $self->hint('source-contains-prebuilt-python-object', $item->name); $self->hint('source-is-missing', $item->name) unless $self->find_source($item, {'.py' => '(?i)(?:\.cpython-\d{2}|\.pypy)?\.py[co]$'}); } if ($item->file_info =~ m{\bELF\b}x) { $self->hint('source-contains-prebuilt-binary', $item->name); my %patterns = map { $_ => '(?i)(?:[\.-](?:bin|elf|e|hs|linux\d+|oo?|or|out|so(?:\.\d+)*)|static|_o\.golden)?$' } qw(.asm .c .cc .cpp .cxx .f .F .i .ml .rc .S); $self->hint('source-is-missing', $item->name) unless $self->find_source($item, \%patterns); } if ($item->file_info =~ m{^Macromedia \s Flash}x) { $self->hint('source-contains-prebuilt-flash-object', $item->name); $self->hint('source-is-missing', $item->name) unless $self->find_source($item, {'.as' => '(?i)\.swf$'}); } if ( $item->file_info =~ m{^Composite \s Document \s File}x && $item->name =~ m{(?i)\.fla$}x) { $self->hint('source-contains-prebuilt-flash-project', $item->name); $self->hint('source-is-missing', $item->name) unless $self->find_source($item, {'.as' => '(?i)\.fla$'}); } # do not forget to change also $JS_EXT in file.pm if ($item->name =~ m{(?i)[-._](?:compiled|compressed|lite|min|pack(?:ed)?|prod|umd|yc)\.js$}x ) { $self->hint('source-contains-prebuilt-javascript-object', $item->name); my %patterns = map { $_ => '(?i)(?:[-._](?:compiled|compressed|lite|min|pack(?:ed)?|prod|umd|yc))?\.js$' } qw(.js _orig.js .js.orig .src.js -src.js .debug.js -debug.js -nc.js); $self->hint('source-is-missing', $item->name) unless $self->find_source($item, \%patterns); } return; } sub source { my ($self) = @_; my @added_by_debian; my $prefix; if ($self->processable->native) { @added_by_debian = @{$self->processable->patched->sorted_list}; $prefix = 'source-contains'; } else { my $patched = $self->processable->patched; my $orig = $self->processable->orig; @added_by_debian = grep { !defined $orig->lookup($_->name) } @{$patched->sorted_list}; # remove root quilt control folder and all paths in it # created when 3.0 (quilt) source packages are unpacked @added_by_debian = grep { $_->name !~ m{^.pc/} } @added_by_debian if $self->processable->source_format eq '3.0 (quilt)'; my @common_items = grep { defined $orig->lookup($_->name) } @{$patched->sorted_list}; my @touched_by_debian = grep { $_->md5sum ne $orig->lookup($_->name)->md5sum } @common_items; $self->hint('no-debian-changes') unless @added_by_debian || @touched_by_debian; $prefix = 'debian-adds'; } # ignore lintian test set; should use automatic loop in the future @added_by_debian = grep { $_->name !~ m{^t/} } @added_by_debian if $self->processable->source_name eq 'lintian'; my @directories = grep { $_->is_dir } @added_by_debian; for my $directory (@directories) { my $rule = first_value { $directory->name =~ /$_->[0]/s } @directory_checks; $self->hint("${prefix}-$rule->[1]", $directory->name) if defined $rule; } my @files = grep { $_->is_file } @added_by_debian; for my $file (@files) { my $rule = first_value { $file->name =~ /$_->[0]/s } @file_checks; $self->hint("${prefix}-$rule->[1]", $file->name) if defined $rule; } return; } sub find_source { my ($self, $file, $patternref) = @_; $patternref //= {}; return undef unless $file->is_regular_file; return undef if $self->processable->is_non_free; my %patterns = %{$patternref}; my @alternatives; for my $replacement (keys %patterns) { my $newname = $file->basename; # empty pattern would repeat the last regex compiled my $pattern = $patterns{$replacement}; $newname =~ s/$pattern/$replacement/ if length $pattern; push(@alternatives, $newname) if length $newname; } my $index = $self->processable->patched; my @candidates; # add standard locations push(@candidates, $index->resolve_path('debian/missing-sources/' . $file->name)); push(@candidates, $index->resolve_path('debian/missing-sources/' . $file->basename)); my $dirname = $file->dirname; my $parentname = basename($dirname); my @absolute = ( # libtool '.libs', ".libs/$dirname", # mathjax 'unpacked', # for missing source set in debian 'debian', 'debian/missing-sources', "debian/missing-sources/$dirname" ); for my $absolute (@absolute) { push(@candidates, $index->resolve_path("$absolute/$_")) for @alternatives; } my @relative = ( # likely in current dir $DOT, # for binary object built by libtool $DOUBLE_DOT, # maybe in src subdir './src', # maybe in ../src subdir '../src', "../../src/$parentname", # emscripten './flash-src/src/net/gimite/websocket', ); for my $relative (@relative) { push(@candidates, $file->resolve_path("$relative/$_")) for @alternatives; } my @found = grep { defined } @candidates; # careful with behavior around empty arrays my $source = first_value { $_->name ne $file->name } @found; return $source; } # do basic license check against well known offender # note that it does not replace licensecheck(1) # and is only used for autoreject by ftp-master sub full_text_check { my ($self, $item) = @_; my $contents = $item->decoded_utf8; return unless length $contents; my ($maximum, $position) = $self->maximum_line_length($contents); $self->hint('very-long-line-length-in-source-file',$item->name, "line $position is $maximum characters long (>$VERY_LONG_LINE_LENGTH)") if $maximum > $VERY_LONG_LINE_LENGTH && $item->file_info !~ m{SVG Scalable Vector Graphics image}; my $lowercase = lc($contents); my $clean = clean_text($lowercase); # Check for non-distributable files - this # applies even to non-free, as we still need # permission to distribute those. # nvdia opencv infamous license return if $self->check_for_single_bad_license($item, $lowercase, $clean, 'license-problem-nvidia-intellectual', \%NVIDIA_LICENSE); unless ($self->processable->is_non_free) { for my $tag_name (keys %NON_FREE_LICENSES) { return if $self->check_for_single_bad_license($item, $lowercase, $clean, $tag_name, $NON_FREE_LICENSES{$tag_name}); } } $self->check_html_cruft($item, $lowercase) if $item->basename =~ /\.(?:x?html?\d?|xht)$/i; if ($self->_is_javascript_but_not_minified($item->name)) { # exception sphinx documentation if ($item->basename eq 'searchindex.js') { if ($lowercase =~ m/\A\s*search\.setindex\s* \s* \(\s*\{/xms) { $self->hint('source-contains-prebuilt-sphinx-documentation', $item->dirname); return; } } if ($item->basename eq 'search_index.js') { if ($lowercase =~ m/\A\s*var\s*search_index\s*=/xms) { $self->hint('source-contains-prebuilt-pandoc-documentation', $item->dirname); return; } } # false positive in dx package at least elsif ($item->basename eq 'srchidx.js') { return if $lowercase=~ m/\A\s*profiles \s* = \s* new \s* Array\s*\(/xms; } # see #745152 # Be robust check also .js elsif ($item->basename eq 'deployJava.js') { if ($lowercase =~ m/(?:\A|\v)\s*var\s+deployJava\s*=\s*function/xmsi) { $self->hint('source-is-missing', $item->name) unless $self->find_source($item, {'.txt' => '(?i)\.js$', $EMPTY => $EMPTY}); return; } } # https://github.com/rafaelp/css_browser_selector is actually the # original source. (#874381) elsif ($lowercase =~ m/css_browser_selector\(/) { return; } # Avoid false-positives in Jush's syntax highlighting definition files. elsif ($lowercase =~ m/jush\.tr\./) { return; } # now search hidden minified $self->warn_long_lines($item, $lowercase); } # search link rel header if ($lowercase =~ / \Q rel="copyright" \E /msx) { my $href = $lowercase; $href =~ m{}xmsi; my $url = $1 // $EMPTY; $self->hint('license-problem-cc-by-nc-sa', $item->name) if $url =~ m{^https?://creativecommons.org/licenses/by-nc-sa/}; } return; } # check javascript in html file sub check_html_cruft { my ($self, $item, $lowercase) = @_; my $blockscript = $lowercase; my $indexscript; while (($indexscript = index($blockscript, ' $ITEM_NOT_FOUND) { $blockscript = substr($blockscript,$indexscript); # sourced script ok if ($blockscript =~ m{\A]*?src="[^"]+?"[^>]*?>}sm) { $blockscript = substr($blockscript,$+[0]); next; } # extract script if ($blockscript =~ m{]*?>(.*?)}sm) { $blockscript = substr($blockscript,$+[0]); my $lcscript = $1; $self->check_js_script($item, $lcscript); return 0 if $self->warn_long_lines($item, $lcscript); next; } # here we know that we have partial script. Do the check nevertheless # first check if we have the full