From 8ca2e06b72d8f5466c0a727433f04692c777c3a4 Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Tue, 26 Jul 2022 12:19:55 -0500 Subject: [PATCH] ICU-21755 commit checker: add support for COMMIT_METADATA.md file - new option, --commit-metadata= with path to metadata file - new option, --fix-version=41 (used for SKIP sections) - scaffolding for 'bad commits' list - new module CommitMetadata with unit tests - sample file format TEST_COMMIT_METADATA.md - such commits are skipped - refactored the commit skipping part (formerly used for cherry pick skips) - add a report section for skipped commits - add a cache for JIRA queries (for dev use): --cache-for-dev "/tmp/cldr-commit-cache" - optional - add an 'excluded commits' section at the bottom - make sure commit metadata is used to update ticket IDs and messages. --- tools/commit-checker/Pipfile | 3 +- tools/commit-checker/TEST_COMMIT_METADATA.md | 25 ++++ tools/commit-checker/check.py | 135 +++++++++++++++---- tools/commit-checker/commit_metadata.py | 83 ++++++++++++ tools/commit-checker/test_commit_metadata.py | 65 +++++++++ 5 files changed, 287 insertions(+), 24 deletions(-) create mode 100644 tools/commit-checker/TEST_COMMIT_METADATA.md create mode 100644 tools/commit-checker/commit_metadata.py create mode 100644 tools/commit-checker/test_commit_metadata.py diff --git a/tools/commit-checker/Pipfile b/tools/commit-checker/Pipfile index 2abd5e63bf0..d7c9882fe4e 100644 --- a/tools/commit-checker/Pipfile +++ b/tools/commit-checker/Pipfile @@ -1,5 +1,5 @@ # Copyright (C) 2018 and later: Unicode, Inc. and others. -# License & terms of use: http://www.unicode.org/copyright.html +# License & terms of use: http://www.unicode.org/copyright.html # Author: shane@unicode.org [[source]] @@ -15,3 +15,4 @@ gitpython = "*" jira = "*" [dev-packages] +pytest = "*" diff --git a/tools/commit-checker/TEST_COMMIT_METADATA.md b/tools/commit-checker/TEST_COMMIT_METADATA.md new file mode 100644 index 00000000000..b6e35923c1c --- /dev/null +++ b/tools/commit-checker/TEST_COMMIT_METADATA.md @@ -0,0 +1,25 @@ +# - Commit Metadata + +### Copyright (C) 2022 and later: Unicode, Inc. and others. +### License & terms of use: http://www.unicode.org/copyright.html +### All lines starting with one hash (#) are significant +### All lines starting with a dash (-) are significant. +### +### The following list of commits is parsed by the ICU/CLDR commit checker. +### It has the following structure: +### `- ` + +- 00decafbad CLDR-0000 Bad commit message +- 50257a7a3eba7bbed634961a2aa74e77b12810bf - Early commit, no bug id +- 283dc84d81cfc47fb15cd664fce4b0151d070ea6 - Early commit, no bug id +- 02198373a591a15b804127acddd32582ec985b7e CLDR-15852 v42 merge commit + + +### The following are items to skip for a certain CLDR version +### Format: `# SKIP v00` followed by a list of commits to skip for that version + +# SKIP v41 + +- 56ca5d5 CLDR-14877 split ticket + + diff --git a/tools/commit-checker/check.py b/tools/commit-checker/check.py index 643ede3da09..104f7cfab83 100644 --- a/tools/commit-checker/check.py +++ b/tools/commit-checker/check.py @@ -9,11 +9,14 @@ import os import re import sys import datetime +from hashlib import sha256 +import pickle from enum import Enum from collections import namedtuple from git import Repo from jira import JIRA +from commit_metadata import CommitMetadata # singleCount = 0 @@ -110,6 +113,11 @@ flag_parser.add_argument( help = "Hostname of the Jira instance", default = "unicode-org.atlassian.net" ) +flag_parser.add_argument( + "--commit-metadata", + help = "path to COMMIT_METADATA.md", + default = "COMMIT_METADATA.md" +) flag_parser.add_argument( "--jira-username", help = "Username to use for authenticating to Jira", @@ -125,6 +133,16 @@ flag_parser.add_argument( help = "JQL query load tickets; this should match tickets expected to correspond to the commits being checked. Example: 'project=ICU and fixVersion=63.1'; set fixVersion to the upcoming version.", required = True ) +flag_parser.add_argument( + "--fix-version", + help = "Assumed Fix version. Used for commit metadata", + required = False +) +flag_parser.add_argument( + "--cache-for-dev", + help = "DEV USE ONLY: prefix of path to a jira cache", + required = False +) flag_parser.add_argument( "--github-url", help = "Base URL of the GitHub repo", @@ -171,19 +189,26 @@ def pretty_print_issue(issue, type=None, **kwargs): if issue.issue.fields.components and len(issue.issue.fields.components) > 0: print("\t- Component(s): " + (' '.join(sorted([str(component.name) for component in issue.issue.fields.components])))) -def get_commits(repo_root, rev_range, **kwargs): +def get_commits(commit_metadata_obj, repo_root, rev_range, **kwargs): """ Yields an ICUCommit for each commit in the user-specified rev-range. """ repo = Repo(repo_root) for commit in repo.iter_commits(rev_range): - match = re.search(r"^(\w+-\d+) ", commit.message) - if match: - issue_id = match.group(1) - # print("@@@ %s = %s / %s" % (issue_id, commit, commit.summary), file=sys.stderr) - yield ICUCommit(issue_id, commit) - else: - yield ICUCommit(None, commit) + issue_id = None + if commit_metadata_obj: + # read from the non-skip list + from_commit_metadata = commit_metadata_obj.get_commit_info(commit.hexsha, skip=None) + if from_commit_metadata: + (sha, issue_id, message) = from_commit_metadata + if message: + # Update message if found + commit.message = commit.message + "\nCOMMIT_METADATA: " + message + if not issue_id: + match = re.search(r"^(\w+-\d+) ", commit.message) + if match: + issue_id = match.group(1) + yield ICUCommit(issue_id, commit) def get_cherrypicked_commits(repo_root, rev_range, **kwargs): """ @@ -255,6 +280,40 @@ def get_jira_issues(jira_query, **kwargs): start += batch_size jira_issue_map = dict() # loaded in main() +commit_metadata = None +fixversion_to_skip = None + +def get_issue_cache_path(args): + return '%s-%s.pickle' % (args.cache_for_dev, sha256(args.jira_query.encode()).hexdigest()) + +def load_issues(args): + global jira_issue_ids, jira_issue_map, closed_jira_issue_ids + + if args.cache_for_dev: + issue_cache_path = get_issue_cache_path(args) + if os.path.exists(issue_cache_path): + with open(issue_cache_path, 'rb') as f: + (issues, jira_issue_ids, closed_jira_issue_ids, jira_issue_map) = pickle.load(f) + print("Loaded jira cache to " + issue_cache_path, file=sys.stderr) + return issues + # proceed to load + issues = list(get_jira_issues(**vars(args))) + # add all queried issues to the cache + for issue in issues: + jira_issue_map[issue.issue_id] = issue + # only the issue ids in-query + jira_issue_ids = set(issue.issue_id for issue in issues) + # only the closed issue ids in-query + closed_jira_issue_ids = set(issue.issue_id for issue in issues if issue.is_closed) + + return issues + +def save_issues_to_cache(args, issues): + if args.cache_for_dev: + issue_cache_path = get_issue_cache_path(args) + with open(issue_cache_path, 'wb') as f: + pickle.dump((issues, jira_issue_ids, closed_jira_issue_ids, jira_issue_map), f) + print("Wrote cache to " + issue_cache_path, file=sys.stderr) def get_single_jira_issue(issue_id, **kwargs): """ @@ -293,6 +352,7 @@ def print_sectionheader(section): #print("### %s%s" % (aname(section), section)) def main(): + global fixversion_to_skip args = flag_parser.parse_args() print("TIP: Have you pulled the latest main? This script only looks at local commits.", file=sys.stderr) if not args.jira_username or not args.jira_password: @@ -301,30 +361,40 @@ def main(): else: authenticated = True + if args.fix_version: + fixversion_to_skip = 'v%s' % args.fix_version + args.fixversion_to_skip = fixversion_to_skip + + if args.commit_metadata and os.path.exists(args.commit_metadata): + commit_metadata = CommitMetadata(args.commit_metadata) + else: + commit_metadata = CommitMetadata() # null + args.commit_metadata_obj = commit_metadata + # exclude these, already merged to old maint - excludeAlreadyMergedToOldMaint = get_cherrypicked_commits(**vars(args)) + exclude_already_merged_to_old_maint = get_cherrypicked_commits(**vars(args)) commits = list(get_commits(**vars(args))) - issues = list(get_jira_issues(**vars(args))) + + def exclude_commit_sha(sha) -> bool: + """Return true if this sha is excluded""" + exclude = (sha in exclude_already_merged_to_old_maint) or (commit_metadata.get_commit_info(sha, skip=fixversion_to_skip) is not None) + return exclude # commit_issue_ids is all commits in the git query. Excluding cherry exclusions. - commit_issue_ids = set(commit.issue_id for commit in commits if commit.issue_id is not None and commit.commit.hexsha not in excludeAlreadyMergedToOldMaint) + commit_issue_ids = set(commit.issue_id for commit in commits if commit.issue_id is not None and not exclude_commit_sha(commit.commit.hexsha)) # which issues have commits that were excluded - excluded_commit_issue_ids = set(commit.issue_id for commit in commits if commit.issue_id is not None and commit.commit.hexsha in excludeAlreadyMergedToOldMaint) + excluded_commit_issue_ids = set(commit.issue_id for commit in commits if commit.issue_id is not None and exclude_commit_sha(commit.commit.hexsha)) + excluded_commits = set(commit for commit in commits if exclude_commit_sha(commit.commit.hexsha)) # grouped_commits is all commits and issue_ids in the git query, regardless of issue status # but NOT including cherry exclusions grouped_commits = [ - (issue_id, [commit for commit in commits if commit.issue_id == issue_id and commit.commit.hexsha not in excludeAlreadyMergedToOldMaint]) + (issue_id, [commit for commit in commits if commit.issue_id == issue_id and not exclude_commit_sha(commit.commit.hexsha)]) for issue_id in sorted(commit_issue_ids) ] - # add all queried issues to the cache - for issue in issues: - jira_issue_map[issue.issue_id] = issue - # only the issue ids in-query - jira_issue_ids = set(issue.issue_id for issue in issues) - # only the closed issue ids in-query - closed_jira_issue_ids = set(issue.issue_id for issue in issues if issue.is_closed) + + issues = load_issues(args) # keep track of issues that we already said have no commit. no_commit_ids = set() @@ -376,7 +446,7 @@ def main(): no_commit_ids.add(issue.issue_id) pretty_print_issue(issue, type=CLOSED_NO_COMMIT, **vars(args)) if issue.issue_id in excluded_commit_issue_ids: - print("\t - **Note: Has cherry-picked commits. Fix Version may be wrong.**") + print("\t - **Note: Has excluded/cherry-picked commits. Fix Version may be wrong.**") print() if not found: print("*Success: No problems in this category!*") @@ -451,8 +521,6 @@ def main(): print("##### Commits with Issue %s" % issue_id) print() for commit in commits: - if(commit.commit.hexsha in excludeAlreadyMergedToOldMaint): - print("@@@ ALREADY MERGED") pretty_print_commit(commit, **vars(args)) print() if not found: @@ -542,5 +610,26 @@ def main(): print("## Total Problems: %s" % total_problems) print("## Issues under review: %s" % len(issues_in_review)) # not counted as a problem. + if fixversion_to_skip: + print("## fix version (for COMMIT_METADATA SKIP) %s" % fixversion_to_skip) + + if len(excluded_commits) > 0: + print() + print("## Excluded commits: %d" % len(excluded_commits)) + for commit in excluded_commits: + pretty_print_commit(commit, **vars(args)) + from_commit_metadata = commit_metadata.get_commit_info(commit.commit.hexsha, skip=fixversion_to_skip) + if from_commit_metadata: + (sha, id, message) = from_commit_metadata + # Print any notes in the skip list here. + if id: + print("\t- COMMIT_METADATA.md: %s" % id) + if message: + print("\t- COMMIT_METADATA.md: %s" % message) + if commit.commit.hexsha in exclude_already_merged_to_old_maint: + print("\t- NOTE: excluded due to already being merged to old maint") + # save out the dev cache here, so that any 1-off issues are also included in the cache + save_issues_to_cache(args, issues) + if __name__ == "__main__": main() diff --git a/tools/commit-checker/commit_metadata.py b/tools/commit-checker/commit_metadata.py new file mode 100644 index 00000000000..62c49c6a6ad --- /dev/null +++ b/tools/commit-checker/commit_metadata.py @@ -0,0 +1,83 @@ +# Copyright (C) 2022 and later: Unicode, Inc. and others. +# License & terms of use: http://www.unicode.org/copyright.html +# Author: srloomis@unicode.org + +from re import compile +from typing import Tuple +import typing + +commit_line = compile('^\s*-\s+([0-9a-f]+)\s+([^\s]+)\s+(.*)$') # '- ' +verb_line = compile('^\s*#\s+([A-Z]+)\s+(.*)$') # '# ACTION any other content…' + +class CommitMetadata(): + def __init__(self, metadata_file=None) -> None: + skipset = {} + fixset = [] # sha, id, message + current_skip = None + if metadata_file: + with open(metadata_file, 'r') as f: + for line in f.readlines(): + line = line.strip() + m = commit_line.match(line) + if m: + sha = m.group(1) + id = m.group(2) + message = m.group(3) + fixset.append((sha, id, message)) + if current_skip: + skipset[current_skip].append((sha, id, message)) + continue + m = verb_line.match(line) + if m: + action = m.group(1) + content = m.group(2) + # for now we only support SKIP + assert action == "SKIP", "Unknown action %s in %s (expected 'SKIP')" % (action, line) + current_skip = content + assert content not in skipset, "Error: two sections like %s" % line + skipset[content] = [] # initialize this + continue + self.fixset = fixset + self.skipset = skipset + pass + + def get_commit_info(self, commit, skip=None) -> typing.Tuple[str, str, str]: + """Return an override line given a commit + + Args: + commit (str): hash or short hash of a commit + skip (str): If set, a version to skip such as 'v41' (this will match a section - SKIP v41) + + Returns: + sha (str): original sha from line + id (str): ticket id or hash + message (str): override message + """ + if skip: + if not skip in self.skipset: + return None + return CommitMetadata.match_list(commit, self.skipset[skip]) + else: + return CommitMetadata.match_list(commit, self.fixset) + + @staticmethod + def match_list(commit, l) -> any: + """Find the first match in a list of commits + + Args: + commit (str): short or long commit string + l (list): list of tuples, where item 0 is the commit + + Returns: + any: matching list member, or None + """ + for i in l: + if CommitMetadata.match_commit(commit, i[0]): + return i + return None + + @staticmethod + def match_commit(h1, h2) -> bool: + """return true if the prefix of the hashes are the same""" + comm = min(len(h1), len(h2)) + return h1[0:comm] == h2[0:comm] diff --git a/tools/commit-checker/test_commit_metadata.py b/tools/commit-checker/test_commit_metadata.py new file mode 100644 index 00000000000..536286e0a08 --- /dev/null +++ b/tools/commit-checker/test_commit_metadata.py @@ -0,0 +1,65 @@ +# Copyright (C) 2022 and later: Unicode, Inc. and others. +# License & terms of use: http://www.unicode.org/copyright.html +# Author: srloomis@unicode.org + +from commit_metadata import CommitMetadata + +def test_hash(): + assert CommitMetadata.match_commit("512a679aba", "512a679aba22e144a4443df5f8f304e4f8b39054") + assert CommitMetadata.match_commit("512a679aba", "512a679aba") + assert CommitMetadata.match_commit("512a679aba22e144a4443df5f8f304e4f8b39054", "512a679aba22e144a4443df5f8f304e4f8b39054") + assert not CommitMetadata.match_commit("512a679aba", "16aae80199") + assert not CommitMetadata.match_commit("512a679aba22e144a4443df5f8f304e4f8b39054", "16aae80199") + +def test_matchcommit(): + assert CommitMetadata.match_list("512a679aba", [("00decafbad", 44), ("512a679aba", 99)]) == ("512a679aba", 99) + assert CommitMetadata.match_list("512a679aba22e144a4443df5f8f304e4f8b39054", [("00decafbad", 44), ("512a679aba", 99)]) == ("512a679aba", 99) + assert CommitMetadata.match_list("512a679aba", [("00decafbad", 44), ("512a679aba22e144a4443df5f8f304e4f8b39054", 99)]) == ("512a679aba22e144a4443df5f8f304e4f8b39054", 99) + assert not CommitMetadata.match_list("16aae80199", [("00decafbad", 44), ("512a679aba", 99)]) + assert not CommitMetadata.match_list("16aae80199", [("00decafbad", 44), ("512a679aba22e144a4443df5f8f304e4f8b39054", 99)]) + +def test_read(): + m = CommitMetadata(metadata_file="./TEST_COMMIT_METADATA.md") + assert m + + assert not m.get_commit_info('00000000') # not in list + assert m.get_commit_info('00decafbad') + assert m.get_commit_info('00decafbad')[1].startswith('CLDR-0000') + assert m.get_commit_info('56ca5d5') + assert m.get_commit_info('56ca5d5')[1].startswith('CLDR-14877') + # short or long, same + assert m.get_commit_info('56ca5d5') == m.get_commit_info('56ca5d563cf57990a7598f570cb9be51956cb9de') + + # skip list + assert m.get_commit_info('56ca5d5', skip='v41') + assert not m.get_commit_info('56ca5d5', skip='v42') + assert m.get_commit_info('56ca5d563cf57990a7598f570cb9be51956cb9de', skip='v41') + assert not m.get_commit_info('56ca5d563cf57990a7598f570cb9be51956cb9de', skip='v42') + assert not m.get_commit_info('00decafbad', 'v41') + +def test_null_read(): + m = CommitMetadata(metadata_file=None) + assert m + + # function with no info + assert not m.get_commit_info('00000000') # not in list + assert not m.get_commit_info('00decafbad') + assert not m.get_commit_info('56ca5d5') + assert not m.get_commit_info('56ca5d5', skip='v41') + assert not m.get_commit_info('56ca5d5', skip='v42') + +def test_parse_42(): + m = CommitMetadata(metadata_file="./TEST_COMMIT_METADATA.md") + assert m + + # no skip + info = m.get_commit_info('02198373a591a15b804127acddd32582ec985b7e') + assert info + assert info[0] == '02198373a591a15b804127acddd32582ec985b7e' + assert info[1] == 'CLDR-15852' + assert info[2] == 'v42 merge commit' + + # skip + info = m.get_commit_info('02198373a591a15b804127acddd32582ec985b7e', skip='v42') + # not found because it isn't in SKIP v42 + assert not info