improve linter for diagnostics

Message ID 7b011f04-43c3-9c24-7295-6e6358fea3a9@gmx.de
State New
Headers show
Series
  • improve linter for diagnostics
Related show

Commit Message

Roland Illig May 14, 2019, 8:37 p.m.
The linter in contrib/ checks all messages that the GCC user may see
whether they conform roughly to the GCC Guidelines for Diagnostics.

	* contrib/check-internal-format-escaping.py: improve output of
	the linter by providing short rationales for the warnings;
	detect space before punctuation; temporarily disable the most
	frequent diagnostics, in order to first focus on the ones that
	can be fixed more quickly

Patch

Index: contrib/check-internal-format-escaping.py
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- contrib/check-internal-format-escaping.py	(revision c7bf741f801cfbdf56bf1ee11a74ca0abe1e8c16)
+++ contrib/check-internal-format-escaping.py	(date 1557864788799)
@@ -41,18 +41,19 @@ 
          diagnostic_id: str, diagnostic: str, include_msgid=True):
     """
     To suppress a warning for a particular message,
-    add a line "#, gcclint:ignore:{diagnostic_id}" to the message.
+    add a comment "# gcclint:ignore:{diagnostic_id}" to the message.
     """
 
-    if f'gcclint:ignore:{diagnostic_id}' in msg.flags:
+    marker = f'gcclint:ignore:{diagnostic_id}'
+    if marker in msg.comment:
         return
 
-    seen_warnings[diagnostic] += 1
+    seen_warnings[diagnostic_id] += 1
 
     if include_msgid:
-        print(f'{location(msg)}: {diagnostic} in {repr(msg.msgid)}')
+        print(f'{location(msg)}: {diagnostic} in {msg.msgid!r} [{marker}]')
     else:
-        print(f'{location(msg)}: {diagnostic}')
+        print(f'{location(msg)}: {diagnostic} [{marker}]')
 
 
 def lint_gcc_internal_format(msg: polib.POEntry):
@@ -68,6 +69,9 @@ 
         before = msgid[:m.start(0)]
         return before.count("%<") == before.count("%>")
 
+    def find_outside_quotes(regex: str, s: str):
+        return filter(outside_quotes, re.finditer(regex, s))
+
     def lint_matching_placeholders():
         """
         Warns when literal values in placeholders are not exactly equal
@@ -89,36 +93,31 @@ 
         in_msgstr = re.findall('%<[^%]+%>', msg.msgstr)
 
         if set(in_msgid) != set(in_msgstr):
-            warn(msg,
-                 'placeholder-mismatch',
+            warn(msg, 'placeholder-mismatch',
                  f'placeholder mismatch: msgid has {in_msgid}, '
                  f'msgstr has {in_msgstr}',
                  include_msgid=False)
 
     def lint_option_outside_quotes():
-        for match in re.finditer(r'\S+', msgid):
+        for match in find_outside_quotes(r'\S+', msgid):
             part = match.group()
-            if not outside_quotes(match):
-                continue
 
-            if part.startswith('-'):
-                if len(part) >= 2 and part[1].isalpha():
-                    if part == '-INF':
-                        continue
-
-                    warn(msg,
-                         'option-outside-quotes',
-                         'command line option outside %<quotes%>')
+            if len(part) >= 2 and part[0] == '-' \
+                    and part[1].isalpha() and part != '-INF':
+                warn(msg, 'option-outside-quotes',
+                     'command line options '
+                     'should be enclosed in %<quotes%>')
 
             if part.startswith('__builtin_'):
-                warn(msg,
-                     'builtin-outside-quotes',
-                     'builtin function outside %<quotes%>')
+                warn(msg, 'builtin-outside-quotes',
+                     'names of builtin functions '
+                     'should be enclosed in %<quotes%>')
 
     def lint_plain_apostrophe():
-        for match in re.finditer("[^%]'", msgid):
-            if outside_quotes(match):
-                warn(msg, 'apostrophe', 'apostrophe without leading %')
+        if any(find_outside_quotes("[^%]'", msgid)):
+            warn(msg, 'apostrophe',
+                 'apostrophes should be written as %\' '
+                 'to make them typographically correct')
 
     def lint_space_before_quote():
         """
@@ -129,26 +128,45 @@ 
 
         for match in re.finditer("(.?[a-zA-Z0-9])%<", msgid):
             if match.group(1) != '%s':
-                warn(msg,
-                     'no-space-before-quote',
-                     '%< directly following a letter or digit')
+                warn(msg, 'space-before-quote',
+                     'a letter or digit should not be '
+                     'directly followed by a %<')
+
+    def lint_space_before_punctuation():
+        """
+        In written English, there is no space before a colon.
+        In text enclosed in %<quotes%> it's ok though.
+        """
+
+        for match in find_outside_quotes(" [.,:;?]", msgid):
+            warn(msg, 'space-before-punctuation',
+                 f'there should be no space before punctuation '
+                 f'{match.group()!r}')
 
     def lint_underscore_outside_quotes():
         """
         An underscore outside of quotes is used in several contexts,
-        and many of them violate the GCC Guidelines for Diagnostics:
+        and several of them violate the GCC Guidelines for Diagnostics
+        by not speaking in terms of the language being compiled:
 
         * names of GCC-internal compiler functions
         * names of GCC-internal data structures
-        * static_cast and the like (which are legitimate)
+
+        In some cases the underscore is used legitimately.
+        It could be quoted but often doesn't need to since it is
+        visually distinct enough from plain English text. Often
+        there are similar messages though, using keywords that are
+        valid English words, too. This can be confusing for both
+        GCC users and translators.
+
+        * static_cast and related C++ casts
+        * x86_64
         """
 
-        for match in re.finditer("_", msgid):
-            if outside_quotes(match):
-                warn(msg,
-                     'underscore-outside-quotes',
-                     'underscore outside of %<quotes%>')
-                return
+        for _ in find_outside_quotes("_", msgid):
+            warn(msg, 'underscore-outside-quotes',
+                 'underscore outside of %<quotes%>')
+            return
 
     def lint_may_not():
         """
@@ -158,31 +176,23 @@ 
         """
 
         if re.search(r'\bmay not\b', msgid):
-            warn(msg,
-                 'ambiguous-may-not',
+            warn(msg, 'ambiguous-may-not',
                  'the term "may not" is ambiguous')
 
     def lint_unbalanced_quotes():
         if msgid.count("%<") != msgid.count("%>"):
-            warn(msg,
-                 'unbalanced-quotes',
+            warn(msg, 'unbalanced-quotes',
                  'unbalanced %< and %> quotes')
 
         if msg.translated():
             if msg.msgstr.count("%<") != msg.msgstr.count("%>"):
-                warn(msg,
-                     'unbalanced-quotes',
+                warn(msg, 'unbalanced-quotes',
                      'unbalanced %< and %> quotes')
 
     def lint_single_space_after_sentence():
-        """
-        After a sentence there should be two spaces.
-        """
-
         if re.search(r'[.] [A-Z]', msgid):
-            warn(msg,
-                 'single-space-after-sentence',
-                 'single space after sentence')
+            warn(msg, 'single-space-after-sentence',
+                 'sentences should be ended using two spaces')
 
     def lint_non_canonical_quotes():
         """
@@ -190,14 +200,14 @@ 
         """
         match = re.search("%<%s%>|'%s'|\"%s\"|`%s'", msgid)
         if match:
-            warn(msg,
-                 'non-canonical-quotes',
+            warn(msg, 'non-canonical-quotes',
                  f'placeholder {match.group()} should be written as %qs')
 
     lint_option_outside_quotes()
     lint_plain_apostrophe()
     lint_space_before_quote()
-    lint_underscore_outside_quotes()
+    lint_space_before_punctuation()
+    False and lint_underscore_outside_quotes()
     lint_may_not()
     lint_unbalanced_quotes()
     lint_matching_placeholders()
@@ -205,7 +215,7 @@ 
     lint_non_canonical_quotes()
 
 
-def lint_diagnostics_differing_only_in_placeholders(po: polib.POFile):
+def lint_same_pattern(po: polib.POFile):
     """
     Detects messages that are structurally the same, except that they
     use different plain strings inside %<quotes%>. These messages can
@@ -230,8 +240,8 @@ 
         prev = seen[normalized]
         warn(msg,
              'same-pattern',
-             f'same pattern for {repr(msgid)} and '
-             f'{repr(prev.msgid)} in {location(prev)}',
+             f'the message {msgid!r} is structurally equal to '
+             f'{prev.msgid!r} in {location(prev)}',
              include_msgid=False)
 
 
@@ -243,23 +253,27 @@ 
             if 'gcc-internal-format' in msg.flags:
                 lint_gcc_internal_format(msg)
 
-    lint_diagnostics_differing_only_in_placeholders(po)
+    False and lint_same_pattern(po)
+
+
+def print_summary():
+    print()
+    print('summary:')
+    for entry in seen_warnings.most_common():
+        if entry[1] > 1:
+            print(f'{entry[1]:>5} {entry[0]}')
 
 
 def main():
     parser = argparse.ArgumentParser(description='')
-    parser.add_argument('file', help='pot file')
+    parser.add_argument('file', help='pot or po file')
 
     args = parser.parse_args()
 
     po = polib.pofile(args.file)
     lint_file(po)
 
-    print()
-    print('summary:')
-    for entry in seen_warnings.most_common():
-        if entry[1] > 1:
-            print(f'{entry[1]}\t{entry[0]}')
+    print_summary()
 
 
 if __name__ == '__main__':