[15/32] new C++ lexer

Message ID 7a603e9d-0760-9aa8-11f0-7f8788736909@acm.org
State New
Headers show
Series
  • C++ 20 Modules
Related show

Commit Message

Nathan Sidwell Nov. 3, 2020, 9:15 p.m.
Importation of header-units requires the tokenizer to recognize such 
imports during tokenization so that their macros become available.  The 
C++ parser currently tokenizes (and will continue to tokenize) the entire 
file before beginning C++ parsing.

This implements an explicit coroutine to manage that recognition.  It is 
used both for C++ parsing proper and for just preprocessing.  When a 
module-significant control line is observed, we call into the module 
machinery to handle it.  Usually, we'll also call again later when 
parsing that declaration.


-- 
Nathan Sidwell

Patch

diff --git c/gcc/cp/lex.c w/gcc/cp/lex.c
index 8a69bc4f170..013cbadf625 100644
--- c/gcc/cp/lex.c
+++ w/gcc/cp/lex.c
@@ -32,6 +32,7 @@  along with GCC; see the file COPYING3.  If not see
 #include "c-family/c-objc.h"
 #include "gcc-rich-location.h"
 #include "cp-name-hint.h"
+#include "langhooks.h"
 
 static int interface_strcmp (const char *);
 static void init_cp_pragma (void);
@@ -378,7 +381,206 @@  interface_strcmp (const char* s)
   return 1;
 }
 
-
+/* State machine fed one token per resume () call; tracks whether we are
+   inside a module/import control line.  A hand-coded co-routine.  */
+
+struct token_coro
+{
+  enum state
+  {
+   idle,  /* Looking for an export/module/import keyword.  */
+   module_first,  /* Expecting the first token after the keyword.  */
+   module_cont,  /* Inside the module name.  */
+   module_end,  /* Name over; waiting for CPP_PRAGMA_EOL.  */
+  };
+
+  enum state state : 8;
+  bool is_import : 1;  /* Set by RID__IMPORT.  */
+  bool got_export : 1;  /* Set by RID__EXPORT.  */
+  bool got_colon : 1;  /* A CPP_COLON has been seen in this name.  */
+  bool want_dot : 1;  /* Next name token must be [.:], not a CPP_NAME.  */
+
+  location_t token_loc;  /* LOC of the module/import keyword.  */
+  cpp_reader *reader;  /* Handed to preprocess_module.  */
+  module_state *module;  /* First non-NULL preprocess_module result.  */
+  module_state *import;  /* get_module result for the name so far.  */
+
+  token_coro (cpp_reader *reader)
+    : state (idle), is_import (false),
+    got_export (false), got_colon (false), want_dot (false),
+    token_loc (UNKNOWN_LOCATION),
+    reader (reader), module (NULL), import (NULL)
+  {
+  };
+
+  /* Process the next token; returns PT_begin_pragma or 0.  We cannot see
+     CPP_EOF inside a pragma -- a CPP_PRAGMA_EOL always happens.  */
+  uintptr_t resume (int type, int keyword, tree value, location_t loc)
+  {
+    unsigned res = 0;
+
+    switch (state)
+      {
+      case idle:
+	if (type == CPP_KEYWORD)
+	  switch (keyword)
+	    {
+	    default:
+	      break;
+
+	    case RID__EXPORT:
+	      got_export = true;
+	      res = lang_hooks::PT_begin_pragma;
+	      break;
+
+	    case RID__IMPORT:
+	      is_import = true;
+	      /* FALLTHRU */
+	    case RID__MODULE:
+	      state = module_first;
+	      want_dot = false;
+	      got_colon = false;
+	      token_loc = loc;
+	      import = NULL;
+	      if (!got_export)  /* Else export already returned it.  */
+		res = lang_hooks::PT_begin_pragma;
+	      break;
+	    }
+	break;
+
+      case module_first:
+	if (is_import && type == CPP_HEADER_NAME)
+	  {
+	    /* A header name.  The preprocessor will have already
+	       done include searching and canonicalization.  */
+	    state = module_end;
+	    goto header_unit;
+	  }
+	
+	if (type == CPP_PADDING || type == CPP_COMMENT)  /* Skip these.  */
+	  break;
+
+	state = module_cont;
+	if (type == CPP_COLON && module)  /* Leading ':'.  */
+	  {
+	    got_colon = true;
+	    import = module;  /* Seed IMPORT with MODULE.  */
+	    break;
+	  }
+	/* FALLTHROUGH  */
+
+      case module_cont:
+	switch (type)
+	  {
+	  case CPP_PADDING:
+	  case CPP_COMMENT:
+	    break;
+
+	  default:
+	    /* If we ever need to pay attention to attributes for
+	       header modules, more logic will be needed.  */
+	    state = module_end;
+	    break;
+
+	  case CPP_COLON:
+	    if (got_colon)  /* Second colon.  */
+	      state = module_end;
+	    got_colon = true;
+	    /* FALLTHROUGH  */
+	  case CPP_DOT:
+	    if (!want_dot)  /* Separator not after a name.  */
+	      state = module_end;
+	    want_dot = false;
+	    break;
+
+	  case CPP_PRAGMA_EOL:
+	    goto module_end;  /* Line ended; process now.  */
+
+	  case CPP_NAME:
+	    if (want_dot)
+	      {
+		/* Got name instead of [.:].  */
+		state = module_end;
+		break;
+	      }
+	  header_unit:
+	    import = get_module (value, import, got_colon);
+	    want_dot = true;
+	    break;
+	  }
+	break;
+
+      case module_end:
+	if (type == CPP_PRAGMA_EOL)  /* All other tokens are ignored.  */
+	  {
+	  module_end:;
+	    /* End of the directive, handle the name.  */
+	    if (import)
+	      if (module_state *m
+		  = preprocess_module (import, token_loc, module != NULL,
+				       is_import, got_export, reader))
+		if (!module)  /* Keep only the first.  */
+		  module = m;
+
+	    is_import = got_export = false;  /* Reset for the next line.  */
+	    state = idle;
+	  }
+	break;
+      }
+
+    return res;
+  }
+};
+
+/* Tear down the coro in DATA_, or create one if modules_p ().  */
+
+uintptr_t
+module_token_cdtor (cpp_reader *pfile, uintptr_t data_)
+{
+  if (token_coro *coro = reinterpret_cast<token_coro *> (data_))
+    {
+      preprocessed_module (pfile);
+      delete coro;
+      data_ = 0;
+    }
+  else if (modules_p ())  /* Initialize.  */
+    data_ = reinterpret_cast <uintptr_t > (new token_coro (pfile));
+
+  return data_;  /* The new coro, or 0.  */
+}
+
+uintptr_t
+module_token_lang (int type, int keyword, tree value, location_t loc,
+		   uintptr_t data_)  /* DATA_ is a token_coro *.  */
+{
+  token_coro *coro = reinterpret_cast <token_coro *> (data_);
+  return coro->resume (type, keyword, value, loc);  /* No null check.  */
+}
+
+uintptr_t
+module_token_pre (cpp_reader *pfile, const cpp_token *tok, uintptr_t data_)
+{
+  if (!tok)  /* NULL TOK requests init/teardown.  */
+    return module_token_cdtor (pfile, data_);
+
+  int type = tok->type;
+  int keyword = RID_MAX;  /* Only read when TYPE is CPP_KEYWORD.  */
+  tree value = NULL_TREE;
+
+  if (tok->type == CPP_NAME)
+    {
+      value = HT_IDENT_TO_GCC_IDENT (HT_NODE (tok->val.node.node));
+      if (IDENTIFIER_KEYWORD_P (value))  /* Reclassify as a keyword.  */
+	{
+	  keyword = C_RID_CODE (value);
+	  type = CPP_KEYWORD;
+	}
+    }
+  else if (tok->type == CPP_HEADER_NAME)  /* Name text becomes VALUE.  */
+    value = build_string (tok->val.str.len, (const char *)tok->val.str.text);
+
+  return module_token_lang (type, keyword, value, tok->src_loc, data_);
+}
 
 /* Parse a #pragma whose sole argument is a string constant.
    If OPT is true, the argument is optional.  */
diff --git c/gcc/cp/cp-tree.h w/gcc/cp/cp-tree.h
index fdb8ee57f0b..e8e4d0af2d8 100644
--- c/gcc/cp/cp-tree.h
+++ w/gcc/cp/cp-tree.h
@@ -6742,6 +6940,10 @@  extern void set_identifier_kind			(tree, cp_identifier_kind);
 extern bool cxx_init				(void);
 extern void cxx_finish				(void);
 extern bool in_main_input_context		(void);
+extern uintptr_t module_token_pre (cpp_reader *, const cpp_token *, uintptr_t);
+extern uintptr_t module_token_cdtor (cpp_reader *, uintptr_t);
+extern uintptr_t module_token_lang (int type, int keyword, tree value,
+				    location_t, uintptr_t);
 
 /* in method.c */
 extern void init_method				(void);