[v2] posix: Optimize Linux posix_spawn

Message ID 20190628180529.2850-1-adhemerval.zanella@linaro.org
State New
Headers show
Series
  • [v2] posix: Optimize Linux posix_spawn
Related show

Commit Message

Adhemerval Zanella June 28, 2019, 6:05 p.m.
Since the last discusion was stalled, I am resending with a
small modification.  For default case, it is a fixed code path with
fixed stack usage in helper process, so assuming a large enough stack
buffer  it would never overflow.  It also does not prevent to adapt to
the vfork-like interface Florian has suggested, once it is implemented.

For the compat case I added the usual hardening of the guard-page,
so it incur in even less stack issues.

Changes from previous version:

  * Move the logic of stack mapping creation to stackmap.h and
    added a guard page allocation for the compatibility case.

--

The current internal posix_spawn symbol for Linux (__spawni) requires
to allocate a dynamic stack based on input arguments to handle the
SPAWN_XFLAGS_USE_PATH internal flag, which re-issue the input binary
as a shell script if execve call return ENOEXEC (to execute shell
scripts with an initial shebang).

This is done only for compatibility mode and the generic case does not
require the extra calculation plus the potential large mmap/munmap
call.  For default case, a pre-defined buffer is sufficed to use on the
clone call instead.

This patch optimizes Linux spawni by allocating a dynamic stack only
for compatibility symbol (SPAWN_XFLAGS_USE_PATH).  For generic case,
a stack allocated buffer is used instead along with a guard page,
similar to what NPTL uses for thread stacks.

Checked x86_64-linux-gnu and i686-linux-gnu.  I also ran some spawn
tests, including tst-spawn4-compat, on both ia64 and hppa to check
for regressions.

	* sysdeps/unix/sysv/linux/spawni.c (posix_spawn_args): Remove
	argc member.
	(maybe_script_execute): Remove function.
	(execve_compat, __spawni_clone, __spawnix_compat): New function.
	(__spawni_child): Remove maybe_script_execute call.
	(__spawnix): Remove magic stack slack constant with stack_slack
	identifier.
	(__spawni): Only allocates a variable stack when
	SPAWN_XFLAGS_TRY_SHELL is used.
	* posix/stackmap.h: New file.
	* sysdeps/ia64/nptl/pthreaddef.h (NEED_SEPARATE_REGISTER_STACK): Move
	to ...
	* sysdeps/ia64/stackinfo.h: ... here.
---
 posix/stackmap.h                 | 115 +++++++++++++++++
 sysdeps/ia64/nptl/pthreaddef.h   |   3 -
 sysdeps/ia64/stackinfo.h         |   3 +
 sysdeps/unix/sysv/linux/spawni.c | 208 ++++++++++++++++++-------------
 4 files changed, 237 insertions(+), 92 deletions(-)
 create mode 100644 posix/stackmap.h

-- 
2.17.1

Patch

diff --git a/posix/stackmap.h b/posix/stackmap.h
new file mode 100644
index 0000000000..ea8808d364
--- /dev/null
+++ b/posix/stackmap.h
@@ -0,0 +1,115 @@ 
+/* Functions to create stack mappings for helper processes.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _STACKMAP_H
+#define _STACKMAP_H
+
+#include <unistd.h>
+#include <sys/mman.h>
+#include <ldsodefs.h>
+#include <stdbool.h>
+
+static inline int
+stack_prot (void)
+{
+  return (PROT_READ | PROT_WRITE
+	  | ((GL(dl_stack_flags) & PF_X) ? PROT_EXEC : 0));
+}
+
+static inline size_t
+stack_guard_size (void)
+{
+ return GLRO (dl_pagesize);
+}
+
+/* Return a aligning mask based on system pagesize.  */
+static inline size_t
+stack_pagesize_m1_mask (void)
+{
+  size_t pagesize_m1 = __getpagesize () - 1;
+  return ~pagesize_m1;
+}
+
+/* Return the guard page position on memory segment MEM with total size SIZE
+   and with a guard page of size GUARDIZE.  */
+static inline void *
+stack_guard_position (void *mem, size_t size, size_t guardsize)
+{
+#ifdef NEED_SEPARATE_REGISTER_STACK
+  return mem + (((size - guardsize) / 2) & stack_pagesize_m1_mask ());
+#elif _STACK_GROWS_DOWN
+  return mem;
+#elif _STACK_GROWS_UP
+  return (void *) (((uintptr_t)(mem + size)- guardsize)
+		   & stack_pagesize_m1_mask ());
+#endif
+}
+
+/* Setup the expected stack memory protection value (based on stack_prot)
+   for the memory segment MEM with size SIZE based on the guard page
+   GUARD with size GUARDSIZE.  The memory segment is expected to be allocated
+   with PROT_NOTE.  */
+static inline bool
+stack_setup_prot (char *mem, size_t size, char *guard, size_t guardsize)
+{
+  const int prot = stack_prot ();
+
+  char *guardend = guard + guardsize;
+#if _STACK_GROWS_DOWN && !defined(NEED_SEPARATE_REGISTER_STACK)
+  /* As defined at guard_position, for architectures with downward stack
+     the guard page is always at start of the allocated area.  */
+  if (__mprotect (guardend, size - guardsize, prot) != 0)
+    return false;
+#else
+  size_t mprots1 = (uintptr_t) guard - (uintptr_t) mem;
+  if (__mprotect (mem, mprots1, prot) != 0)
+    return false;
+  size_t mprots2 = ((uintptr_t) mem + size) - (uintptr_t) guardend;
+  if (__mprotect (guardend, mprots2, prot) != 0)
+    return false;
+#endif
+  return true;
+}
+
+/* Allocated a memory segment with size SIZE plus GUARSIZE with mmap and
+   setup the expected protection for both a guard page and the stack
+   itself.  */
+static inline void *
+stack_allocate (size_t size, size_t guardsize)
+{
+  const int prot = stack_prot ();
+  
+  /* If a guard page is required, avoid committing memory by first
+     allocate with PROT_NONE and then reserve with required permission
+     excluding the guard page.  */
+  void *mem = __mmap (NULL, size, (guardsize == 0) ? prot : PROT_NONE,
+		      MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
+  if (guardsize)
+    {
+      void *guard = stack_guard_position (mem, size, guardsize);
+      if (!stack_setup_prot (mem, size, guard, guardsize))
+	{
+	  __munmap (mem, size);
+	  return MAP_FAILED;
+	}
+    }
+
+  return mem;
+}
+
+#endif /* _STACKMAP_H  */
diff --git a/sysdeps/ia64/nptl/pthreaddef.h b/sysdeps/ia64/nptl/pthreaddef.h
index bf52d5af62..11579f11b4 100644
--- a/sysdeps/ia64/nptl/pthreaddef.h
+++ b/sysdeps/ia64/nptl/pthreaddef.h
@@ -18,9 +18,6 @@ 
 /* Default stack size.  */
 #define ARCH_STACK_DEFAULT_SIZE	(32 * 1024 * 1024)
 
-/* IA-64 uses a normal stack and a register stack.  */
-#define NEED_SEPARATE_REGISTER_STACK
-
 /* Required stack pointer alignment at beginning.  */
 #define STACK_ALIGN		16
 
diff --git a/sysdeps/ia64/stackinfo.h b/sysdeps/ia64/stackinfo.h
index 6433a89945..d942426fcf 100644
--- a/sysdeps/ia64/stackinfo.h
+++ b/sysdeps/ia64/stackinfo.h
@@ -30,4 +30,7 @@ 
 /* Default to a non-executable stack.  */
 #define DEFAULT_STACK_PERMS (PF_R|PF_W)
 
+/* IA-64 uses a normal stack and a register stack.  */
+#define NEED_SEPARATE_REGISTER_STACK
+
 #endif	/* stackinfo.h */
diff --git a/sysdeps/unix/sysv/linux/spawni.c b/sysdeps/unix/sysv/linux/spawni.c
index c1abf3f960..34063c81ba 100644
--- a/sysdeps/unix/sysv/linux/spawni.c
+++ b/sysdeps/unix/sysv/linux/spawni.c
@@ -30,7 +30,7 @@ 
 #include <nptl/pthreadP.h>
 #include <dl-sysdep.h>
 #include <libc-pointer-arith.h>
-#include <ldsodefs.h>
+#include <stackmap.h>
 #include "spawn_int.h"
 
 /* The Linux implementation of posix_spawn{p} uses the clone syscall directly
@@ -74,6 +74,11 @@ 
 # define STACK(__stack, __stack_size) (__stack + __stack_size)
 #endif
 
+/* Additional stack size added on required space to execute the clone child
+   function (__spawni_child).  */
+enum {
+  stack_slack = 512
+};
 
 struct posix_spawn_args
 {
@@ -83,35 +88,39 @@  struct posix_spawn_args
   const posix_spawn_file_actions_t *fa;
   const posix_spawnattr_t *restrict attr;
   char *const *argv;
-  ptrdiff_t argc;
   char *const *envp;
   int xflags;
   int err;
 };
 
-/* Older version requires that shell script without shebang definition
-   to be called explicitly using /bin/sh (_PATH_BSHELL).  */
-static void
-maybe_script_execute (struct posix_spawn_args *args)
+/* This is compatibility function required to enable posix_spawn run
+   script without shebang definition for older posix_spawn versions
+   (2.15).  */
+static int
+execve_compat (const char *filename, char *const argv[], char *const envp[])
 {
-  if (SHLIB_COMPAT (libc, GLIBC_2_2, GLIBC_2_15)
-      && (args->xflags & SPAWN_XFLAGS_TRY_SHELL) && errno == ENOEXEC)
+  __execve (filename, argv, envp);
+
+  if (errno == ENOEXEC)
     {
-      char *const *argv = args->argv;
-      ptrdiff_t argc = args->argc;
+      char *const *cargv = argv;
+      ptrdiff_t argc = 0;
+      while (cargv[argc++] != NULL);
 
       /* Construct an argument list for the shell.  */
       char *new_argv[argc + 2];
       new_argv[0] = (char *) _PATH_BSHELL;
-      new_argv[1] = (char *) args->file;
+      new_argv[1] = (char *) filename;
       if (argc > 1)
 	memcpy (new_argv + 2, argv + 1, argc * sizeof (char *));
       else
 	new_argv[2] = NULL;
 
       /* Execute the shell.  */
-      args->exec (new_argv[0], new_argv, args->envp);
+      __execve (new_argv[0], new_argv, envp);
     }
+
+  return -1;
 }
 
 /* Function used in the clone call to setup the signals mask, posix_spawn
@@ -291,11 +300,6 @@  __spawni_child (void *arguments)
 
   args->exec (args->file, args->argv, args->envp);
 
-  /* This is compatibility function required to enable posix_spawn run
-     script without shebang definition for older posix_spawn versions
-     (2.15).  */
-  maybe_script_execute (args);
-
 fail:
   /* errno should have an appropriate non-zero value; otherwise,
      there's a bug in glibc or the kernel.  For lack of an error code
@@ -306,70 +310,12 @@  fail:
   _exit (SPAWN_ERROR);
 }
 
-/* Spawn a new process executing PATH with the attributes describes in *ATTRP.
-   Before running the process perform the actions described in FILE-ACTIONS. */
 static int
-__spawnix (pid_t * pid, const char *file,
-	   const posix_spawn_file_actions_t * file_actions,
-	   const posix_spawnattr_t * attrp, char *const argv[],
-	   char *const envp[], int xflags,
-	   int (*exec) (const char *, char *const *, char *const *))
+__spawni_clone (struct posix_spawn_args *args, void *stack, size_t stack_size,
+		pid_t *pid)
 {
-  pid_t new_pid;
-  struct posix_spawn_args args;
   int ec;
-
-  /* To avoid imposing hard limits on posix_spawn{p} the total number of
-     arguments is first calculated to allocate a mmap to hold all possible
-     values.  */
-  ptrdiff_t argc = 0;
-  /* Linux allows at most max (0x7FFFFFFF, 1/4 stack size) arguments
-     to be used in a execve call.  We limit to INT_MAX minus one due the
-     compatiblity code that may execute a shell script (maybe_script_execute)
-     where it will construct another argument list with an additional
-     argument.  */
-  ptrdiff_t limit = INT_MAX - 1;
-  while (argv[argc++] != NULL)
-    if (argc == limit)
-      {
-	errno = E2BIG;
-	return errno;
-      }
-
-  int prot = (PROT_READ | PROT_WRITE
-	     | ((GL (dl_stack_flags) & PF_X) ? PROT_EXEC : 0));
-
-  /* Add a slack area for child's stack.  */
-  size_t argv_size = (argc * sizeof (void *)) + 512;
-  /* We need at least a few pages in case the compiler's stack checking is
-     enabled.  In some configs, it is known to use at least 24KiB.  We use
-     32KiB to be "safe" from anything the compiler might do.  Besides, the
-     extra pages won't actually be allocated unless they get used.  */
-  argv_size += (32 * 1024);
-  size_t stack_size = ALIGN_UP (argv_size, GLRO(dl_pagesize));
-  void *stack = __mmap (NULL, stack_size, prot,
-			MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
-  if (__glibc_unlikely (stack == MAP_FAILED))
-    return errno;
-
-  /* Disable asynchronous cancellation.  */
-  int state;
-  __libc_ptf_call (__pthread_setcancelstate,
-                   (PTHREAD_CANCEL_DISABLE, &state), 0);
-
-  /* Child must set args.err to something non-negative - we rely on
-     the parent and child sharing VM.  */
-  args.err = 0;
-  args.file = file;
-  args.exec = exec;
-  args.fa = file_actions;
-  args.attr = attrp ? attrp : &(const posix_spawnattr_t) { 0 };
-  args.argv = argv;
-  args.argc = argc;
-  args.envp = envp;
-  args.xflags = xflags;
-
-  __libc_signal_block_all (&args.oldmask);
+  pid_t new_pid;
 
   /* The clone flags used will create a new child that will run in the same
      memory space (CLONE_VM) and the execution of calling thread will be
@@ -380,7 +326,7 @@  __spawnix (pid_t * pid, const char *file,
      namespace, there will be no concurrent access for TLS variables (errno
      for instance).  */
   new_pid = CLONE (__spawni_child, STACK (stack, stack_size), stack_size,
-		   CLONE_VM | CLONE_VFORK | SIGCHLD, &args);
+		   CLONE_VM | CLONE_VFORK | SIGCHLD, args);
 
   /* It needs to collect the case where the auxiliary process was created
      but failed to execute the file (due either any preparation step or
@@ -393,7 +339,7 @@  __spawnix (pid_t * pid, const char *file,
 	 only in case of failure, so in case of premature termination
 	 due a signal args.err will remain zeroed and it will be up to
 	 caller to actually collect it.  */
-      ec = args.err;
+      ec = args->err;
       if (ec > 0)
 	/* There still an unlikely case where the child is cancelled after
 	   setting args.err, due to a positive error value.  Also there is
@@ -406,14 +352,56 @@  __spawnix (pid_t * pid, const char *file,
   else
     ec = -new_pid;
 
-  __munmap (stack, stack_size);
-
   if ((ec == 0) && (pid != NULL))
     *pid = new_pid;
 
-  __libc_signal_restore_set (&args.oldmask);
+  return ec;
+}
 
-  __libc_ptf_call (__pthread_setcancelstate, (state, NULL), 0);
+/* Allocates a stack using mmap to call clone.  The stack size is based on
+   number of arguments since it would be used on compat mode which may call
+   execvpe/execve_compat.  */
+static int
+__spawnix_compat (struct posix_spawn_args *args, pid_t *pid)
+{
+  char *const *argv = args->argv;
+
+  /* To avoid imposing hard limits on posix_spawn{p} the total number of
+     arguments is first calculated to allocate a mmap to hold all possible
+     values.  */
+  ptrdiff_t argc = 0;
+  /* Linux allows at most max (0x7FFFFFFF, 1/4 stack size) arguments
+     to be used in a execve call.  We limit to INT_MAX minus one due the
+     compatiblity code that may execute a shell script (maybe_script_execute)
+     where it will construct another argument list with an additional
+     argument.  */
+  ptrdiff_t limit = INT_MAX - 1;
+  while (argv[argc++] != NULL)
+    if (argc == limit)
+      {
+	errno = E2BIG;
+	return errno;
+      }
+
+  /* Add a slack area for child's stack.  */
+  size_t argv_size = (argc * sizeof (void *)) + stack_slack;
+
+  /* We need at least a few pages in case the compiler's stack checking is
+     enabled.  In some configs, it is known to use at least 24KiB.  We use
+     32KiB to be "safe" from anything the compiler might do.  Besides, the
+     extra pages won't actually be allocated unless they get used.  */
+  argv_size += (32 * 1024);
+
+  /* Allocate a stack with an extra guard page.  */
+  size_t stack_size = ALIGN_UP (argv_size, __getpagesize ());
+  size_t guard_size = stack_guard_size ();
+  void *stack = stack_allocate (stack_size + guard_size, guard_size);
+  if (__glibc_unlikely (stack == MAP_FAILED))
+    return errno;
+
+  int ec = __spawni_clone (args, stack, stack_size, pid);
+
+  __munmap (stack, stack_size);
 
   return ec;
 }
@@ -422,12 +410,54 @@  __spawnix (pid_t * pid, const char *file,
    Before running the process perform the actions described in FILE-ACTIONS. */
 int
 __spawni (pid_t * pid, const char *file,
-	  const posix_spawn_file_actions_t * acts,
+	  const posix_spawn_file_actions_t * file_actions,
 	  const posix_spawnattr_t * attrp, char *const argv[],
 	  char *const envp[], int xflags)
 {
-  /* It uses __execvpex to avoid run ENOEXEC in non compatibility mode (it
-     will be handled by maybe_script_execute).  */
-  return __spawnix (pid, file, acts, attrp, argv, envp, xflags,
-		    xflags & SPAWN_XFLAGS_USE_PATH ? __execvpex :__execve);
+  /* For SPAWN_XFLAGS_TRY_SHELL we need to execute a script even without
+     a shebang.  To accomplish it we pass as callback to __spawni_child
+     __execvpe (which call maybe_script_execute for such case) or
+     execve_compat (which mimics the semantic using execve).  */
+  int (*exec) (const char *, char *const *, char *const *) =
+    xflags & SPAWN_XFLAGS_TRY_SHELL
+    ? xflags & SPAWN_XFLAGS_USE_PATH ? __execvpe  : execve_compat
+    : xflags & SPAWN_XFLAGS_USE_PATH ? __execvpex : __execve;
+
+  /* Child must set args.err to something non-negative - we rely on
+     the parent and child sharing VM.  */
+  struct posix_spawn_args args = {
+    .err = 0,
+    .file = file,
+    .exec = exec,
+    .fa = file_actions,
+    .attr = attrp ? attrp : &(const posix_spawnattr_t) { 0 },
+    .argv = argv,
+    .envp = envp,
+    .xflags = xflags
+  };
+
+  /* Disable asynchronous cancellation.  */
+  int state;
+  __libc_ptf_call (__pthread_setcancelstate,
+                   (PTHREAD_CANCEL_DISABLE, &state), 0);
+
+  __libc_signal_block_all (&args.oldmask);
+
+  int ec;
+
+  if (__glibc_likely ((xflags & SPAWN_XFLAGS_TRY_SHELL) == 0))
+    {
+      /* execvpe allocates at least (NAME_MAX + 1) + PATH_MAX to create the
+	 combination of PATH entry and program name.  */
+      char stack[(NAME_MAX + 1) + PATH_MAX + stack_slack];
+      ec = __spawni_clone (&args, stack, sizeof stack, pid);
+    }
+  else
+    ec = __spawnix_compat (&args, pid);
+
+  __libc_signal_restore_set (&args.oldmask);
+
+  __libc_ptf_call (__pthread_setcancelstate, (state, NULL), 0);
+
+  return ec;
 }