[libgomp,WIP,GSoC'19] Modification to a single queue, single execution path.

Message ID df84a08af3a19bfc76c2768046fb5a53@cweb001.nm.nfra.io
State New
Headers show
Series
  • [libgomp,WIP,GSoC'19] Modification to a single queue, single execution path.
Related show

Commit Message

=?utf-8?B?6rmA6rec656Y?= July 21, 2019, 7:40 p.m.
Finished unifying the three queues to team->task_queue.
All the tests passed except some unsupported target tests.

Comments

Jakub Jelinek July 24, 2019, 7:15 a.m. | #1
Hi!

Thanks for the patch.

On Mon, Jul 22, 2019 at 04:40:21AM +0900, 김규래 wrote:

Can you please try to tweak your mailer settings?  In the text version of
the patch the mailer ate tab characters, so the patch can't apply, and in
the html version which we generally don't want to see on the mailing list,
one has to open it in some web browser, copy from there and replace \n\n
with \n to get something that can apply.

> @@ -447,7 +451,7 @@ struct gomp_task

>    /* Parent of this task.  */

>    struct gomp_task *parent;

>    /* Children of this task.  */

> -  struct priority_queue children_queue;

> +  /* struct priority_queue children_queue; */


Generally, we don't want to have code commented out like this in the final
patch submission.  For this WIP, I think it is acceptable, as I think in the
end you don't want to use the team's queue, but actually either
children_queue (renamed), but only use it on the implicit tasks, or during
team creation allocate together with the team structure also memory that
would be used as a trailing array for an array of the implicit queues next
to the array of implicit tasks.

>    /* The priority node for this task in each of the different queues.

>       We put this here to avoid allocating space for each priority

>       node.  Then we play offsetof() games to convert between pnode[]

>       entries and the gomp_task in which they reside.  */

> -  struct priority_node pnode[3];

> +  struct priority_node pnode;


If there is just one priority_node, the above comment can go,

> @@ -1211,7 +1218,7 @@ extern int gomp_test_nest_lock_25 (omp_nest_lock_25_t *) __GOMP_NOTHROW;

>  static inline size_t

>  priority_queue_offset (enum priority_queue_type type)

>  {

> -  return offsetof (struct gomp_task, pnode[(int) type]);

> +  return offsetof (struct gomp_task, pnode);


this whole routine as well and we just don't need offsetof anymore, nor
priority_queue_type anywhere, PQ_*, etc., just use the single pnode.

> @@ -182,8 +128,8 @@ gomp_task_handle_depend (struct gomp_task *task, struct gomp_task *parent,

>      }

>    else

>      {

> -      ndepend = (uintptr_t) depend[1]; /* total # */

> -      size_t nout = (uintptr_t) depend[2]; /* # of out: and inout: */

> +      ndepend = (uintptr_t) depend[1];      /* total # */

> +      size_t nout = (uintptr_t) depend[2];    /* # of out: and inout: */


The patch contains a lot of changes like the above one, a small portion
improves formatting, but most of them make it worse or are unnecessary.
The above one seems unnecessary (what is wrong is just space followed by
tab, that should never happen).

> @@ -235,8 +181,8 @@ gomp_task_handle_depend (struct gomp_task *task, struct gomp_task *parent,

>        task->depend[i].redundant = false;

>        task->depend[i].redundant_out = false;

>  

> -      hash_entry_type *slot = htab_find_slot (&parent->depend_hash,

> -       &task->depend[i], INSERT);

> +      hash_entry_type *slot

> + = htab_find_slot (&parent->depend_hash, &task->depend[i], INSERT);

>        hash_entry_type out = NULL, last = NULL;

>        if (*slot)


This change is correct formatting-wise, but the old one was correct too, so
there is no reason to change it like that.

>   {

> @@ -282,13 +228,12 @@ gomp_task_handle_depend (struct gomp_task *task, struct gomp_task *parent,

>   continue;

>         else if (tsk->dependers->n_elem == tsk->dependers->allocated)

>   {

> -   tsk->dependers->allocated

> -     = tsk->dependers->allocated * 2 + 2;

> +   tsk->dependers->allocated = tsk->dependers->allocated * 2 + 2;


This one is wrong, at the line is already too long after the change by 1
character.

>     tsk->dependers

>       = gomp_realloc (tsk->dependers,

>       sizeof (struct gomp_dependers_vec)

> -     + (tsk->dependers->allocated

> -        * sizeof (struct gomp_task *)));

> +       + (tsk->dependers->allocated

> + * sizeof (struct gomp_task *)));


The old formatting was good, the new one is not.
>   }

>         tsk->dependers->elem[tsk->dependers->n_elem++] = task;

>         task->num_dependees++;

> @@ -371,8 +316,7 @@ GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *),

>   {

>     if (thr->task->taskgroup->cancelled)

>       return;

> -   if (thr->task->taskgroup->workshare

> -       && thr->task->taskgroup->prev

> +   if (thr->task->taskgroup->workshare && thr->task->taskgroup->prev

>         && thr->task->taskgroup->prev->cancelled)


Again.  The coding conventions have a rule that if the whole condition
fits on one line, then it should be on one line, but if it doesn't, each
&& or || operand should go on a separate line (I know, there are several
spots where the code doesn't honor it, but this one wasn't the case).

>       return;

>   }

> @@ -383,8 +327,7 @@ GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *),

>    else if (priority > gomp_max_task_priority_var)

>      priority = gomp_max_task_priority_var;

>  

> -  if (!if_clause || team == NULL

> -      || (thr->task && thr->task->final_task)

> +  if (!if_clause || team == NULL || (thr->task && thr->task->final_task)

>        || team->task_count > 64 * team->nthreads)


Like here.  So if you wanted to change the formatting, the right change
would be to move || team == NULL on a separate line.

> @@ -429,12 +372,6 @@ GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *),

>   child thread, but seeing a stale non-NULL value is not a

>   problem.  Once past the task_lock acquisition, this thread

>   will see the real value of task.children.  */


The comment will need to go or be adjusted with this change eventually.

> @@ -449,8 +386,8 @@ GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *),

>        if (flags & GOMP_TASK_FLAG_DEPEND)

>   depend_size = ((uintptr_t) (depend[0] ? depend[0] : depend[1])

>          * sizeof (struct gomp_task_depend_entry));

> -      task = gomp_malloc (sizeof (*task) + depend_size

> -   + arg_size + arg_align - 1);

> +      task

> + = gomp_malloc (sizeof (*task) + depend_size + arg_size + arg_align - 1);


Too long again.

> -ialias (GOMP_taskgroup_start)

> -ialias (GOMP_taskgroup_end)

> -ialias (GOMP_taskgroup_reduction_register)

> +ialias (GOMP_taskgroup_start) ialias (GOMP_taskgroup_end)

> +  ialias (GOMP_taskgroup_reduction_register)


You don't want multiple ialiases on the same line, nor the indentation.

> @@ -563,10 +486,9 @@ ialias (GOMP_taskgroup_reduction_register)

>  #undef UTYPE

>  #undef GOMP_taskloop

>  

> -static void inline

> -priority_queue_move_task_first (enum priority_queue_type type,

> - struct priority_queue *head,

> - struct gomp_task *task)

> +    static void inline priority_queue_move_task_first (

> +      enum priority_queue_type type, struct priority_queue *head,

> +      struct gomp_task *task)


The coding conventions say the previous formatting was correct, the return
type etc. go on one line, but the function name starts a new line,
( at the end of line is highly undesirable.

>    task->kind = GOMP_TASK_WAITING;

> -  if (parent && parent->taskwait)

> +  if (parent)

>      {

> -      if (parent->taskwait->in_taskwait)

> +      if (parent->taskwait)


Why this change?  parent && parent->taskwait needs smaller indentation,
and I don't see you adding say else code for the parent->taskwait
case.

> @@ -674,10 +590,10 @@ static void gomp_task_run_post_handle_depend_hash (struct gomp_task *);

>  /* Called for nowait target tasks.  */

>  

>  bool

> -gomp_create_target_task (struct gomp_device_descr *devicep,

> - void (*fn) (void *), size_t mapnum, void **hostaddrs,

> - size_t *sizes, unsigned short *kinds,

> - unsigned int flags, void **depend, void **args,

> +gomp_create_target_task (struct gomp_device_descr *devicep, void (*fn) (void *),


Too long line.

> -      && --parent->taskwait->n_depend == 0

> -      && parent->taskwait->in_depend_wait)

> +      && --parent->taskwait->n_depend == 0 && parent->taskwait->in_depend_wait)

>      {

>        parent->taskwait->in_depend_wait = false;

>        gomp_sem_post (&parent->taskwait->taskwait_sem);

>      }


> + /*  Previously, the dependencies were upgraded their priorities.

> +     I'm not sure if not upgrading the depedencies will not lead

> +     to a possible deadlock in a single queue situation. */


For comment formatting, we do not want to start with two spaces after /*,
but want to have two spaces after full stop, even at the end, so
/* Previously ...
   I'm not ...
   ... situation.  */
>  

> - finish:

> +finish:


Labels are to be indented by one fewer columns compared to what follows,
so the old one was correct.
> @@ -2095,7 +1618,7 @@ gomp_reduction_register (uintptr_t *data, uintptr_t *old, uintptr_t *orig,

>        to hash also on the first sizeof (uintptr_t) bytes which contain

>        a pointer.  Hide the cast from the compiler.  */

>     hash_entry_type n;

> -   __asm ("" : "=g" (n) : "0" (p));

> +   __asm("" : "=g"(n) : "0"(p));


The spaces before ( are correct in all 3 spots.

> @@ -2192,14 +1715,13 @@ GOMP_taskgroup_reduction_unregister (uintptr_t *data)

>  }

>  ialias (GOMP_taskgroup_reduction_unregister)

>  

> -/* For i = 0 to cnt-1, remap ptrs[i] which is either address of the

> -   original list item or address of previously remapped original list

> -   item to address of the private copy, store that to ptrs[i].

> -   For i < cntorig, additionally set ptrs[cnt+i] to the address of

> -   the original list item.  */

> +  /* For i = 0 to cnt-1, remap ptrs[i] which is either address of the

> +     original list item or address of previously remapped original list

> +     item to address of the private copy, store that to ptrs[i].

> +     For i < cntorig, additionally set ptrs[cnt+i] to the address of

> +     the original list item.  */


Function comments should have /* at the start of the line, not indented
further.

>   gomp_fatal ("couldn't find matching task_reduction or reduction with "

> -     "task modifier for %p", ptrs[i]);

> +     "task modifier for %p",

> +     ptrs[i]);


No need to put the argument on a separate line.

As mentioned earlier, I think the dependency handling could be guarded by a
lock in the task whose parent_hash etc. is going to be used, for the
per-implicit task queues see above and it could again use a lock in the
implicit task whose queue will be used.

	Jakub
=?utf-8?B?6rmA6rec656Y?= July 24, 2019, 8:25 a.m. | #2
Hi Jakub, 
thanks for your detailed comments.
 
> Can you please try to tweak your mailer settings?  In the text version of
> the patch the mailer ate tab characters, so the patch can't apply, and in
> the html version which we generally don't want to see on the mailing list,
> one has to open it in some web browser, copy from there and replace \n\n
> with \n to get something that can apply.
 
I blindly ran clang-format before submission,
I'll try to comply with the formatting standards by hand next time.
And about the mailer, I think copy-pasting through a text editor ate all the tabs.
I'll try to do something about my mailer.
Sorry for the inconvenience.
 
> Generally, we don't want to have code commented out like this in the final
> patch submission.  For this WIP, I think it is acceptable, as I think in the
> end you don't want to use the team's queue, but actually either
> children_queue (renamed), but only use it on the implicit tasks, or during
 
Can you elaborate?
What do you mean by "children_queue (renamed)"?

> team creation allocate together with the team structure also memory that
> would be used as a trailing array for an array of the implicit queues next
> to the array of implicit tasks. 
 
Do you mean to make two trailing arrays in gomp_team?
Also, this is a personal question, why do gcc prefer trailing arrays over dynamically allocated pointers?
 
Thank you again for your detailed comments,
I'll do my best for the rest of GSoC
Ray Kim
Jakub Jelinek July 24, 2019, 9:15 p.m. | #3
On Wed, Jul 24, 2019 at 05:25:56PM +0900, 김규래 wrote:
> > Generally, we don't want to have code commented out like this in the final

> > patch submission.  For this WIP, I think it is acceptable, as I think in the

> > end you don't want to use the team's queue, but actually either

> > children_queue (renamed), but only use it on the implicit tasks, or during

>  

> Can you elaborate?

> What do you mean by "children_queue (renamed)"?


I meant use a queue structure member in the same location as current
children_queue, just rename it to something more sensible (just task_queue
etc.), because it will not be really a queue of task children anymore.

> > team creation allocate together with the team structure also memory that

> > would be used as a trailing array for an array of the implicit queues next

> > to the array of implicit tasks. 

>  

> Do you mean to make two trailing arrays in gomp_team?


Yes.  Of course, in C you can't have two flexible array members after each
other, and we certainly don't want to use the GNU C extension of variable
length structures, but it would be something like struct gomp_team have
a struct priority_queue *task_queues; where the struct gomp_team
initialization would set team->task_queues to (struct priority_queue *)
&team->implicit_task[team->nthreads].

> Also, this is a personal question, why do gcc prefer trailing arrays over dynamically allocated pointers?


Because malloc is fairly expensive and lots of benchmarks etc. care about
#pragma omp parallel latency (when the threads are already around, of course
the first one that needs to pthread_create is much more expensive).  So it
makes quite noticeable difference if you allocate one allocation or 3 or
more.

	Jakub

Patch

=== libgomp Summary ===
 
# of expected passes 6749
# of expected failures 4
# of unsupported tests 349
 
I also tried to make taskwait_end, taskgroup_end, maybe_wait_for_dependencies share the same execution routines,
The current state of that is pretty rough.
I think there are too many mutex lock and unlocks.
I haven't tested the performance of this patch I'll try that soon for the sake of curiosity.
I'll try to reduce the locked regions and then split the queues into a multiqueue.
 
2019-07-22  Khu-rai Kim  <msca8h@naver.com>
 
* libgomp/libgomp.h: Removed task->children_queue,
taskgroup->children_queue, added children counter for taskgroup. 
Added a new task kind, GOMP_DONE to track the lifecycle of 
dangling parents.
* libgomp/task.c: Unified all queue to team->task_queue. 
taskwait_end, taskgroup_end, maybe_wait_for_dependencies share 
the same task execution routine. Parents finished executing with
remaining children are kept until all their children are done 
executing.
* libgomp/taskloop.c: Unified all queue to team->task_queue.
 
diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index 9f433160ab5..3a615f1d9af 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -405,7 +405,11 @@  enum gomp_task_kind
      but not yet completed.  Once that completes, they will be readded
      into the queues as GOMP_TASK_WAITING in order to perform the var
      unmapping.  */
-  GOMP_TASK_ASYNC_RUNNING
+  GOMP_TASK_ASYNC_RUNNING,
+
+  /* The task is left only for dependency tracking purpose
+     and is ready to be freed anytime. */
+  GOMP_DONE
 };
 
 struct gomp_task_depend_entry
@@ -447,7 +451,7 @@  struct gomp_task
   /* Parent of this task.  */
   struct gomp_task *parent;
   /* Children of this task.  */
-  struct priority_queue children_queue;
+  /* struct priority_queue children_queue; */
   /* Taskgroup this task belongs in.  */
   struct gomp_taskgroup *taskgroup;
   /* Tasks that depend on this task.  */
@@ -461,13 +465,16 @@  struct gomp_task
      into the various queues to be scheduled.  */
   size_t num_dependees;
 
+  /* Number of childrens created and queued from this task. */
+  size_t num_children;
+
   /* Priority of this task.  */
   int priority;
   /* The priority node for this task in each of the different queues.
      We put this here to avoid allocating space for each priority
      node.  Then we play offsetof() games to convert between pnode[]
      entries and the gomp_task in which they reside.  */
-  struct priority_node pnode[3];
+  struct priority_node pnode;
 
   struct gomp_task_icv icv;
   void (*fn) (void *);
@@ -491,7 +498,7 @@  struct gomp_taskgroup
 {
   struct gomp_taskgroup *prev;
   /* Queue of tasks that belong in this taskgroup.  */
-  struct priority_queue taskgroup_queue;
+  /* struct priority_queue taskgroup_queue; */
   uintptr_t *reductions;
   bool in_taskgroup_wait;
   bool cancelled;
@@ -1211,7 +1218,7 @@  extern int gomp_test_nest_lock_25 (omp_nest_lock_25_t *) __GOMP_NOTHROW;
 static inline size_t
 priority_queue_offset (enum priority_queue_type type)
 {
-  return offsetof (struct gomp_task, pnode[(int) type]);
+  return offsetof (struct gomp_task, pnode);
 }
 
 /* Return the task associated with a priority NODE of type TYPE.  */
diff --git a/libgomp/task.c b/libgomp/task.c
index 15177ac8824..df822526c3f 100644
--- a/libgomp/task.c
+++ b/libgomp/task.c
@@ -81,11 +81,12 @@  gomp_init_task (struct gomp_task *task, struct gomp_task *parent_task,
   task->final_task = false;
   task->copy_ctors_done = false;
   task->parent_depends_on = false;
-  priority_queue_init (&task->children_queue);
+  // priority_queue_init (&task->children_queue);
   task->taskgroup = NULL;
   task->dependers = NULL;
   task->depend_hash = NULL;
   task->depend_count = 0;
+  task->num_children = 0;
 }
 
 /* Clean up a task, after completing it.  */
@@ -100,61 +101,6 @@  gomp_end_task (void)
   thr->task = task->parent;
 }
 
-/* Clear the parent field of every task in LIST.  */
-
-static inline void
-gomp_clear_parent_in_list (struct priority_list *list)
-{
-  struct priority_node *p = list->tasks;
-  if (p)
-    do
-      {
- priority_node_to_task (PQ_CHILDREN, p)->parent = NULL;
- p = p->next;
-      }
-    while (p != list->tasks);
-}
-
-/* Splay tree version of gomp_clear_parent_in_list.
-
-   Clear the parent field of every task in NODE within SP, and free
-   the node when done.  */
-
-static void
-gomp_clear_parent_in_tree (prio_splay_tree sp, prio_splay_tree_node node)
-{
-  if (!node)
-    return;
-  prio_splay_tree_node left = node->left, right = node->right;
-  gomp_clear_parent_in_list (&node->key.l);
-#if _LIBGOMP_CHECKING_
-  memset (node, 0xaf, sizeof (*node));
-#endif
-  /* No need to remove the node from the tree.  We're nuking
-     everything, so just free the nodes and our caller can clear the
-     entire splay tree.  */
-  free (node);
-  gomp_clear_parent_in_tree (sp, left);
-  gomp_clear_parent_in_tree (sp, right);
-}
-
-/* Clear the parent field of every task in Q and remove every task
-   from Q.  */
-
-static inline void
-gomp_clear_parent (struct priority_queue *q)
-{
-  if (priority_queue_multi_p (q))
-    {
-      gomp_clear_parent_in_tree (&q->t, q->t.root);
-      /* All the nodes have been cleared in gomp_clear_parent_in_tree.
- No need to remove anything.  We can just nuke everything.  */
-      q->t.root = NULL;
-    }
-  else
-    gomp_clear_parent_in_list (&q->l);
-}
-
 /* Helper function for GOMP_task and gomp_create_target_task.
 
    For a TASK with in/out dependencies, fill in the various dependency
@@ -182,8 +128,8 @@  gomp_task_handle_depend (struct gomp_task *task, struct gomp_task *parent,
     }
   else
     {
-      ndepend = (uintptr_t) depend[1]; /* total # */
-      size_t nout = (uintptr_t) depend[2]; /* # of out: and inout: */
+      ndepend = (uintptr_t) depend[1];      /* total # */
+      size_t nout = (uintptr_t) depend[2];    /* # of out: and inout: */
       size_t nmutexinoutset = (uintptr_t) depend[3]; /* # of mutexinoutset: */
       /* For now we treat mutexinoutset like out, which is compliant, but
  inefficient.  */
@@ -235,8 +181,8 @@  gomp_task_handle_depend (struct gomp_task *task, struct gomp_task *parent,
       task->depend[i].redundant = false;
       task->depend[i].redundant_out = false;
 
-      hash_entry_type *slot = htab_find_slot (&parent->depend_hash,
-       &task->depend[i], INSERT);
+      hash_entry_type *slot
+ = htab_find_slot (&parent->depend_hash, &task->depend[i], INSERT);
       hash_entry_type out = NULL, last = NULL;
       if (*slot)
  {
@@ -282,13 +228,12 @@  gomp_task_handle_depend (struct gomp_task *task, struct gomp_task *parent,
  continue;
        else if (tsk->dependers->n_elem == tsk->dependers->allocated)
  {
-   tsk->dependers->allocated
-     = tsk->dependers->allocated * 2 + 2;
+   tsk->dependers->allocated = tsk->dependers->allocated * 2 + 2;
    tsk->dependers
      = gomp_realloc (tsk->dependers,
      sizeof (struct gomp_dependers_vec)
-     + (tsk->dependers->allocated
-        * sizeof (struct gomp_task *)));
+       + (tsk->dependers->allocated
+ * sizeof (struct gomp_task *)));
  }
        tsk->dependers->elem[tsk->dependers->n_elem++] = task;
        task->num_dependees++;
@@ -371,8 +316,7 @@  GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *),
  {
    if (thr->task->taskgroup->cancelled)
      return;
-   if (thr->task->taskgroup->workshare
-       && thr->task->taskgroup->prev
+   if (thr->task->taskgroup->workshare && thr->task->taskgroup->prev
        && thr->task->taskgroup->prev->cancelled)
      return;
  }
@@ -383,8 +327,7 @@  GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *),
   else if (priority > gomp_max_task_priority_var)
     priority = gomp_max_task_priority_var;
 
-  if (!if_clause || team == NULL
-      || (thr->task && thr->task->final_task)
+  if (!if_clause || team == NULL || (thr->task && thr->task->final_task)
       || team->task_count > 64 * team->nthreads)
     {
       struct gomp_task task;
@@ -395,8 +338,8 @@  GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *),
  depend clauses for non-deferred tasks other than this, because
  the parent task is suspended until the child task finishes and thus
  it can't start further child tasks.  */
-      if ((flags & GOMP_TASK_FLAG_DEPEND)
-   && thr->task && thr->task->depend_hash)
+      if ((flags & GOMP_TASK_FLAG_DEPEND) && thr->task
+   && thr->task->depend_hash)
  gomp_task_maybe_wait_for_dependencies (depend);
 
       gomp_init_task (&task, thr->task, gomp_icv (false));
@@ -429,12 +372,6 @@  GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *),
  child thread, but seeing a stale non-NULL value is not a
  problem.  Once past the task_lock acquisition, this thread
  will see the real value of task.children.  */
-      if (!priority_queue_empty_p (&task.children_queue, MEMMODEL_RELAXED))
- {
-   gomp_mutex_lock (&team->task_lock);
-   gomp_clear_parent (&task.children_queue);
-   gomp_mutex_unlock (&team->task_lock);
- }
       gomp_end_task ();
     }
   else
@@ -449,8 +386,8 @@  GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *),
       if (flags & GOMP_TASK_FLAG_DEPEND)
  depend_size = ((uintptr_t) (depend[0] ? depend[0] : depend[1])
         * sizeof (struct gomp_task_depend_entry));
-      task = gomp_malloc (sizeof (*task) + depend_size
-   + arg_size + arg_align - 1);
+      task
+ = gomp_malloc (sizeof (*task) + depend_size + arg_size + arg_align - 1);
       arg = (char *) (((uintptr_t) (task + 1) + depend_size + arg_align - 1)
        & ~(uintptr_t) (arg_align - 1));
       gomp_init_task (task, parent, gomp_icv (false));
@@ -474,8 +411,7 @@  GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *),
       gomp_mutex_lock (&team->task_lock);
       /* If parallel or taskgroup has been cancelled, don't start new
  tasks.  */
-      if (__builtin_expect (gomp_cancel_var, 0)
-   && !task->copy_ctors_done)
+      if (__builtin_expect (gomp_cancel_var, 0) && !task->copy_ctors_done)
  {
    if (gomp_team_barrier_cancelled (&team->barrier))
      {
@@ -489,14 +425,17 @@  GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *),
      {
        if (taskgroup->cancelled)
  goto do_cancel;
-       if (taskgroup->workshare
-   && taskgroup->prev
+       if (taskgroup->workshare && taskgroup->prev
    && taskgroup->prev->cancelled)
  goto do_cancel;
      }
  }
+
       if (taskgroup)
- taskgroup->num_children++;
+ ++taskgroup->num_children;
+      ++parent->num_children;
+      ++team->task_count;
+
       if (depend_size)
  {
    gomp_task_handle_depend (task, parent, depend);
@@ -514,38 +453,22 @@  GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *),
      }
  }
 
-      priority_queue_insert (PQ_CHILDREN, &parent->children_queue,
-      task, priority,
-      PRIORITY_INSERT_BEGIN,
-      /*adjust_parent_depends_on=*/false,
-      task->parent_depends_on);
-      if (taskgroup)
- priority_queue_insert (PQ_TASKGROUP, &taskgroup->taskgroup_queue,
-        task, priority,
-        PRIORITY_INSERT_BEGIN,
-        /*adjust_parent_depends_on=*/false,
-        task->parent_depends_on);
-
-      priority_queue_insert (PQ_TEAM, &team->task_queue,
-      task, priority,
+      priority_queue_insert (PQ_TEAM, &team->task_queue, task, priority,
       PRIORITY_INSERT_END,
       /*adjust_parent_depends_on=*/false,
       task->parent_depends_on);
-
-      ++team->task_count;
       ++team->task_queued_count;
       gomp_team_barrier_set_task_pending (&team->barrier);
-      do_wake = team->task_running_count + !parent->in_tied_task
- < team->nthreads;
+      do_wake
+ = team->task_running_count + !parent->in_tied_task < team->nthreads;
       gomp_mutex_unlock (&team->task_lock);
       if (do_wake)
  gomp_team_barrier_wake (&team->barrier, 1);
     }
 }
 
-ialias (GOMP_taskgroup_start)
-ialias (GOMP_taskgroup_end)
-ialias (GOMP_taskgroup_reduction_register)
+ialias (GOMP_taskgroup_start) ialias (GOMP_taskgroup_end)
+  ialias (GOMP_taskgroup_reduction_register)
 
 #define TYPE long
 #define UTYPE unsigned long
@@ -563,10 +486,9 @@  ialias (GOMP_taskgroup_reduction_register)
 #undef UTYPE
 #undef GOMP_taskloop
 
-static void inline
-priority_queue_move_task_first (enum priority_queue_type type,
- struct priority_queue *head,
- struct gomp_task *task)
+    static void inline priority_queue_move_task_first (
+      enum priority_queue_type type, struct priority_queue *head,
+      struct gomp_task *task)
 {
 #if _LIBGOMP_CHECKING_
   if (!priority_queue_task_in_queue_p (type, head, task))
@@ -584,9 +506,8 @@  priority_queue_move_task_first (enum priority_queue_type type,
   else
     list = &head->l;
   priority_list_remove (list, task_to_priority_node (type, task), 0);
-  priority_list_insert (type, list, task, task->priority,
- PRIORITY_INSERT_BEGIN, type == PQ_CHILDREN,
- task->parent_depends_on);
+  priority_list_insert (type, list, task, task->priority, PRIORITY_INSERT_BEGIN,
+ type == PQ_CHILDREN, task->parent_depends_on);
 }
 
 /* Actual body of GOMP_PLUGIN_target_task_completion that is executed
@@ -598,45 +519,40 @@  static void
 gomp_target_task_completion (struct gomp_team *team, struct gomp_task *task)
 {
   struct gomp_task *parent = task->parent;
-  if (parent)
-    priority_queue_move_task_first (PQ_CHILDREN, &parent->children_queue,
-     task);
-
   struct gomp_taskgroup *taskgroup = task->taskgroup;
-  if (taskgroup)
-    priority_queue_move_task_first (PQ_TASKGROUP, &taskgroup->taskgroup_queue,
-     task);
-
   priority_queue_insert (PQ_TEAM, &team->task_queue, task, task->priority,
- PRIORITY_INSERT_BEGIN, false,
- task->parent_depends_on);
+ PRIORITY_INSERT_BEGIN, false, task->parent_depends_on);
   task->kind = GOMP_TASK_WAITING;
-  if (parent && parent->taskwait)
+  if (parent)
     {
-      if (parent->taskwait->in_taskwait)
+      if (parent->taskwait)
  {
-   /* One more task has had its dependencies met.
-      Inform any waiters.  */
-   parent->taskwait->in_taskwait = false;
-   gomp_sem_post (&parent->taskwait->taskwait_sem);
+   if (parent->taskwait->in_taskwait)
+     {
+       /* One more task has had its dependencies met.
+ Inform any waiters.  */
+       parent->taskwait->in_taskwait = false;
+       gomp_sem_post (&parent->taskwait->taskwait_sem);
+     }
+   else if (parent->taskwait->in_depend_wait)
+     {
+       /* One more task has had its dependencies met.
+ Inform any waiters.  */
+       parent->taskwait->in_depend_wait = false;
+       gomp_sem_post (&parent->taskwait->taskwait_sem);
+     }
  }
-      else if (parent->taskwait->in_depend_wait)
+    }
+  if (taskgroup)
+    {
+      if (taskgroup->in_taskgroup_wait)
  {
    /* One more task has had its dependencies met.
       Inform any waiters.  */
-   parent->taskwait->in_depend_wait = false;
-   gomp_sem_post (&parent->taskwait->taskwait_sem);
+   taskgroup->in_taskgroup_wait = false;
+   gomp_sem_post (&taskgroup->taskgroup_sem);
  }
     }
-  if (taskgroup && taskgroup->in_taskgroup_wait)
-    {
-      /* One more task has had its dependencies met.
- Inform any waiters.  */
-      taskgroup->in_taskgroup_wait = false;
-      gomp_sem_post (&taskgroup->taskgroup_sem);
-    }
-
-  ++team->task_queued_count;
   gomp_team_barrier_set_task_pending (&team->barrier);
   /* I'm afraid this can't be done after releasing team->task_lock,
      as gomp_target_task_completion is run from unrelated thread and
@@ -674,10 +590,10 @@  static void gomp_task_run_post_handle_depend_hash (struct gomp_task *);
 /* Called for nowait target tasks.  */
 
 bool
-gomp_create_target_task (struct gomp_device_descr *devicep,
- void (*fn) (void *), size_t mapnum, void **hostaddrs,
- size_t *sizes, unsigned short *kinds,
- unsigned int flags, void **depend, void **args,
+gomp_create_target_task (struct gomp_device_descr *devicep, void (*fn) (void *),
+ size_t mapnum, void **hostaddrs, size_t *sizes,
+ unsigned short *kinds, unsigned int flags,
+ void **depend, void **args,
  enum gomp_target_task_state state)
 {
   struct gomp_thread *thr = gomp_thread ();
@@ -692,8 +608,7 @@  gomp_create_target_task (struct gomp_device_descr *devicep,
  {
    if (thr->task->taskgroup->cancelled)
      return true;
-   if (thr->task->taskgroup->workshare
-       && thr->task->taskgroup->prev
+   if (thr->task->taskgroup->workshare && thr->task->taskgroup->prev
        && thr->task->taskgroup->prev->cancelled)
      return true;
  }
@@ -733,11 +648,10 @@  gomp_create_target_task (struct gomp_device_descr *devicep,
  tgt_size = 0;
     }
 
-  task = gomp_malloc (sizeof (*task) + depend_size
-       + sizeof (*ttask)
-       + mapnum * (sizeof (void *) + sizeof (size_t)
-   + sizeof (unsigned short))
-       + tgt_size);
+  task = gomp_malloc (
+    sizeof (*task) + depend_size + sizeof (*ttask)
+    + mapnum * (sizeof (void *) + sizeof (size_t) + sizeof (unsigned short))
+    + tgt_size);
   gomp_init_task (task, parent, gomp_icv (false));
   task->priority = 0;
   task->kind = GOMP_TASK_WAITING;
@@ -794,8 +708,7 @@  gomp_create_target_task (struct gomp_device_descr *devicep,
  {
    if (taskgroup->cancelled)
      goto do_cancel;
-   if (taskgroup->workshare
-       && taskgroup->prev
+   if (taskgroup->workshare && taskgroup->prev
        && taskgroup->prev->cancelled)
      goto do_cancel;
  }
@@ -806,7 +719,9 @@  gomp_create_target_task (struct gomp_device_descr *devicep,
       if (task->num_dependees)
  {
    if (taskgroup)
-     taskgroup->num_children++;
+     ++taskgroup->num_children;
+   ++parent->num_children;
+   ++team->task_count;
    gomp_mutex_unlock (&team->task_lock);
    return true;
  }
@@ -819,27 +734,19 @@  gomp_create_target_task (struct gomp_device_descr *devicep,
       free (task);
       return false;
     }
-  if (taskgroup)
-    taskgroup->num_children++;
+
   /* For async offloading, if we don't need to wait for dependencies,
      run the gomp_target_task_fn right away, essentially schedule the
      mapping part of the task in the current thread.  */
-  if (devicep != NULL
-      && (devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
+
+  if (taskgroup)
+    ++taskgroup->num_children;
+  ++parent->num_children;
+  ++team->task_count;
+
+  if (devicep != NULL && (devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
     {
-      priority_queue_insert (PQ_CHILDREN, &parent->children_queue, task, 0,
-      PRIORITY_INSERT_END,
-      /*adjust_parent_depends_on=*/false,
-      task->parent_depends_on);
-      if (taskgroup)
- priority_queue_insert (PQ_TASKGROUP, &taskgroup->taskgroup_queue,
-        task, 0, PRIORITY_INSERT_END,
-        /*adjust_parent_depends_on=*/false,
-        task->parent_depends_on);
-      task->pnode[PQ_TEAM].next = NULL;
-      task->pnode[PQ_TEAM].prev = NULL;
       task->kind = GOMP_TASK_TIED;
-      ++team->task_count;
       gomp_mutex_unlock (&team->task_lock);
 
       thr->task = task;
@@ -858,24 +765,16 @@  gomp_create_target_task (struct gomp_device_descr *devicep,
       gomp_mutex_unlock (&team->task_lock);
       return true;
     }
-  priority_queue_insert (PQ_CHILDREN, &parent->children_queue, task, 0,
- PRIORITY_INSERT_BEGIN,
- /*adjust_parent_depends_on=*/false,
- task->parent_depends_on);
-  if (taskgroup)
-    priority_queue_insert (PQ_TASKGROUP, &taskgroup->taskgroup_queue, task, 0,
-    PRIORITY_INSERT_BEGIN,
-    /*adjust_parent_depends_on=*/false,
-    task->parent_depends_on);
+
   priority_queue_insert (PQ_TEAM, &team->task_queue, task, 0,
  PRIORITY_INSERT_END,
  /*adjust_parent_depends_on=*/false,
  task->parent_depends_on);
-  ++team->task_count;
+
   ++team->task_queued_count;
+
   gomp_team_barrier_set_task_pending (&team->barrier);
-  do_wake = team->task_running_count + !parent->in_tied_task
-     < team->nthreads;
+  do_wake = team->task_running_count + !parent->in_tied_task < team->nthreads;
   gomp_mutex_unlock (&team->task_lock);
   if (do_wake)
     gomp_team_barrier_wake (&team->barrier, 1);
@@ -905,12 +804,10 @@  gomp_create_target_task (struct gomp_device_descr *devicep,
  V        V
  PD1 -> PD2 -> PD3 -> C1 -> C2 -> C3 -> C4.  */
 
-static void inline
-priority_list_upgrade_task (struct priority_list *list,
-     struct priority_node *node)
+static void inline priority_list_upgrade_task (struct priority_list *list,
+        struct priority_node *node)
 {
-  struct priority_node *last_parent_depends_on
-    = list->last_parent_depends_on;
+  struct priority_node *last_parent_depends_on = list->last_parent_depends_on;
   if (last_parent_depends_on)
     {
       node->prev->next = node->next;
@@ -933,166 +830,22 @@  priority_list_upgrade_task (struct priority_list *list,
   list->last_parent_depends_on = node;
 }
 
-/* Given a parent_depends_on TASK in its parent's children_queue, move
-   it to the front of its priority so it is run as soon as possible.
-
-   PARENT is passed as an optimization.
-
-   (This function could be defined in priority_queue.c, but we want it
-   inlined, and putting it in priority_queue.h is not an option, given
-   that gomp_task has not been properly defined at that point).  */
-
-static void inline
-priority_queue_upgrade_task (struct gomp_task *task,
-      struct gomp_task *parent)
-{
-  struct priority_queue *head = &parent->children_queue;
-  struct priority_node *node = &task->pnode[PQ_CHILDREN];
-#if _LIBGOMP_CHECKING_
-  if (!task->parent_depends_on)
-    gomp_fatal ("priority_queue_upgrade_task: task must be a "
- "parent_depends_on task");
-  if (!priority_queue_task_in_queue_p (PQ_CHILDREN, head, task))
-    gomp_fatal ("priority_queue_upgrade_task: cannot find task=%p", task);
-#endif
-  if (priority_queue_multi_p (head))
-    {
-      struct priority_list *list
- = priority_queue_lookup_priority (head, task->priority);
-      priority_list_upgrade_task (list, node);
-    }
-  else
-    priority_list_upgrade_task (&head->l, node);
-}
-
-/* Given a CHILD_TASK in LIST that is about to be executed, move it out of
-   the way in LIST so that other tasks can be considered for
-   execution.  LIST contains tasks of type TYPE.
-
-   Care is taken to update the queue's LAST_PARENT_DEPENDS_ON field
-   if applicable.  */
-
-static void inline
-priority_list_downgrade_task (enum priority_queue_type type,
-       struct priority_list *list,
-       struct gomp_task *child_task)
-{
-  struct priority_node *node = task_to_priority_node (type, child_task);
-  if (list->tasks == node)
-    list->tasks = node->next;
-  else if (node->next != list->tasks)
-    {
-      /* The task in NODE is about to become TIED and TIED tasks
- cannot come before WAITING tasks.  If we're about to
- leave the queue in such an indeterminate state, rewire
- things appropriately.  However, a TIED task at the end is
- perfectly fine.  */
-      struct gomp_task *next_task = priority_node_to_task (type, node->next);
-      if (next_task->kind == GOMP_TASK_WAITING)
- {
-   /* Remove from list.  */
-   node->prev->next = node->next;
-   node->next->prev = node->prev;
-   /* Rewire at the end.  */
-   node->next = list->tasks;
-   node->prev = list->tasks->prev;
-   list->tasks->prev->next = node;
-   list->tasks->prev = node;
- }
-    }
-
-  /* If the current task is the last_parent_depends_on for its
-     priority, adjust last_parent_depends_on appropriately.  */
-  if (__builtin_expect (child_task->parent_depends_on, 0)
-      && list->last_parent_depends_on == node)
-    {
-      struct gomp_task *prev_child = priority_node_to_task (type, node->prev);
-      if (node->prev != node
-   && prev_child->kind == GOMP_TASK_WAITING
-   && prev_child->parent_depends_on)
- list->last_parent_depends_on = node->prev;
-      else
- {
-   /* There are no more parent_depends_on entries waiting
-      to run, clear the list.  */
-   list->last_parent_depends_on = NULL;
- }
-    }
-}
-
-/* Given a TASK in HEAD that is about to be executed, move it out of
-   the way so that other tasks can be considered for execution.  HEAD
-   contains tasks of type TYPE.
-
-   Care is taken to update the queue's LAST_PARENT_DEPENDS_ON field
-   if applicable.
-
-   (This function could be defined in priority_queue.c, but we want it
-   inlined, and putting it in priority_queue.h is not an option, given
-   that gomp_task has not been properly defined at that point).  */
-
-static void inline
-priority_queue_downgrade_task (enum priority_queue_type type,
-        struct priority_queue *head,
-        struct gomp_task *task)
-{
-#if _LIBGOMP_CHECKING_
-  if (!priority_queue_task_in_queue_p (type, head, task))
-    gomp_fatal ("Attempt to downgrade missing task %p", task);
-#endif
-  if (priority_queue_multi_p (head))
-    {
-      struct priority_list *list
- = priority_queue_lookup_priority (head, task->priority);
-      priority_list_downgrade_task (type, list, task);
-    }
-  else
-    priority_list_downgrade_task (type, &head->l, task);
-}
-
-/* Setup CHILD_TASK to execute.  This is done by setting the task to
-   TIED, and updating all relevant queues so that CHILD_TASK is no
-   longer chosen for scheduling.  Also, remove CHILD_TASK from the
-   overall team task queue entirely.
-
-   Return TRUE if task or its containing taskgroup has been
-   cancelled.  */
-
 static inline bool
-gomp_task_run_pre (struct gomp_task *child_task, struct gomp_task *parent,
-    struct gomp_team *team)
+gomp_task_run_pre (struct gomp_task *task, struct gomp_team *team)
 {
 #if _LIBGOMP_CHECKING_
-  if (child_task->parent)
-    priority_queue_verify (PQ_CHILDREN,
-    &child_task->parent->children_queue, true);
-  if (child_task->taskgroup)
-    priority_queue_verify (PQ_TASKGROUP,
-    &child_task->taskgroup->taskgroup_queue, false);
   priority_queue_verify (PQ_TEAM, &team->task_queue, false);
 #endif
+  struct gomp_taskgroup *taskgroup = task->taskgroup;
 
-  /* Task is about to go tied, move it out of the way.  */
-  if (parent)
-    priority_queue_downgrade_task (PQ_CHILDREN, &parent->children_queue,
-    child_task);
-
-  /* Task is about to go tied, move it out of the way.  */
-  struct gomp_taskgroup *taskgroup = child_task->taskgroup;
-  if (taskgroup)
-    priority_queue_downgrade_task (PQ_TASKGROUP, &taskgroup->taskgroup_queue,
-    child_task);
-
-  priority_queue_remove (PQ_TEAM, &team->task_queue, child_task,
- MEMMODEL_RELAXED);
-  child_task->pnode[PQ_TEAM].next = NULL;
-  child_task->pnode[PQ_TEAM].prev = NULL;
-  child_task->kind = GOMP_TASK_TIED;
+  priority_queue_remove (PQ_TEAM, &team->task_queue, task, MEMMODEL_RELAXED);
+  task->pnode.next = NULL;
+  task->pnode.prev = NULL;
+  task->kind = GOMP_TASK_TIED;
 
   if (--team->task_queued_count == 0)
     gomp_team_barrier_clear_task_pending (&team->barrier);
-  if (__builtin_expect (gomp_cancel_var, 0)
-      && !child_task->copy_ctors_done)
+  if (__builtin_expect (gomp_cancel_var, 0) && !task->copy_ctors_done)
     {
       if (gomp_team_barrier_cancelled (&team->barrier))
  return true;
@@ -1100,8 +853,7 @@  gomp_task_run_pre (struct gomp_task *child_task, struct gomp_task *parent,
  {
    if (taskgroup->cancelled)
      return true;
-   if (taskgroup->workshare
-       && taskgroup->prev
+   if (taskgroup->workshare && taskgroup->prev
        && taskgroup->prev->cancelled)
      return true;
  }
@@ -1154,7 +906,6 @@  gomp_task_run_post_handle_dependers (struct gomp_task *child_task,
   for (i = 0; i < count; i++)
     {
       struct gomp_task *task = child_task->dependers->elem[i];
-
       /* CHILD_TASK satisfies a dependency for TASK.  Keep track of
  TASK's remaining dependencies.  Once TASK has no other
  depenencies, put it into the various queues so it will get
@@ -1162,14 +913,14 @@  gomp_task_run_post_handle_dependers (struct gomp_task *child_task,
       if (--task->num_dependees != 0)
  continue;
 
+      priority_queue_insert (PQ_TEAM, &team->task_queue, task, task->priority,
+      PRIORITY_INSERT_END,
+      /*adjust_parent_depends_on=*/false,
+      task->parent_depends_on);
+
       struct gomp_taskgroup *taskgroup = task->taskgroup;
       if (parent)
  {
-   priority_queue_insert (PQ_CHILDREN, &parent->children_queue,
- task, task->priority,
- PRIORITY_INSERT_BEGIN,
- /*adjust_parent_depends_on=*/true,
- task->parent_depends_on);
    if (parent->taskwait)
      {
        if (parent->taskwait->in_taskwait)
@@ -1190,11 +941,6 @@  gomp_task_run_post_handle_dependers (struct gomp_task *child_task,
  }
       if (taskgroup)
  {
-   priority_queue_insert (PQ_TASKGROUP, &taskgroup->taskgroup_queue,
- task, task->priority,
- PRIORITY_INSERT_BEGIN,
- /*adjust_parent_depends_on=*/false,
- task->parent_depends_on);
    if (taskgroup->in_taskgroup_wait)
      {
        /* One more task has had its dependencies met.
@@ -1203,12 +949,6 @@  gomp_task_run_post_handle_dependers (struct gomp_task *child_task,
        gomp_sem_post (&taskgroup->taskgroup_sem);
      }
  }
-      priority_queue_insert (PQ_TEAM, &team->task_queue,
-      task, task->priority,
-      PRIORITY_INSERT_END,
-      /*adjust_parent_depends_on=*/false,
-      task->parent_depends_on);
-      ++team->task_count;
       ++team->task_queued_count;
       ++ret;
     }
@@ -1250,22 +990,26 @@  gomp_task_run_post_remove_parent (struct gomp_task *child_task)
      synchronize with gomp_task_maybe_wait_for_dependencies so it can
      clean up and return.  */
   if (__builtin_expect (child_task->parent_depends_on, 0)
-      && --parent->taskwait->n_depend == 0
-      && parent->taskwait->in_depend_wait)
+      && --parent->taskwait->n_depend == 0 && parent->taskwait->in_depend_wait)
     {
       parent->taskwait->in_depend_wait = false;
       gomp_sem_post (&parent->taskwait->taskwait_sem);
     }
 
-  if (priority_queue_remove (PQ_CHILDREN, &parent->children_queue,
-      child_task, MEMMODEL_RELEASE)
-      && parent->taskwait && parent->taskwait->in_taskwait)
+  if (--parent->num_children == 0)
     {
-      parent->taskwait->in_taskwait = false;
-      gomp_sem_post (&parent->taskwait->taskwait_sem);
+      if (parent->kind == GOMP_DONE)
+ {
+   gomp_finish_task (parent);
+   free (parent);
+   parent = NULL;
+ }
+      else if (parent->taskwait && parent->taskwait->in_taskwait)
+ {
+   parent->taskwait->in_taskwait = false;
+   gomp_sem_post (&parent->taskwait->taskwait_sem);
+ }
     }
-  child_task->pnode[PQ_CHILDREN].next = NULL;
-  child_task->pnode[PQ_CHILDREN].prev = NULL;
 }
 
 /* Remove CHILD_TASK from its taskgroup.  */
@@ -1276,27 +1020,131 @@  gomp_task_run_post_remove_taskgroup (struct gomp_task *child_task)
   struct gomp_taskgroup *taskgroup = child_task->taskgroup;
   if (taskgroup == NULL)
     return;
-  bool empty = priority_queue_remove (PQ_TASKGROUP,
-       &taskgroup->taskgroup_queue,
-       child_task, MEMMODEL_RELAXED);
-  child_task->pnode[PQ_TASKGROUP].next = NULL;
-  child_task->pnode[PQ_TASKGROUP].prev = NULL;
-  if (taskgroup->num_children > 1)
-    --taskgroup->num_children;
-  else
+  if (--taskgroup->num_children == 0)
     {
       /* We access taskgroup->num_children in GOMP_taskgroup_end
  outside of the task lock mutex region, so
  need a release barrier here to ensure memory
  written by child_task->fn above is flushed
  before the NULL is written.  */
-      __atomic_store_n (&taskgroup->num_children, 0, MEMMODEL_RELEASE);
+      if (taskgroup->in_taskgroup_wait)
+ {
+   taskgroup->in_taskgroup_wait = false;
+   gomp_sem_post (&taskgroup->taskgroup_sem);
+ }
     }
-  if (empty && taskgroup->in_taskgroup_wait)
+}
+
+/* Executes all tasks until the team queue is empty.
+   The caller will check for a stop criterion and
+   called again if the criterion is not met.
+   true is returned if the routine was exited
+   becasue the team queue was empty.
+   *team, *thr are passed as an optimization */
+
+static inline bool
+gomp_execute_task (struct gomp_team *team, struct gomp_thread *thr,
+    struct gomp_task *task)
+{
+  bool cancelled = false;
+  bool ignored;
+  int do_wake = 0;
+  struct gomp_task *to_free = NULL;
+  struct gomp_task *next_task = NULL;
+
+#if _LIBGOMP_CHECKING_
+  if (priority_queue_empty_p (&team->task_queue, MEMMODEL_ACQUIRE)
+      && __atomic_load_n (&team->task_queued_count, MEMMODEL_ACQUIRE) != 0)
+    gomp_fatal ("Queue empty while queued task count is nonzero.");
+#endif
+
+  if (priority_queue_empty_p (&team->task_queue, MEMMODEL_RELAXED))
+    return false;
+
+  next_task = priority_queue_next_task (PQ_TEAM, &team->task_queue, PQ_IGNORED,
+ NULL, &ignored);
+
+  if (next_task->kind == GOMP_TASK_WAITING)
+    {
+      cancelled = gomp_task_run_pre (next_task, team);
+      if (__builtin_expect (cancelled, 0))
+ goto finish_cancelled;
+    }
+  else
     {
-      taskgroup->in_taskgroup_wait = false;
-      gomp_sem_post (&taskgroup->taskgroup_sem);
+      /* All tasks we are waiting for are either running in other
+ threads, or they are tasks that have not had their
+ dependencies met (so they're not even in the queue).  Wait
+ for them.  */
+      next_task = NULL;
+      return false;
     }
+  gomp_mutex_unlock (&team->task_lock);
+  if (do_wake)
+    {
+      gomp_team_barrier_wake (&team->barrier, do_wake);
+      do_wake = 0;
+    }
+  if (next_task)
+    {
+      thr->task = next_task;
+      if (__builtin_expect (next_task->fn == NULL, 0))
+ {
+   if (gomp_target_task_fn (next_task->fn_data))
+     {
+       thr->task = task;
+       gomp_mutex_lock (&team->task_lock);
+       next_task->kind = GOMP_TASK_ASYNC_RUNNING;
+       struct gomp_target_task *ttask
+ = (struct gomp_target_task *) next_task->fn_data;
+       /* If GOMP_PLUGIN_target_task_completion has run already
+ in between gomp_target_task_fn and the mutex lock,
+ perform the requeuing here.  */
+       if (ttask->state == GOMP_TARGET_TASK_FINISHED)
+ gomp_target_task_completion (team, next_task);
+       else
+ ttask->state = GOMP_TARGET_TASK_RUNNING;
+       next_task = NULL;
+       return true;
+     }
+ }
+      else
+ next_task->fn (next_task->fn_data);
+      thr->task = task;
+    }
+  else
+    {
+      gomp_mutex_lock (&team->task_lock);
+      return false;
+    }
+  gomp_mutex_lock (&team->task_lock);
+  if (next_task)
+    {
+    finish_cancelled:;
+      size_t new_tasks = gomp_task_run_post_handle_depend (next_task, team);
+
+      gomp_task_run_post_remove_parent (next_task);
+      gomp_task_run_post_remove_taskgroup (next_task);
+
+      to_free = next_task;
+      to_free->kind = GOMP_DONE;
+      next_task = NULL;
+      team->task_count--;
+      if (new_tasks > 1)
+ {
+   do_wake
+     = team->nthreads - team->task_running_count - !task->in_tied_task;
+   if (do_wake > new_tasks)
+     do_wake = new_tasks;
+ }
+    }
+  if (to_free && to_free->num_children == 0)
+    {
+      gomp_finish_task (to_free);
+      free (to_free);
+      to_free = NULL;
+    }
+  return true;
 }
 
 void
@@ -1322,21 +1170,18 @@  gomp_barrier_handle_tasks (gomp_barrier_state_t state)
       gomp_team_barrier_set_waiting_for_tasks (&team->barrier);
     }
 
-  while (1)
+  while (true)
     {
       bool cancelled = false;
       if (!priority_queue_empty_p (&team->task_queue, MEMMODEL_RELAXED))
  {
    bool ignored;
-   child_task
-     = priority_queue_next_task (PQ_TEAM, &team->task_queue,
- PQ_IGNORED, NULL,
- &ignored);
-   cancelled = gomp_task_run_pre (child_task, child_task->parent,
- team);
+   child_task = priority_queue_next_task (PQ_TEAM, &team->task_queue,
+ PQ_IGNORED, NULL, &ignored);
+   cancelled = gomp_task_run_pre (child_task, team);
    if (__builtin_expect (cancelled, 0))
      {
-       if (to_free)
+       if (to_free && to_free->num_children == 0)
  {
    gomp_finish_task (to_free);
    free (to_free);
@@ -1353,7 +1198,7 @@  gomp_barrier_handle_tasks (gomp_barrier_state_t state)
    gomp_team_barrier_wake (&team->barrier, do_wake);
    do_wake = 0;
  }
-      if (to_free)
+      if (to_free && to_free->num_children == 0)
  {
    gomp_finish_task (to_free);
    free (to_free);
@@ -1392,13 +1237,13 @@  gomp_barrier_handle_tasks (gomp_barrier_state_t state)
       gomp_mutex_lock (&team->task_lock);
       if (child_task)
  {
- finish_cancelled:;
+ finish_cancelled:;
    size_t new_tasks
      = gomp_task_run_post_handle_depend (child_task, team);
    gomp_task_run_post_remove_parent (child_task);
-   gomp_clear_parent (&child_task->children_queue);
    gomp_task_run_post_remove_taskgroup (child_task);
    to_free = child_task;
+   to_free->kind = GOMP_DONE;
    child_task = NULL;
    if (!cancelled)
      team->task_running_count--;
@@ -1430,10 +1275,7 @@  GOMP_taskwait (void)
   struct gomp_thread *thr = gomp_thread ();
   struct gomp_team *team = thr->ts.team;
   struct gomp_task *task = thr->task;
-  struct gomp_task *child_task = NULL;
-  struct gomp_task *to_free = NULL;
   struct gomp_taskwait taskwait;
-  int do_wake = 0;
 
   /* The acquire barrier on load of task->children here synchronizes
      with the write of a NULL in gomp_task_run_post_remove_parent.  It is
@@ -1442,54 +1284,24 @@  GOMP_taskwait (void)
      child thread task work function are seen before we exit from
      GOMP_taskwait.  */
   if (task == NULL
-      || priority_queue_empty_p (&task->children_queue, MEMMODEL_ACQUIRE))
+      || __atomic_load_n (&task->num_children, MEMMODEL_ACQUIRE) == 0)
     return;
 
   memset (&taskwait, 0, sizeof (taskwait));
-  bool child_q = false;
   gomp_mutex_lock (&team->task_lock);
   while (1)
     {
-      bool cancelled = false;
-      if (priority_queue_empty_p (&task->children_queue, MEMMODEL_RELAXED))
+      if (__atomic_load_n (&task->num_children, MEMMODEL_RELAXED) == 0)
  {
    bool destroy_taskwait = task->taskwait != NULL;
    task->taskwait = NULL;
    gomp_mutex_unlock (&team->task_lock);
-   if (to_free)
-     {
-       gomp_finish_task (to_free);
-       free (to_free);
-     }
    if (destroy_taskwait)
      gomp_sem_destroy (&taskwait.taskwait_sem);
    return;
  }
-      struct gomp_task *next_task
- = priority_queue_next_task (PQ_CHILDREN, &task->children_queue,
-     PQ_TEAM, &team->task_queue, &child_q);
-      if (next_task->kind == GOMP_TASK_WAITING)
+      if (!gomp_execute_task (team, thr, task))
  {
-   child_task = next_task;
-   cancelled
-     = gomp_task_run_pre (child_task, task, team);
-   if (__builtin_expect (cancelled, 0))
-     {
-       if (to_free)
- {
-   gomp_finish_task (to_free);
-   free (to_free);
-   to_free = NULL;
- }
-       goto finish_cancelled;
-     }
- }
-      else
- {
- /* All tasks we are waiting for are either running in other
-    threads, or they are tasks that have not had their
-    dependencies met (so they're not even in the queue).  Wait
-    for them.  */
    if (task->taskwait == NULL)
      {
        taskwait.in_depend_wait = false;
@@ -1497,77 +1309,9 @@  GOMP_taskwait (void)
        task->taskwait = &taskwait;
      }
    taskwait.in_taskwait = true;
- }
-      gomp_mutex_unlock (&team->task_lock);
-      if (do_wake)
- {
-   gomp_team_barrier_wake (&team->barrier, do_wake);
-   do_wake = 0;
- }
-      if (to_free)
- {
-   gomp_finish_task (to_free);
-   free (to_free);
-   to_free = NULL;
- }
-      if (child_task)
- {
-   thr->task = child_task;
-   if (__builtin_expect (child_task->fn == NULL, 0))
-     {
-       if (gomp_target_task_fn (child_task->fn_data))
- {
-   thr->task = task;
-   gomp_mutex_lock (&team->task_lock);
-   child_task->kind = GOMP_TASK_ASYNC_RUNNING;
-   struct gomp_target_task *ttask
-     = (struct gomp_target_task *) child_task->fn_data;
-   /* If GOMP_PLUGIN_target_task_completion has run already
-      in between gomp_target_task_fn and the mutex lock,
-      perform the requeuing here.  */
-   if (ttask->state == GOMP_TARGET_TASK_FINISHED)
-     gomp_target_task_completion (team, child_task);
-   else
-     ttask->state = GOMP_TARGET_TASK_RUNNING;
-   child_task = NULL;
-   continue;
- }
-     }
-   else
-     child_task->fn (child_task->fn_data);
-   thr->task = task;
- }
-      else
- gomp_sem_wait (&taskwait.taskwait_sem);
-      gomp_mutex_lock (&team->task_lock);
-      if (child_task)
- {
- finish_cancelled:;
-   size_t new_tasks
-     = gomp_task_run_post_handle_depend (child_task, team);
-
-   if (child_q)
-     {
-       priority_queue_remove (PQ_CHILDREN, &task->children_queue,
-      child_task, MEMMODEL_RELAXED);
-       child_task->pnode[PQ_CHILDREN].next = NULL;
-       child_task->pnode[PQ_CHILDREN].prev = NULL;
-     }
-
-   gomp_clear_parent (&child_task->children_queue);
-
-   gomp_task_run_post_remove_taskgroup (child_task);
-
-   to_free = child_task;
-   child_task = NULL;
-   team->task_count--;
-   if (new_tasks > 1)
-     {
-       do_wake = team->nthreads - team->task_running_count
- - !task->in_tied_task;
-       if (do_wake > new_tasks)
- do_wake = new_tasks;
-     }
+   gomp_mutex_unlock (&team->task_lock);
+   gomp_sem_wait (&taskwait.taskwait_sem);
+   gomp_mutex_lock (&team->task_lock);
  }
     }
 }
@@ -1590,8 +1334,7 @@  GOMP_taskwait_depend (void **depend)
  {
    if (thr->task->taskgroup->cancelled)
      return;
-   if (thr->task->taskgroup->workshare
-       && thr->task->taskgroup->prev
+   if (thr->task->taskgroup->workshare && thr->task->taskgroup->prev
        && thr->task->taskgroup->prev->cancelled)
      return;
  }
@@ -1629,9 +1372,6 @@  gomp_task_maybe_wait_for_dependencies (void **depend)
   size_t n = 2;
   size_t i;
   size_t num_awaited = 0;
-  struct gomp_task *child_task = NULL;
-  struct gomp_task *to_free = NULL;
-  int do_wake = 0;
 
   if (ndepend == 0)
     {
@@ -1674,11 +1414,9 @@  gomp_task_maybe_wait_for_dependencies (void **depend)
        {
  tsk->parent_depends_on = true;
  ++num_awaited;
- /* If depenency TSK itself has no dependencies and is
-    ready to run, move it up front so that we run it as
-    soon as possible.  */
- if (tsk->num_dependees == 0 && tsk->kind == GOMP_TASK_WAITING)
-   priority_queue_upgrade_task (tsk, task);
+ /*  Previously, the dependencies were upgraded their priorities.
+     I'm not sure if not upgrading the depedencies will not lead
+     to a possible deadlock in a single queue situation. */
        }
    }
     }
@@ -1689,133 +1427,26 @@  gomp_task_maybe_wait_for_dependencies (void **depend)
     }
 
   memset (&taskwait, 0, sizeof (taskwait));
-  taskwait.n_depend = num_awaited;
   gomp_sem_init (&taskwait.taskwait_sem, 0);
+  taskwait.n_depend = num_awaited;
   task->taskwait = &taskwait;
 
-  while (1)
+  while (true)
     {
-      bool cancelled = false;
       if (taskwait.n_depend == 0)
  {
    task->taskwait = NULL;
    gomp_mutex_unlock (&team->task_lock);
-   if (to_free)
-     {
-       gomp_finish_task (to_free);
-       free (to_free);
-     }
    gomp_sem_destroy (&taskwait.taskwait_sem);
    return;
  }
 
-      /* Theoretically when we have multiple priorities, we should
- chose between the highest priority item in
- task->children_queue and team->task_queue here, so we should
- use priority_queue_next_task().  However, since we are
- running an undeferred task, perhaps that makes all tasks it
- depends on undeferred, thus a priority of INF?  This would
- make it unnecessary to take anything into account here,
- but the dependencies.
-
- On the other hand, if we want to use priority_queue_next_task(),
- care should be taken to only use priority_queue_remove()
- below if the task was actually removed from the children
- queue.  */
-      bool ignored;
-      struct gomp_task *next_task
- = priority_queue_next_task (PQ_CHILDREN, &task->children_queue,
-     PQ_IGNORED, NULL, &ignored);
-
-      if (next_task->kind == GOMP_TASK_WAITING)
- {
-   child_task = next_task;
-   cancelled
-     = gomp_task_run_pre (child_task, task, team);
-   if (__builtin_expect (cancelled, 0))
-     {
-       if (to_free)
- {
-   gomp_finish_task (to_free);
-   free (to_free);
-   to_free = NULL;
- }
-       goto finish_cancelled;
-     }
- }
-      else
- /* All tasks we are waiting for are either running in other
-    threads, or they are tasks that have not had their
-    dependencies met (so they're not even in the queue).  Wait
-    for them.  */
- taskwait.in_depend_wait = true;
-      gomp_mutex_unlock (&team->task_lock);
-      if (do_wake)
- {
-   gomp_team_barrier_wake (&team->barrier, do_wake);
-   do_wake = 0;
- }
-      if (to_free)
- {
-   gomp_finish_task (to_free);
-   free (to_free);
-   to_free = NULL;
- }
-      if (child_task)
- {
-   thr->task = child_task;
-   if (__builtin_expect (child_task->fn == NULL, 0))
-     {
-       if (gomp_target_task_fn (child_task->fn_data))
- {
-   thr->task = task;
-   gomp_mutex_lock (&team->task_lock);
-   child_task->kind = GOMP_TASK_ASYNC_RUNNING;
-   struct gomp_target_task *ttask
-     = (struct gomp_target_task *) child_task->fn_data;
-   /* If GOMP_PLUGIN_target_task_completion has run already
-      in between gomp_target_task_fn and the mutex lock,
-      perform the requeuing here.  */
-   if (ttask->state == GOMP_TARGET_TASK_FINISHED)
-     gomp_target_task_completion (team, child_task);
-   else
-     ttask->state = GOMP_TARGET_TASK_RUNNING;
-   child_task = NULL;
-   continue;
- }
-     }
-   else
-     child_task->fn (child_task->fn_data);
-   thr->task = task;
- }
-      else
- gomp_sem_wait (&taskwait.taskwait_sem);
-      gomp_mutex_lock (&team->task_lock);
-      if (child_task)
+      if (!gomp_execute_task (team, thr, task))
  {
- finish_cancelled:;
-   size_t new_tasks
-     = gomp_task_run_post_handle_depend (child_task, team);
-   if (child_task->parent_depends_on)
-     --taskwait.n_depend;
-
-   priority_queue_remove (PQ_CHILDREN, &task->children_queue,
- child_task, MEMMODEL_RELAXED);
-   child_task->pnode[PQ_CHILDREN].next = NULL;
-   child_task->pnode[PQ_CHILDREN].prev = NULL;
-
-   gomp_clear_parent (&child_task->children_queue);
-   gomp_task_run_post_remove_taskgroup (child_task);
-   to_free = child_task;
-   child_task = NULL;
-   team->task_count--;
-   if (new_tasks > 1)
-     {
-       do_wake = team->nthreads - team->task_running_count
- - !task->in_tied_task;
-       if (do_wake > new_tasks)
- do_wake = new_tasks;
-     }
+   taskwait.in_depend_wait = true;
+   gomp_mutex_unlock (&team->task_lock);
+   gomp_sem_wait (&taskwait.taskwait_sem);
+   gomp_mutex_lock (&team->task_lock);
  }
     }
 }
@@ -1834,7 +1465,7 @@  gomp_taskgroup_init (struct gomp_taskgroup *prev)
   struct gomp_taskgroup *taskgroup
     = gomp_malloc (sizeof (struct gomp_taskgroup));
   taskgroup->prev = prev;
-  priority_queue_init (&taskgroup->taskgroup_queue);
+  // priority_queue_init (&taskgroup->taskgroup_queue);
   taskgroup->reductions = prev ? prev->reductions : NULL;
   taskgroup->in_taskgroup_wait = false;
   taskgroup->cancelled = false;
@@ -1867,15 +1498,11 @@  GOMP_taskgroup_end (void)
   struct gomp_team *team = thr->ts.team;
   struct gomp_task *task = thr->task;
   struct gomp_taskgroup *taskgroup;
-  struct gomp_task *child_task = NULL;
-  struct gomp_task *to_free = NULL;
-  int do_wake = 0;
 
   if (team == NULL)
     return;
   taskgroup = task->taskgroup;
-  if (__builtin_expect (taskgroup == NULL, 0)
-      && thr->ts.level == 0)
+  if (__builtin_expect (taskgroup == NULL, 0) && thr->ts.level == 0)
     {
       /* This can happen if GOMP_taskgroup_start is called when
  thr->ts.team == NULL, but inside of the taskgroup there
@@ -1895,134 +1522,30 @@  GOMP_taskgroup_end (void)
   if (__atomic_load_n (&taskgroup->num_children, MEMMODEL_ACQUIRE) == 0)
     goto finish;
 
-  bool unused;
   gomp_mutex_lock (&team->task_lock);
-  while (1)
+  while (true)
     {
-      bool cancelled = false;
-      if (priority_queue_empty_p (&taskgroup->taskgroup_queue,
-   MEMMODEL_RELAXED))
+      if (__atomic_load_n (&taskgroup->num_children, MEMMODEL_RELAXED) == 0)
  {
-   if (taskgroup->num_children)
-     {
-       if (priority_queue_empty_p (&task->children_queue,
-   MEMMODEL_RELAXED))
- goto do_wait;
-       child_task
- = priority_queue_next_task (PQ_CHILDREN, &task->children_queue,
-     PQ_TEAM, &team->task_queue,
-     &unused);
-     }
-   else
-     {
-       gomp_mutex_unlock (&team->task_lock);
-       if (to_free)
- {
-   gomp_finish_task (to_free);
-   free (to_free);
- }
-       goto finish;
-     }
- }
-      else
- child_task
-   = priority_queue_next_task (PQ_TASKGROUP, &taskgroup->taskgroup_queue,
-       PQ_TEAM, &team->task_queue, &unused);
-      if (child_task->kind == GOMP_TASK_WAITING)
- {
-   cancelled
-     = gomp_task_run_pre (child_task, child_task->parent, team);
-   if (__builtin_expect (cancelled, 0))
-     {
-       if (to_free)
- {
-   gomp_finish_task (to_free);
-   free (to_free);
-   to_free = NULL;
- }
-       goto finish_cancelled;
-     }
+   gomp_mutex_unlock (&team->task_lock);
+   goto finish;
  }
-      else
+      if (!gomp_execute_task (team, thr, task))
  {
-   child_task = NULL;
- do_wait:
- /* All tasks we are waiting for are either running in other
-    threads, or they are tasks that have not had their
-    dependencies met (so they're not even in the queue).  Wait
-    for them.  */
    taskgroup->in_taskgroup_wait = true;
- }
-      gomp_mutex_unlock (&team->task_lock);
-      if (do_wake)
- {
-   gomp_team_barrier_wake (&team->barrier, do_wake);
-   do_wake = 0;
- }
-      if (to_free)
- {
-   gomp_finish_task (to_free);
-   free (to_free);
-   to_free = NULL;
- }
-      if (child_task)
- {
-   thr->task = child_task;
-   if (__builtin_expect (child_task->fn == NULL, 0))
-     {
-       if (gomp_target_task_fn (child_task->fn_data))
- {
-   thr->task = task;
-   gomp_mutex_lock (&team->task_lock);
-   child_task->kind = GOMP_TASK_ASYNC_RUNNING;
-   struct gomp_target_task *ttask
-     = (struct gomp_target_task *) child_task->fn_data;
-   /* If GOMP_PLUGIN_target_task_completion has run already
-      in between gomp_target_task_fn and the mutex lock,
-      perform the requeuing here.  */
-   if (ttask->state == GOMP_TARGET_TASK_FINISHED)
-     gomp_target_task_completion (team, child_task);
-   else
-     ttask->state = GOMP_TARGET_TASK_RUNNING;
-   child_task = NULL;
-   continue;
- }
-     }
-   else
-     child_task->fn (child_task->fn_data);
-   thr->task = task;
- }
-      else
- gomp_sem_wait (&taskgroup->taskgroup_sem);
-      gomp_mutex_lock (&team->task_lock);
-      if (child_task)
- {
- finish_cancelled:;
-   size_t new_tasks
-     = gomp_task_run_post_handle_depend (child_task, team);
-   gomp_task_run_post_remove_parent (child_task);
-   gomp_clear_parent (&child_task->children_queue);
-   gomp_task_run_post_remove_taskgroup (child_task);
-   to_free = child_task;
-   child_task = NULL;
-   team->task_count--;
-   if (new_tasks > 1)
-     {
-       do_wake = team->nthreads - team->task_running_count
- - !task->in_tied_task;
-       if (do_wake > new_tasks)
- do_wake = new_tasks;
-     }
+   gomp_mutex_unlock (&team->task_lock);
+   gomp_sem_wait (&taskgroup->taskgroup_sem);
+   gomp_mutex_lock (&team->task_lock);
  }
     }
 
- finish:
+finish:
   task->taskgroup = taskgroup->prev;
   gomp_sem_destroy (&taskgroup->taskgroup_sem);
   free (taskgroup);
 }
 
-static inline __attribute__((always_inline)) void
+static inline __attribute__ ((always_inline)) void
 gomp_reduction_register (uintptr_t *data, uintptr_t *old, uintptr_t *orig,
  unsigned nthreads)
 {
@@ -2095,7 +1618,7 @@  gomp_reduction_register (uintptr_t *data, uintptr_t *old, uintptr_t *orig,
       to hash also on the first sizeof (uintptr_t) bytes which contain
       a pointer.  Hide the cast from the compiler.  */
    hash_entry_type n;
-   __asm ("" : "=g" (n) : "0" (p));
+   __asm("" : "=g"(n) : "0"(p));
    *htab_find_slot (&new_htab, n, INSERT) = n;
  }
       if (d[4] == (uintptr_t) old)
@@ -2192,14 +1715,13 @@  GOMP_taskgroup_reduction_unregister (uintptr_t *data)
 }
 ialias (GOMP_taskgroup_reduction_unregister)
 
-/* For i = 0 to cnt-1, remap ptrs[i] which is either address of the
-   original list item or address of previously remapped original list
-   item to address of the private copy, store that to ptrs[i].
-   For i < cntorig, additionally set ptrs[cnt+i] to the address of
-   the original list item.  */
+  /* For i = 0 to cnt-1, remap ptrs[i] which is either address of the
+     original list item or address of previously remapped original list
+     item to address of the private copy, store that to ptrs[i].
+     For i < cntorig, additionally set ptrs[cnt+i] to the address of
+     the original list item.  */
 
-void
-GOMP_task_reduction_remap (size_t cnt, size_t cntorig, void **ptrs)
+  void GOMP_task_reduction_remap (size_t cnt, size_t cntorig, void **ptrs)
 {
   struct gomp_thread *thr = gomp_thread ();
   struct gomp_task *task = thr->task;
@@ -2211,12 +1733,12 @@  GOMP_task_reduction_remap (size_t cnt, size_t cntorig, void **ptrs)
   for (i = 0; i < cnt; ++i)
     {
       hash_entry_type ent, n;
-      __asm ("" : "=g" (ent) : "0" (ptrs + i));
+      __asm("" : "=g"(ent) : "0"(ptrs + i));
       n = htab_find (reduction_htab, ent);
       if (n)
  {
    uintptr_t *p;
-   __asm ("" : "=g" (p) : "0" (n));
+   __asm("" : "=g"(p) : "0"(n));
    /* At this point, p[0] should be equal to (uintptr_t) ptrs[i],
       p[1] is the offset within the allocated chunk for each
       thread, p[2] is the array registered with
@@ -2238,7 +1760,8 @@  GOMP_task_reduction_remap (size_t cnt, size_t cntorig, void **ptrs)
  }
       if (d == NULL)
  gomp_fatal ("couldn't find matching task_reduction or reduction with "
-     "task modifier for %p", ptrs[i]);
+     "task modifier for %p",
+     ptrs[i]);
       uintptr_t off = ((uintptr_t) ptrs[i] - d[2]) % d[1];
       ptrs[i] = (void *) (d[2] + id * d[1] + off);
       if (__builtin_expect (i < cntorig, 0))
@@ -2259,7 +1782,8 @@  GOMP_task_reduction_remap (size_t cnt, size_t cntorig, void **ptrs)
      }
    if (lo > hi)
      gomp_fatal ("couldn't find matching task_reduction or reduction "
- "with task modifier for %p", ptrs[i]);
+ "with task modifier for %p",
+ ptrs[i]);
  }
     }
 }
diff --git a/libgomp/taskloop.c b/libgomp/taskloop.c
index 5d3f810a8f2..4943ca06b2e 100644
--- a/libgomp/taskloop.c
+++ b/libgomp/taskloop.c
@@ -216,13 +216,13 @@  GOMP_taskloop (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *),
  task_step -= step;
        fn (arg);
        arg += arg_size;
-       if (!priority_queue_empty_p (&task[i].children_queue,
-    MEMMODEL_RELAXED))
- {
-   gomp_mutex_lock (&team->task_lock);
-   gomp_clear_parent (&task[i].children_queue);
-   gomp_mutex_unlock (&team->task_lock);
- }
+       /* if (!priority_queue_empty_p (&task[i].children_queue, */
+       /*    MEMMODEL_RELAXED)) */
+       /* { */
+       /*   gomp_mutex_lock (&team->task_lock); */
+       /*   gomp_clear_parent (&task[i].children_queue); */
+       /*   gomp_mutex_unlock (&team->task_lock); */
+       /* } */
        gomp_end_task ();
      }
  }
@@ -248,13 +248,6 @@  GOMP_taskloop (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *),
      if (i == nfirst)
        task_step -= step;
      fn (data);
-     if (!priority_queue_empty_p (&task.children_queue,
- MEMMODEL_RELAXED))
-       {
- gomp_mutex_lock (&team->task_lock);
- gomp_clear_parent (&task.children_queue);
- gomp_mutex_unlock (&team->task_lock);
-       }
      gomp_end_task ();
    }
     }
@@ -329,24 +322,15 @@  GOMP_taskloop (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *),
  }
       if (taskgroup)
  taskgroup->num_children += num_tasks;
+      parent->num_children += num_tasks;
+      team->task_count += num_tasks;
       for (i = 0; i < num_tasks; i++)
  {
    struct gomp_task *task = tasks[i];
-   priority_queue_insert (PQ_CHILDREN, &parent->children_queue,
- task, priority,
- PRIORITY_INSERT_BEGIN,
- /*last_parent_depends_on=*/false,
- task->parent_depends_on);
-   if (taskgroup)
-     priority_queue_insert (PQ_TASKGROUP, &taskgroup->taskgroup_queue,
-    task, priority, PRIORITY_INSERT_BEGIN,
-    /*last_parent_depends_on=*/false,
-    task->parent_depends_on);
    priority_queue_insert (PQ_TEAM, &team->task_queue, task, priority,
  PRIORITY_INSERT_END,
  /*last_parent_depends_on=*/false,
  task->parent_depends_on);
-   ++team->task_count;
    ++team->task_queued_count;
  }
       gomp_team_barrier_set_task_pending (&team->barrier);