[libfortran] Fix race condition in asynchronous I/O

Message ID 63ab72d7-5629-6d41-41e5-7f00b2b0f196@netcologne.de
State New
Headers show
Series
  • [libfortran] Fix race condition in asynchronous I/O
Related show

Commit Message

Thomas Koenig Feb. 17, 2020, 10:02 p.m.
Hello world,

as you can see from the PR, async_io_4.f90 exhibited a race condition
every few thousands to tens of thousands of cases.

This was difficult to track down, and took some thinking, discussion
with Nicolas and enabling of the debugging feature of the async I/O,
which finally produced a failure which could then be analyzed.

The solution was to remove the mutexes for the different conditions that
can happen in the async library, relying on the lock that protects the
async I/O queue instead.

Tested by Bill Seurer (thanks!) with a previous version of the patch,
and with valgrind --tool=helgrind and valgrind --tool=drd, both of
which showed no (new) failures for all of the async_io_*.f90 tests.

So, OK for trunk and gcc-9?

Regards

	Thomas

2020-02-17  Thomas Koenig  <tkoenig@gcc.gnu.org>

	PR fortran/93599
	* io/async.c (destroy_adv_cond): Do not destroy lock.
	(async_io): Make sure au->lock is locked for finishing of thread.
	Do not lock/unlock around signalling emptysignal. Unlock au->lock
	before return.
	(init_adv_cond): Do not initialize lock.
	(enqueue_transfer): Unlock after signal.
	(enqueue_done_id): Likewise.
	(enqueue_done): Likewise.
	(enqueue_close): Likewise.
	(enqueue_data_transfer): Likewise.
	(async_wait_id): Do not lock/unlock around signalling au->work.
	(async_wait): Unlock after signal.
	* io/async.h (SIGNAL): Add comment about needed au->lock.
	Remove locking/unlocking of advcond->lock.
	(WAIT_SIGNAL_MUTEX): Add comment. Remove locking/unlocking of
	advcond->lock.  Unlock mutex only at the end.  Loop on
	__ghread_cond_wait returning zero.
	(REVOKE_SIGNAL): Add comment. Remove locking/unlocking of
	advcond->lock.
	(struct adv_cond): Remove mutex from struct.

Comments

Thomas Koenig Feb. 18, 2020, 8:02 p.m. | #1
Now committed to trunk after Jerry's OK, as
https://gcc.gnu.org/g:3fe1910509e32d611b3a7b8503502103bc53b5e4

I have to say I still have trouble establishing a git workflow
which works smoothly.  You will notice an "asdf" in the commit
message which was not really meant to be there...

Regards

	Thomas

Patch

diff --git a/libgfortran/io/async.c b/libgfortran/io/async.c
index ab214af8a66..63b9158c0ba 100644
--- a/libgfortran/io/async.c
+++ b/libgfortran/io/async.c
@@ -80,7 +80,6 @@  update_pdt (st_parameter_dt **old, st_parameter_dt *new) {
 static void
 destroy_adv_cond (struct adv_cond *ac)
 {
-  T_ERROR (__gthread_mutex_destroy, &ac->lock);
   T_ERROR (__gthread_cond_destroy, &ac->signal);
 }
 
@@ -156,6 +155,7 @@  async_io (void *arg)
 
 		case AIO_CLOSE:
 		  NOTE ("Received AIO_CLOSE");
+		  LOCK (&au->lock);
 		  goto finish_thread;
 
 		default:
@@ -175,7 +175,6 @@  async_io (void *arg)
 	      else if (ctq->type == AIO_CLOSE)
 		{
 		  NOTE ("Received AIO_CLOSE during error condition");
-		  UNLOCK (&au->lock);
 		  goto finish_thread;
 		}
 	    }
@@ -189,9 +188,7 @@  async_io (void *arg)
       au->tail = NULL;
       au->head = NULL;
       au->empty = 1;
-      UNLOCK (&au->lock);
       SIGNAL (&au->emptysignal);
-      LOCK (&au->lock);
     }
  finish_thread:
   au->tail = NULL;
@@ -199,6 +196,7 @@  async_io (void *arg)
   au->empty = 1;
   SIGNAL (&au->emptysignal);
   free (ctq);
+  UNLOCK (&au->lock);
   return NULL;
 }
 
@@ -223,7 +221,6 @@  static void
 init_adv_cond (struct adv_cond *ac)
 {
   ac->pending = 0;
-  __GTHREAD_MUTEX_INIT_FUNCTION (&ac->lock);
   __GTHREAD_COND_INIT_FUNCTION (&ac->signal);
 }
 
@@ -279,8 +276,8 @@  enqueue_transfer (async_unit *au, transfer_args *arg, enum aio_do type)
   au->tail = tq;
   REVOKE_SIGNAL (&(au->emptysignal));
   au->empty = false;
-  UNLOCK (&au->lock);
   SIGNAL (&au->work);
+  UNLOCK (&au->lock);
 }
 
 /* Enqueue an st_write_done or st_read_done which contains an ID.  */
@@ -303,8 +300,8 @@  enqueue_done_id (async_unit *au, enum aio_do type)
   au->empty = false;
   ret = au->id.high++;
   NOTE ("Enqueue id: %d", ret);
-  UNLOCK (&au->lock);
   SIGNAL (&au->work);
+  UNLOCK (&au->lock);
   return ret;
 }
 
@@ -324,8 +321,8 @@  enqueue_done (async_unit *au, enum aio_do type)
   au->tail = tq;
   REVOKE_SIGNAL (&(au->emptysignal));
   au->empty = false;
-  UNLOCK (&au->lock);
   SIGNAL (&au->work);
+  UNLOCK (&au->lock);
 }
 
 /* Enqueue a CLOSE statement.  */
@@ -344,8 +341,8 @@  enqueue_close (async_unit *au)
   au->tail = tq;
   REVOKE_SIGNAL (&(au->emptysignal));
   au->empty = false;
-  UNLOCK (&au->lock);
   SIGNAL (&au->work);
+  UNLOCK (&au->lock);
 }
 
 /* The asynchronous unit keeps the currently active PDT around.
@@ -374,9 +371,9 @@  enqueue_data_transfer_init (async_unit *au, st_parameter_dt *dt, int read_flag)
     au->tail->next = tq;
   au->tail = tq;
   REVOKE_SIGNAL (&(au->emptysignal));
-  au->empty = 0;
-  UNLOCK (&au->lock);
+  au->empty = false;
   SIGNAL (&au->work);
+  UNLOCK (&au->lock);
 }
 
 /* Collect the errors that may have happened asynchronously.  Return true if
@@ -430,9 +427,7 @@  async_wait_id (st_parameter_common *cmp, async_unit *au, int i)
   NOTE ("Waiting for id %d", i);
   if (au->id.waiting < i)
     au->id.waiting = i;
-  UNLOCK (&au->lock);
   SIGNAL (&(au->work));
-  LOCK (&au->lock);
   WAIT_SIGNAL_MUTEX (&(au->id.done),
 		     (au->id.low >= au->id.waiting || au->empty), &au->lock);
   LOCK (&au->lock);
@@ -454,8 +449,8 @@  async_wait (st_parameter_common *cmp, async_unit *au)
   if (cmp == NULL)
     cmp = au->error.cmp;
 
-  SIGNAL (&(au->work));
   LOCK (&(au->lock));
+  SIGNAL (&(au->work));
 
   if (au->empty)
     {
diff --git a/libgfortran/io/async.h b/libgfortran/io/async.h
index c6b2e0f94bd..17d303c127b 100644
--- a/libgfortran/io/async.h
+++ b/libgfortran/io/async.h
@@ -229,44 +229,44 @@ 
 
 #if ASYNC_IO
 
+/* au->lock has to be held when calling this macro.  */
+
 #define SIGNAL(advcond) do{						\
-    INTERN_LOCK (&(advcond)->lock);					\
     (advcond)->pending = 1;						\
     DEBUG_PRINTF ("%s%-75s %20s():%-5d %18p\n", aio_prefix, DEBUG_ORANGE "SIGNAL: " DEBUG_NORM \
 		 #advcond, __FUNCTION__, __LINE__, (void *) advcond);	\
     T_ERROR (__gthread_cond_broadcast, &(advcond)->signal);			\
-    INTERN_UNLOCK (&(advcond)->lock);					\
   } while (0)
 
+/* Has to be entered with mutex locked.  */
+
 #define WAIT_SIGNAL_MUTEX(advcond, condition, mutex) do{		\
     __label__ finish;		       					\
-    INTERN_LOCK (&((advcond)->lock));					\
     DEBUG_PRINTF ("%s%-75s %20s():%-5d %18p\n", aio_prefix, DEBUG_BLUE "WAITING: " DEBUG_NORM \
 		 #advcond, __FUNCTION__, __LINE__, (void *) advcond);	\
-    if ((advcond)->pending || (condition)){				\
-      UNLOCK (mutex);							\
+    if ((advcond)->pending || (condition))				\
       goto finish;							\
-    }									\
-    UNLOCK (mutex);							\
-     while (!__gthread_cond_wait(&(advcond)->signal, &(advcond)->lock)) {	\
-       { int cond;							\
-	 LOCK (mutex); cond = condition; UNLOCK (mutex);	\
-	   if (cond){							\
-	     DEBUG_PRINTF ("%s%-75s %20s():%-5d %18p\n", aio_prefix, DEBUG_ORANGE "REC: " DEBUG_NORM \
+    while (1)								\
+      {									\
+	int err_ret = __gthread_cond_wait(&(advcond)->signal, mutex);	\
+	if (err_ret) internal_error (NULL, "WAIT_SIGNAL_MUTEX failed");	\
+	if (condition)							\
+	  {								\
+	    DEBUG_PRINTF ("%s%-75s %20s():%-5d %18p\n", aio_prefix, DEBUG_ORANGE \
+			  "REC: " DEBUG_NORM				\
 			  #advcond,  __FUNCTION__, __LINE__, (void *)advcond); \
 	    break;				      			\
 	  }								\
       }									\
-    }									\
   finish:								\
     (advcond)->pending = 0;						\
-		 INTERN_UNLOCK (&((advcond)->lock));			\
+    UNLOCK (mutex);							\
   } while (0)
 
+/* au->lock has to be held when calling this macro.  */
+
 #define REVOKE_SIGNAL(advcond) do{		\
-    INTERN_LOCK (&(advcond)->lock);		\
     (advcond)->pending = 0;			\
-    INTERN_UNLOCK (&(advcond)->lock);		\
   } while (0)
 
 #else
@@ -330,7 +330,6 @@  struct adv_cond
 {
 #if ASYNC_IO
   int pending;
-  __gthread_mutex_t lock;
   __gthread_cond_t signal;
 #endif
 };