2019-05-13  Chung-Lin Tang  <cltang@codesourcery.com>

	Reviewed-by: Thomas Schwinge <thomas@codesourcery.com>

	libgomp/
	* libgomp-plugin.h (struct goacc_asyncqueue): Declare.
	(struct goacc_asyncqueue_list): Likewise.
	(goacc_aq): Likewise.
	(goacc_aq_list): Likewise.
	(GOMP_OFFLOAD_openacc_register_async_cleanup): Remove.
	(GOMP_OFFLOAD_openacc_async_test): Remove.
	(GOMP_OFFLOAD_openacc_async_test_all): Remove.
	(GOMP_OFFLOAD_openacc_async_wait): Remove.
	(GOMP_OFFLOAD_openacc_async_wait_async): Remove.
	(GOMP_OFFLOAD_openacc_async_wait_all): Remove.
	(GOMP_OFFLOAD_openacc_async_wait_all_async): Remove.
	(GOMP_OFFLOAD_openacc_async_set_async): Remove.
	(GOMP_OFFLOAD_openacc_exec): Adjust declaration.
	(GOMP_OFFLOAD_openacc_cuda_get_stream): Likewise.
	(GOMP_OFFLOAD_openacc_cuda_set_stream): Likewise.
	(GOMP_OFFLOAD_openacc_async_exec): Declare.
	(GOMP_OFFLOAD_openacc_async_construct): Declare.
	(GOMP_OFFLOAD_openacc_async_destruct): Declare.
	(GOMP_OFFLOAD_openacc_async_test): Declare.
	(GOMP_OFFLOAD_openacc_async_synchronize): Declare.
	(GOMP_OFFLOAD_openacc_async_serialize): Declare.
	(GOMP_OFFLOAD_openacc_async_queue_callback): Declare.
	(GOMP_OFFLOAD_openacc_async_host2dev): Declare.
	(GOMP_OFFLOAD_openacc_async_dev2host): Declare.

	* libgomp.h (struct acc_dispatch_t): Define 'async' sub-struct.
	(gomp_acc_insert_pointer): Adjust declaration.
	(gomp_copy_host2dev): New declaration.
	(gomp_copy_dev2host): Likewise.
	(gomp_map_vars_async): Likewise.
	(gomp_unmap_tgt): Likewise.
	(gomp_unmap_vars_async): Likewise.
	(gomp_fini_device): Likewise.

	* oacc-async.c (get_goacc_thread): New function.
	(get_goacc_thread_device): New function.
	(lookup_goacc_asyncqueue): New function.
	(get_goacc_asyncqueue): New function.
	(acc_async_test): Adjust code to use new async design.
	(acc_async_test_all): Likewise.
	(acc_wait): Likewise.
	(acc_wait_async): Likewise.
	(acc_wait_all): Likewise.
	(acc_wait_all_async): Likewise.
	(goacc_async_free): New function.
	(goacc_init_asyncqueues): Likewise.
	(goacc_fini_asyncqueues): Likewise.
	* oacc-cuda.c (acc_get_cuda_stream): Adjust code to use new async
	design.
	(acc_set_cuda_stream): Likewise.
	* oacc-host.c (host_openacc_exec): Adjust parameters, remove 'async'.
	(host_openacc_register_async_cleanup): Remove.
	(host_openacc_async_exec): New function.
	(host_openacc_async_test): Adjust parameters.
	(host_openacc_async_test_all): Remove.
	(host_openacc_async_wait): Remove.
	(host_openacc_async_wait_async): Remove.
	(host_openacc_async_wait_all): Remove.
	(host_openacc_async_wait_all_async): Remove.
	(host_openacc_async_set_async): Remove.
	(host_openacc_async_synchronize): New function.
	(host_openacc_async_serialize): New function.
	(host_openacc_async_host2dev): New function.
	(host_openacc_async_dev2host): New function.
	(host_openacc_async_queue_callback): New function.
	(host_openacc_async_construct): New function.
	(host_openacc_async_destruct): New function.
	(struct gomp_device_descr host_dispatch): Remove initialization of old
	interface, add initialization of new async sub-struct.
	* oacc-init.c (acc_shutdown_1): Adjust to use gomp_fini_device.
	(goacc_attach_host_thread_to_device): Remove old async code usage.
	* oacc-int.h (goacc_init_asyncqueues): New declaration.
	(goacc_fini_asyncqueues): Likewise.
	(goacc_async_copyout_unmap_vars): Likewise.
	(goacc_async_free): Likewise.
	(get_goacc_asyncqueue): Likewise.
	(lookup_goacc_asyncqueue): Likewise.

	* oacc-mem.c (memcpy_tofrom_device): Adjust code to use new async
	design.
	(present_create_copy): Adjust code to use new async design.
	(delete_copyout): Likewise.
	(update_dev_host): Likewise.
	(gomp_acc_insert_pointer): Add async parameter, adjust code to use new
	async design.
	(gomp_acc_remove_pointer): Adjust code to use new async design.
	* oacc-parallel.c (GOACC_parallel_keyed): Adjust code to use new async
	design.
	(GOACC_enter_exit_data): Likewise.
	(goacc_wait): Likewise.
	(GOACC_update): Likewise.
	* oacc-plugin.c (GOMP_PLUGIN_async_unmap_vars): Change to fail an
	assertion when called; note in comment that it is obsolete.

	* target.c (goacc_device_copy_async): New function.
	(gomp_copy_host2dev): Remove 'static', add goacc_asyncqueue parameter,
	add goacc_device_copy_async case.
	(gomp_copy_dev2host): Likewise.
	(gomp_map_vars_existing): Add goacc_asyncqueue parameter, adjust code.
	(gomp_map_pointer): Likewise.
	(gomp_map_fields_existing): Likewise.
	(gomp_map_vars_internal): New always_inline function, renamed from
	gomp_map_vars.
	(gomp_map_vars): Implement by calling gomp_map_vars_internal.
	(gomp_map_vars_async): Implement by calling gomp_map_vars_internal,
	passing goacc_asyncqueue argument.
	(gomp_unmap_tgt): Remove 'static', add attribute_hidden.
	(gomp_unref_tgt): New function.
	(gomp_unmap_vars_internal): New always_inline function, renamed from
	gomp_unmap_vars.
	(gomp_unmap_vars): Implement by calling gomp_unmap_vars_internal.
	(gomp_unmap_vars_async): Implement by calling
	gomp_unmap_vars_internal, passing goacc_asyncqueue argument.
	(gomp_fini_device): New function.
	(gomp_exit_data): Adjust gomp_copy_dev2host call.
	(gomp_load_plugin_for_device): Remove old interface, adjust to load
	new async interface.
	(gomp_target_fini): Adjust code to call gomp_fini_device.

	* plugin/plugin-nvptx.c (struct cuda_map): Remove.
	(struct ptx_stream): Remove.
	(struct nvptx_thread): Remove current_stream field.
	(cuda_map_create): Remove.
	(cuda_map_destroy): Remove.
	(map_init): Remove.
	(map_fini): Remove.
	(map_pop): Remove.
	(map_push): Remove.
	(struct goacc_asyncqueue): Define.
	(struct nvptx_callback): Define.
	(struct ptx_free_block): Define.
	(struct ptx_device): Remove null_stream, active_streams, async_streams,
	stream_lock, and next fields.
	(enum ptx_event_type): Remove.
	(struct ptx_event): Remove.
	(ptx_event_lock): Remove.
	(ptx_events): Remove.
	(init_streams_for_device): Remove.
	(fini_streams_for_device): Remove.
	(select_stream_for_async): Remove.
	(nvptx_init): Remove ptx_events and ptx_event_lock references.
	(nvptx_attach_host_thread_to_device): Remove CUDA_ERROR_NOT_PERMITTED
	case.
	(nvptx_open_device): Add free_blocks initialization, remove
	init_streams_for_device call.
	(nvptx_close_device): Remove fini_streams_for_device call, add
	free_blocks destruct code.
	(event_gc): Remove.
	(event_add): Remove.
	(nvptx_exec): Adjust parameters and code.
	(nvptx_free): Likewise.
	(nvptx_host2dev): Remove.
	(nvptx_dev2host): Remove.
	(nvptx_set_async): Remove.
	(nvptx_async_test): Remove.
	(nvptx_async_test_all): Remove.
	(nvptx_wait): Remove.
	(nvptx_wait_async): Remove.
	(nvptx_wait_all): Remove.
	(nvptx_wait_all_async): Remove.
	(nvptx_get_cuda_stream): Remove.
	(nvptx_set_cuda_stream): Remove.
	(GOMP_OFFLOAD_alloc): Adjust code.
	(GOMP_OFFLOAD_free): Likewise.
	(GOMP_OFFLOAD_openacc_register_async_cleanup): Remove.
	(GOMP_OFFLOAD_openacc_exec): Adjust parameters and code.
	(GOMP_OFFLOAD_openacc_async_test_all): Remove.
	(GOMP_OFFLOAD_openacc_async_wait): Remove.
	(GOMP_OFFLOAD_openacc_async_wait_async): Remove.
	(GOMP_OFFLOAD_openacc_async_wait_all): Remove.
	(GOMP_OFFLOAD_openacc_async_wait_all_async): Remove.
	(GOMP_OFFLOAD_openacc_async_set_async): Remove.
	(cuda_free_argmem): New function.
	(GOMP_OFFLOAD_openacc_async_exec): New plugin hook function.
	(GOMP_OFFLOAD_openacc_create_thread_data): Adjust code.
	(GOMP_OFFLOAD_openacc_cuda_get_stream): Adjust code.
	(GOMP_OFFLOAD_openacc_cuda_set_stream): Adjust code.
	(GOMP_OFFLOAD_openacc_async_construct): New plugin hook function.
	(GOMP_OFFLOAD_openacc_async_destruct): New plugin hook function.
	(GOMP_OFFLOAD_openacc_async_test): Remove and re-implement.
	(GOMP_OFFLOAD_openacc_async_synchronize): New plugin hook function.
	(GOMP_OFFLOAD_openacc_async_serialize): New plugin hook function.
	(GOMP_OFFLOAD_openacc_async_queue_callback): New plugin hook function.
	(cuda_callback_wrapper): New function.
	(cuda_memcpy_sanity_check): New function.
	(GOMP_OFFLOAD_host2dev): Remove and re-implement.
	(GOMP_OFFLOAD_dev2host): Remove and re-implement.
	(GOMP_OFFLOAD_openacc_async_host2dev): New plugin hook function.
	(GOMP_OFFLOAD_openacc_async_dev2host): New plugin hook function.

From-SVN: r271128
diff --git a/libgomp/oacc-parallel.c b/libgomp/oacc-parallel.c
index f5fb63c..fa99a2a 100644
--- a/libgomp/oacc-parallel.c
+++ b/libgomp/oacc-parallel.c
@@ -217,8 +217,6 @@
     }
   va_end (ap);
   
-  acc_dev->openacc.async_set_async_func (async);
-
   if (!(acc_dev->capabilities & GOMP_OFFLOAD_CAP_NATIVE_EXEC))
     {
       k.host_start = (uintptr_t) fn;
@@ -235,44 +233,29 @@
   else
     tgt_fn = (void (*)) fn;
 
-  tgt = gomp_map_vars (acc_dev, mapnum, hostaddrs, NULL, sizes, kinds, true,
-		       GOMP_MAP_VARS_OPENACC);
+  goacc_aq aq = get_goacc_asyncqueue (async);
 
+  tgt = gomp_map_vars_async (acc_dev, aq, mapnum, hostaddrs, NULL, sizes, kinds,
+			     true, GOMP_MAP_VARS_OPENACC);
+  
   devaddrs = gomp_alloca (sizeof (void *) * mapnum);
   for (i = 0; i < mapnum; i++)
     devaddrs[i] = (void *) (tgt->list[i].key->tgt->tgt_start
 			    + tgt->list[i].key->tgt_offset
 			    + tgt->list[i].offset);
-
-  acc_dev->openacc.exec_func (tgt_fn, mapnum, hostaddrs, devaddrs,
-			      async, dims, tgt);
-
-  /* If running synchronously, unmap immediately.  */
-  bool copyfrom = true;
-  if (async_synchronous_p (async))
-    gomp_unmap_vars (tgt, true);
+  if (aq == NULL)
+    {
+      acc_dev->openacc.exec_func (tgt_fn, mapnum, hostaddrs, devaddrs,
+				  dims, tgt);
+      /* If running synchronously, unmap immediately.  */
+      gomp_unmap_vars (tgt, true);
+    }
   else
     {
-      bool async_unmap = false;
-      for (size_t i = 0; i < tgt->list_count; i++)
-	{
-	  splay_tree_key k = tgt->list[i].key;
-	  if (k && k->refcount == 1)
-	    {
-	      async_unmap = true;
-	      break;
-	    }
-	}
-      if (async_unmap)
-	tgt->device_descr->openacc.register_async_cleanup_func (tgt, async);
-      else
-	{
-	  copyfrom = false;
-	  gomp_unmap_vars (tgt, copyfrom);
-	}
+      acc_dev->openacc.async.exec_func (tgt_fn, mapnum, hostaddrs, devaddrs,
+					dims, tgt, aq);
+      gomp_unmap_vars_async (tgt, true, aq);
     }
-
-  acc_dev->openacc.async_set_async_func (acc_async_sync);
 }
 
 /* Legacy entry point (GCC 5).  Only provide host fallback execution.  */
@@ -383,8 +366,6 @@
 	finalize = true;
     }
 
-  acc_dev->openacc.async_set_async_func (async);
-
   /* Determine if this is an "acc enter data".  */
   for (i = 0; i < mapnum; ++i)
     {
@@ -437,11 +418,11 @@
 		{
 		case GOMP_MAP_ALLOC:
 		case GOMP_MAP_FORCE_ALLOC:
-		  acc_create (hostaddrs[i], sizes[i]);
+		  acc_create_async (hostaddrs[i], sizes[i], async);
 		  break;
 		case GOMP_MAP_TO:
 		case GOMP_MAP_FORCE_TO:
-		  acc_copyin (hostaddrs[i], sizes[i]);
+		  acc_copyin_async (hostaddrs[i], sizes[i], async);
 		  break;
 		default:
 		  gomp_fatal (">>>> GOACC_enter_exit_data UNHANDLED kind 0x%.2x",
@@ -452,7 +433,7 @@
 	  else
 	    {
 	      gomp_acc_insert_pointer (pointer, &hostaddrs[i],
-				       &sizes[i], &kinds[i]);
+				       &sizes[i], &kinds[i], async);
 	      /* Increment 'i' by two because OpenACC requires fortran
 		 arrays to be contiguous, so each PSET is associated with
 		 one of MAP_FORCE_ALLOC/MAP_FORCE_PRESET/MAP_FORCE_TO, and
@@ -477,17 +458,17 @@
 		if (acc_is_present (hostaddrs[i], sizes[i]))
 		  {
 		    if (finalize)
-		      acc_delete_finalize (hostaddrs[i], sizes[i]);
+		      acc_delete_finalize_async (hostaddrs[i], sizes[i], async);
 		    else
-		      acc_delete (hostaddrs[i], sizes[i]);
+		      acc_delete_async (hostaddrs[i], sizes[i], async);
 		  }
 		break;
 	      case GOMP_MAP_FROM:
 	      case GOMP_MAP_FORCE_FROM:
 		if (finalize)
-		  acc_copyout_finalize (hostaddrs[i], sizes[i]);
+		  acc_copyout_finalize_async (hostaddrs[i], sizes[i], async);
 		else
-		  acc_copyout (hostaddrs[i], sizes[i]);
+		  acc_copyout_async (hostaddrs[i], sizes[i], async);
 		break;
 	      default:
 		gomp_fatal (">>>> GOACC_enter_exit_data UNHANDLED kind 0x%.2x",
@@ -505,8 +486,6 @@
 	    i += pointer - 1;
 	  }
       }
-
-  acc_dev->openacc.async_set_async_func (acc_async_sync);
 }
 
 static void
@@ -532,9 +511,10 @@
       if (async == acc_async_sync)
 	acc_wait (qid);
       else if (qid == async)
-	;/* If we're waiting on the same asynchronous queue as we're
-	    launching on, the queue itself will order work as
-	    required, so there's no need to wait explicitly.  */
+	/* If we're waiting on the same asynchronous queue as we're
+	   launching on, the queue itself will order work as
+	   required, so there's no need to wait explicitly.  */
+	;
       else
 	acc_wait_async (qid, async);
     }
@@ -567,8 +547,6 @@
       va_end (ap);
     }
 
-  acc_dev->openacc.async_set_async_func (async);
-
   bool update_device = false;
   for (i = 0; i < mapnum; ++i)
     {
@@ -591,6 +569,8 @@
 		 the value of the allocated device memory in the
 		 previous pointer.  */
 	      *(uintptr_t *) hostaddrs[i] = (uintptr_t)dptr;
+	      /* TODO: verify that we really cannot use acc_update_device_async
+		 here.  */
 	      acc_update_device (hostaddrs[i], sizeof (uintptr_t));
 
 	      /* Restore the host pointer.  */
@@ -608,7 +588,7 @@
 	  /* Fallthru  */
 	case GOMP_MAP_FORCE_TO:
 	  update_device = true;
-	  acc_update_device (hostaddrs[i], sizes[i]);
+	  acc_update_device_async (hostaddrs[i], sizes[i], async);
 	  break;
 
 	case GOMP_MAP_FROM:
@@ -620,7 +600,7 @@
 	  /* Fallthru  */
 	case GOMP_MAP_FORCE_FROM:
 	  update_device = false;
-	  acc_update_self (hostaddrs[i], sizes[i]);
+	  acc_update_self_async (hostaddrs[i], sizes[i], async);
 	  break;
 
 	default:
@@ -628,8 +608,6 @@
 	  break;
 	}
     }
-
-  acc_dev->openacc.async_set_async_func (acc_async_sync);
 }
 
 void