[nvptx] Basic -misa support for nvptx

Message ID cb3a58c5-2336-6290-cbdb-826fc5137fac@codesourcery.com
State New
Headers show
Series
  • [nvptx] Basic -misa support for nvptx
Related show

Commit Message

Cesar Philippidis Aug. 31, 2018, 2:14 p.m.
Attached is an nvptx patch that adds support for a new, albeit rarely
used, compiler option -misa. At present, there are only two valid ISA
arguments, SM_30 and SM_35. Without that flag, GCC will default to
SM_30. The major advantage of using the SM_35 ISA is to enable the use
PTX atom instructions for __atomic_fetch_{add,and,or,xor} for DI
integers. Without -misa, GCC would use an atomic CAS loop for them. As
an aside, this patch also enables PTX atom instructions for those
aforementioned functions for SI integers.

Is this patch OK for trunk?

Thanks,
Cesar

Comments

Tom de Vries Sept. 1, 2018, 7:04 p.m. | #1
On 08/31/2018 04:14 PM, Cesar Philippidis wrote:
> Attached is an nvptx patch that adds support for a new, albeit rarely

> used, compiler option -misa. At present, there are only two valid ISA

> arguments, SM_30 and SM_35. Without that flag, GCC will default to

> SM_30. The major advantage of using the SM_35 ISA is to enable the use

> PTX atom instructions for __atomic_fetch_{add,and,or,xor} for DI

> integers. Without -misa, GCC would use an atomic CAS loop for them.



> As

> an aside, this patch also enables PTX atom instructions for those

> aforementioned functions for SI integers.


Make the SI part a separate patch, and make sure that one has a
testcase. Pre-approved.

> 

> Is this patch OK for trunk?

> 


Well, how did you test this (
https://gcc.gnu.org/contribute.html#patches : "Bootstrapping and
testing. State the host and target combinations you used to do proper
testing as described above, and the results of your testing.") ?

> Thanks,

> Cesar

> 

> 

> 0001-Basic-misa-support-for-nvptx.patch

> 

> 

> Basic -misa support for nvptx

> 

> 2018-XX-YY  Cesar Philippidis  <cesar@codesourcery.com>

> 	    Bernd Schmidt  <bernds_cb1@t-online.de>

> 

> 	gcc/

> 	* config/nvptx/nvptx-opts.h: New file.

> 	* config/nvptx/nvptx.c (nvptx_file_start): Print the correct .target.

> 	* config/nvptx/nvptx.h: Include "nvptx-opts.h".

> 	(ASM_SPEC): Define.

> 	(TARGET_SM35): New macro.

> 	* config/nvptx/nvptx.md (atomic_fetch_<logic><mode>): Enable with the

> 	correct predicate.

> 	* config/nvptx/nvptx.opt (ptx_isa, sm_30, sm_35): New enum and its

> 	values.

> 	(misa=): New option.

> 	* doc/invoke.texi (Nvidia PTX Options): Document -misa.

> 

> 	gcc/testsuite/

> 	* gcc.target/nvptx/atomic_fetch-1.c: New test.

> 	* gcc.target/nvptx/atomic_fetch-1.c: New test.

> 

> 

> diff --git a/gcc/config/nvptx/nvptx-opts.h b/gcc/config/nvptx/nvptx-opts.h

> new file mode 100644

> index 00000000000..55d9599917e

> --- /dev/null

> +++ b/gcc/config/nvptx/nvptx-opts.h

> @@ -0,0 +1,30 @@

> +/* Definitions for the NVPTX port needed for option handling.

> +   Copyright (C) 2015-2018 Free Software Foundation, Inc.

> +

> +   This file is part of GCC.

> +

> +   GCC is free software; you can redistribute it and/or modify it

> +   under the terms of the GNU General Public License as published

> +   by the Free Software Foundation; either version 3, or (at your

> +   option) any later version.

> +

> +   GCC is distributed in the hope that it will be useful, but WITHOUT

> +   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY

> +   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public

> +   License for more details.

> +

> +   You should have received a copy of the GNU General Public License

> +   along with GCC; see the file COPYING3.  If not see

> +   <http://www.gnu.org/licenses/>.  */

> +

> +#ifndef NVPTX_OPTS_H

> +#define NVPTX_OPTS_H

> +

> +enum ptx_isa

> +{

> +  PTX_ISA_SM30,

> +  PTX_ISA_SM35

> +};

> +

> +#endif

> +

> diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c

> index c0b0a2ec3ab..9903a273863 100644

> --- a/gcc/config/nvptx/nvptx.c

> +++ b/gcc/config/nvptx/nvptx.c

> @@ -4931,7 +4931,10 @@ nvptx_file_start (void)

>  {

>    fputs ("// BEGIN PREAMBLE\n", asm_out_file);

>    fputs ("\t.version\t3.1\n", asm_out_file);

> -  fputs ("\t.target\tsm_30\n", asm_out_file);

> +  if (TARGET_SM35)

> +    fputs ("\t.target\tsm_35\n", asm_out_file);

> +  else

> +    fputs ("\t.target\tsm_30\n", asm_out_file);

>    fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode));

>    fputs ("// END PREAMBLE\n", asm_out_file);

>  }

> diff --git a/gcc/config/nvptx/nvptx.h b/gcc/config/nvptx/nvptx.h

> index dfa1e9aa859..a2fe8b68b22 100644

> --- a/gcc/config/nvptx/nvptx.h

> +++ b/gcc/config/nvptx/nvptx.h

> @@ -21,10 +21,16 @@

>  #ifndef GCC_NVPTX_H

>  #define GCC_NVPTX_H

>  

> +#ifndef NVPTX_OPTS_H

> +#include "config/nvptx/nvptx-opts.h"

> +#endif

> +

>  /* Run-time Target.  */

>  

>  #define STARTFILE_SPEC "%{mmainkernel:crt0.o}"

>  

> +#define ASM_SPEC "%{misa=*:-m %*}"

> +

>  #define TARGET_CPU_CPP_BUILTINS()		\

>    do						\

>      {						\

> @@ -87,6 +93,8 @@

>  #define Pmode (TARGET_ABI64 ? DImode : SImode)

>  #define STACK_SIZE_MODE Pmode

>  

> +#define TARGET_SM35 (ptx_isa_option >= PTX_ISA_SM35)

> +

>  /* Registers.  Since ptx is a virtual target, we just define a few

>     hard registers for special purposes and leave pseudos unallocated.

>     We have to have some available hard registers, to keep gcc setup

> diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md

> index 2988f5dfa91..ca00b1d8073 100644

> --- a/gcc/config/nvptx/nvptx.md

> +++ b/gcc/config/nvptx/nvptx.md

> @@ -1440,7 +1440,6 @@

>  (define_code_iterator any_logic [and ior xor])

>  (define_code_attr logic [(and "and") (ior "or") (xor "xor")])

>  

> -;; Currently disabled until we add better subtarget support - requires sm_32.

>  (define_insn "atomic_fetch_<logic><mode>"

>    [(set (match_operand:SDIM 1 "memory_operand" "+m")

>  	(unspec_volatile:SDIM

> @@ -1450,7 +1449,7 @@

>  	  UNSPECV_LOCK))

>     (set (match_operand:SDIM 0 "nvptx_register_operand" "=R")

>  	(match_dup 1))]

> -  "0"

> +  "<MODE>mode == SImode || TARGET_SM35"

>    "%.\\tatom%A1.b%T0.<logic>\\t%0, %1, %2;"

>    [(set_attr "atomic" "true")])

>  

> diff --git a/gcc/config/nvptx/nvptx.opt b/gcc/config/nvptx/nvptx.opt

> index 04277d1d98e..8194c0324d6 100644

> --- a/gcc/config/nvptx/nvptx.opt

> +++ b/gcc/config/nvptx/nvptx.opt

> @@ -48,3 +48,17 @@ Generate code that can keep local state uniform across all lanes.

>  mgomp

>  Target Report Mask(GOMP)

>  Generate code for OpenMP offloading: enables -msoft-stack and -muniform-simt.

> +

> +Enum

> +Name(ptx_isa) Type(int)

> +Known PTX ISA versions (for use with the -misa= option):

> +

> +EnumValue

> +Enum(ptx_isa) String(sm_30) Value(PTX_ISA_SM30)

> +

> +EnumValue

> +Enum(ptx_isa) String(sm_35) Value(PTX_ISA_SM35)

> +

> +misa=

> +Target RejectNegative ToLower Joined Enum(ptx_isa) Var(ptx_isa_option) Init(PTX_ISA_SM30)

> +Specify the version of the ptx ISA to use.

> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi

> index e37233d6ed4..95bc7d9e509 100644

> --- a/gcc/doc/invoke.texi

> +++ b/gcc/doc/invoke.texi

> @@ -22456,6 +22456,12 @@ These options are defined for Nvidia PTX:

>  @opindex m64

>  Generate code for 32-bit or 64-bit ABI.

>  

> +@item -misa=@var{ISA-string}

> +@opindex march

> +Generate code for given the specified PTX ISA (e.g.@ @samp{sm_35}).  ISA

> +strings must be lower-case.  Valid ISA strings include @samp{sm_30} and

> +@samp{sm_35}.  The default ISA is sm_30.

> +

>  @item -mmainkernel

>  @opindex mmainkernel

>  Link in code for a __main kernel.  This is for stand-alone instead of

> diff --git a/gcc/testsuite/gcc.target/nvptx/atomic-fetch-2.c b/gcc/testsuite/gcc.target/nvptx/atomic-fetch-2.c

> new file mode 100644

> index 00000000000..1d35a176a62

> --- /dev/null

> +++ b/gcc/testsuite/gcc.target/nvptx/atomic-fetch-2.c

> @@ -0,0 +1,24 @@

> +/* Test the nvptx atomic instructions for __atomic_fetch_OP for SM_30

> +   targets.  */

> +

> +/* { dg-do compile } */

> +/* { dg-options "-O2 -misa=sm_30" } */

> +

> +int

> +main()

> +{

> +  unsigned long long a = ~0;

> +  unsigned b = 0xa;

> +

> +  __atomic_fetch_add (&a, b, 0);

> +  __atomic_fetch_and (&a, b, 0);

> +  __atomic_fetch_or (&a, b, 0);

> +  __atomic_fetch_xor (&a, b, 0);

> +  

> +  return a;

> +}

> +

> +/* { dg-final { scan-assembler-not "atom.b64.add" } } */

> +/* { dg-final { scan-assembler-not "atom.b64.and" } } */

> +/* { dg-final { scan-assembler-not "atom.b64.or" } } */

> +/* { dg-final { scan-assembler-not "atom.b64.xor" } } */

> diff --git a/gcc/testsuite/gcc.target/nvptx/atomic_fetch-1.c b/gcc/testsuite/gcc.target/nvptx/atomic_fetch-1.c

> new file mode 100644

> index 00000000000..c637caa79a0

> --- /dev/null

> +++ b/gcc/testsuite/gcc.target/nvptx/atomic_fetch-1.c

> @@ -0,0 +1,24 @@

> +/* Test the nvptx atomic instructions for __atomic_fetch_OP for SM_35

> +   targets.  */

> +

> +/* { dg-do compile } */

> +/* { dg-options "-O2 -misa=sm_35" } */

> +

> +int

> +main()

> +{

> +  unsigned long long a = ~0;

> +  unsigned b = 0xa;

> +

> +  __atomic_fetch_add (&a, b, 0);

> +  __atomic_fetch_and (&a, b, 0);

> +  __atomic_fetch_or (&a, b, 0);

> +  __atomic_fetch_xor (&a, b, 0);

> +  

> +  return a;

> +}

> +

> +/* { dg-final { scan-assembler "atom.add.u64" } } */

> +/* { dg-final { scan-assembler "atom.b64.and" } } */

> +/* { dg-final { scan-assembler "atom.b64.or" } } */

> +/* { dg-final { scan-assembler "atom.b64.xor" } } */

> -- 2.17.1

> 


Hmm, the add.u64 vs b64.and looks odd (and the scan-assembler-not
testcase does not use this difference, so that needs to be fixed, or for
bonus points, changed into a scan-assembler testcase).

The documentation uses "op.type", we should fix the compiler to emit
that consistently. Separate patch that fixes that pre-approved.

This is ok (with, as I mentioned above, the SI part split off into a
separate patch), on the condition that you test libgomp with
-foffload=-misa=sm_35.

Thanks,
- Tom
Cesar Philippidis Sept. 2, 2018, 2:57 p.m. | #2
On 09/01/2018 12:04 PM, Tom de Vries wrote:
> On 08/31/2018 04:14 PM, Cesar Philippidis wrote:


>> Is this patch OK for trunk?

>>

> 

> Well, how did you test this (

> https://gcc.gnu.org/contribute.html#patches : "Bootstrapping and

> testing. State the host and target combinations you used to do proper

> testing as described above, and the results of your testing.") ?


I tested the standalone nvptx compiler. I'll retest with libgomp with
-misa=sm_35. Bootstrapping won't help much here, unfortunately.
>> +++ b/gcc/testsuite/gcc.target/nvptx/atomic_fetch-1.c

>> @@ -0,0 +1,24 @@

>> +/* Test the nvptx atomic instructions for __atomic_fetch_OP for SM_35

>> +   targets.  */

>> +

>> +/* { dg-do compile } */

>> +/* { dg-options "-O2 -misa=sm_35" } */

>> +

>> +int

>> +main()

>> +{

>> +  unsigned long long a = ~0;

>> +  unsigned b = 0xa;

>> +

>> +  __atomic_fetch_add (&a, b, 0);

>> +  __atomic_fetch_and (&a, b, 0);

>> +  __atomic_fetch_or (&a, b, 0);

>> +  __atomic_fetch_xor (&a, b, 0);

>> +  

>> +  return a;

>> +}

>> +

>> +/* { dg-final { scan-assembler "atom.add.u64" } } */

>> +/* { dg-final { scan-assembler "atom.b64.and" } } */

>> +/* { dg-final { scan-assembler "atom.b64.or" } } */

>> +/* { dg-final { scan-assembler "atom.b64.xor" } } */

>> -- 2.17.1

>>

> 

> Hmm, the add.u64 vs b64.and looks odd (and the scan-assembler-not

> testcase does not use this difference, so that needs to be fixed, or for

> bonus points, changed into a scan-assembler testcase).

> 

> The documentation uses "op.type", we should fix the compiler to emit

> that consistently. Separate patch that fixes that pre-approved.


ACK. I think there are a lot of other cases like that in the BE.

> This is ok (with, as I mentioned above, the SI part split off into a

> separate patch), on the condition that you test libgomp with

> -foffload=-misa=sm_35.


Thanks,
Cesar
Cesar Philippidis Sept. 4, 2018, 10:19 p.m. | #3
On 09/02/2018 07:57 AM, Cesar Philippidis wrote:
> On 09/01/2018 12:04 PM, Tom de Vries wrote:

>> On 08/31/2018 04:14 PM, Cesar Philippidis wrote:

> 

>>> Is this patch OK for trunk?

>>>

>>

>> Well, how did you test this (

>> https://gcc.gnu.org/contribute.html#patches : "Bootstrapping and

>> testing. State the host and target combinations you used to do proper

>> testing as described above, and the results of your testing.") ?

> 

> I tested the standalone nvptx compiler. I'll retest with libgomp with

> -misa=sm_35. Bootstrapping won't help much here, unfortunately.

>>> +++ b/gcc/testsuite/gcc.target/nvptx/atomic_fetch-1.c

>>> @@ -0,0 +1,24 @@

>>> +/* Test the nvptx atomic instructions for __atomic_fetch_OP for SM_35

>>> +   targets.  */

>>> +

>>> +/* { dg-do compile } */

>>> +/* { dg-options "-O2 -misa=sm_35" } */

>>> +

>>> +int

>>> +main()

>>> +{

>>> +  unsigned long long a = ~0;

>>> +  unsigned b = 0xa;

>>> +

>>> +  __atomic_fetch_add (&a, b, 0);

>>> +  __atomic_fetch_and (&a, b, 0);

>>> +  __atomic_fetch_or (&a, b, 0);

>>> +  __atomic_fetch_xor (&a, b, 0);

>>> +  

>>> +  return a;

>>> +}

>>> +

>>> +/* { dg-final { scan-assembler "atom.add.u64" } } */

>>> +/* { dg-final { scan-assembler "atom.b64.and" } } */

>>> +/* { dg-final { scan-assembler "atom.b64.or" } } */

>>> +/* { dg-final { scan-assembler "atom.b64.xor" } } */

>>> -- 2.17.1

>>>

>>

>> Hmm, the add.u64 vs b64.and looks odd (and the scan-assembler-not

>> testcase does not use this difference, so that needs to be fixed, or for

>> bonus points, changed into a scan-assembler testcase).

>>

>> The documentation uses "op.type", we should fix the compiler to emit

>> that consistently. Separate patch that fixes that pre-approved.

> 

> ACK. I think there are a lot of other cases like that in the BE.

> 

>> This is ok (with, as I mentioned above, the SI part split off into a

>> separate patch), on the condition that you test libgomp with

>> -foffload=-misa=sm_35.


Adding -foffload=misa=sm_35 didn't work because the host gcc doesn't
support the -misa flag. When I forced the nvptx BE to set TARGET_SM35 to
always be true, I ran into problems with SM_30 code linking against
SM_35 code. Therefore, I don't think this patch is ready for trunk yet.

By the way, is -misa really necessary for atomic_fetch_<logic><mode>?
Looking at the PTX documentation I see
<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#changes-in-ptx-isa-version-3-1>:

PTX ISA version 3.1 introduces the following new features:

* Support for sm_35 target architecture.
* Extends atomic and reduction instructions to perform 64-bit {and, or,
xor} operations, and 64-bit integer {min, max} operations.

Is there a table for which list which GPUs are compatible with which
instructions?

Thanks,
Cesar
Tom de Vries Sept. 5, 2018, 2:30 p.m. | #4
On 09/05/2018 12:19 AM, Cesar Philippidis wrote:
> On 09/02/2018 07:57 AM, Cesar Philippidis wrote:

>> On 09/01/2018 12:04 PM, Tom de Vries wrote:

>>> On 08/31/2018 04:14 PM, Cesar Philippidis wrote:

>>

>>>> Is this patch OK for trunk?

>>>>

>>>

>>> Well, how did you test this (

>>> https://gcc.gnu.org/contribute.html#patches : "Bootstrapping and

>>> testing. State the host and target combinations you used to do proper

>>> testing as described above, and the results of your testing.") ?

>>

>> I tested the standalone nvptx compiler. I'll retest with libgomp with

>> -misa=sm_35. Bootstrapping won't help much here, unfortunately.

>>>> +++ b/gcc/testsuite/gcc.target/nvptx/atomic_fetch-1.c

>>>> @@ -0,0 +1,24 @@

>>>> +/* Test the nvptx atomic instructions for __atomic_fetch_OP for SM_35

>>>> +   targets.  */

>>>> +

>>>> +/* { dg-do compile } */

>>>> +/* { dg-options "-O2 -misa=sm_35" } */

>>>> +

>>>> +int

>>>> +main()

>>>> +{

>>>> +  unsigned long long a = ~0;

>>>> +  unsigned b = 0xa;

>>>> +

>>>> +  __atomic_fetch_add (&a, b, 0);

>>>> +  __atomic_fetch_and (&a, b, 0);

>>>> +  __atomic_fetch_or (&a, b, 0);

>>>> +  __atomic_fetch_xor (&a, b, 0);

>>>> +  

>>>> +  return a;

>>>> +}

>>>> +

>>>> +/* { dg-final { scan-assembler "atom.add.u64" } } */

>>>> +/* { dg-final { scan-assembler "atom.b64.and" } } */

>>>> +/* { dg-final { scan-assembler "atom.b64.or" } } */

>>>> +/* { dg-final { scan-assembler "atom.b64.xor" } } */

>>>> -- 2.17.1

>>>>

>>>

>>> Hmm, the add.u64 vs b64.and looks odd (and the scan-assembler-not

>>> testcase does not use this difference, so that needs to be fixed, or for

>>> bonus points, changed into a scan-assembler testcase).

>>>

>>> The documentation uses "op.type", we should fix the compiler to emit

>>> that consistently. Separate patch that fixes that pre-approved.

>>

>> ACK. I think there are a lot of other cases like that in the BE.

>>

>>> This is ok (with, as I mentioned above, the SI part split off into a

>>> separate patch), on the condition that you test libgomp with

>>> -foffload=-misa=sm_35.

> 

> Adding -foffload=misa=sm_35 didn't work because the host gcc doesn't

> support the -misa flag.


That doesn't make sense to me. For me this works without any problems.
Have you tried a clean build?

> When I forced the nvptx BE to set TARGET_SM35 to

> always be true, I ran into problems with SM_30 code linking against

> SM_35 code.


I also cannot reproduce this, works for me.

> Therefore, I don't think this patch is ready for trunk yet.

>> By the way, is -misa really necessary for atomic_fetch_<logic><mode>?

> Looking at the PTX documentation I see

> <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#changes-in-ptx-isa-version-3-1>:

> 

> PTX ISA version 3.1 introduces the following new features:

> 

> * Support for sm_35 target architecture.

> * Extends atomic and reduction instructions to perform 64-bit {and, or,

> xor} operations, and 64-bit integer {min, max} operations.

> 

> Is there a table for which list which GPUs are compatible with which

> instructions?


Yes, every instruction has a table in the ptx manual, and there's a "PTX
ISA Notes" entry.

For the atom instruction in ptx isa 3.1 manual, we have "PTX ISA Notes":
...
atom.global requires sm_11 or higher.
atom.shared requires sm_12 or higher.
64-bit atom.global.{add,cas,exch} require sm_12 or higher.
64-bit atom.shared.{add,cas,exch} require sm_20 or higher.
64-bit atom.{and,or.xor,min,max} require sm_35 or higher.
atom.add.f32 requires sm_20 or higher.
Use of generic addressing requires sm_20 or higher.
...

Thanks,
- Tom
Cesar Philippidis Sept. 5, 2018, 10:24 p.m. | #5
On 09/05/2018 07:30 AM, Tom de Vries wrote:
> On 09/05/2018 12:19 AM, Cesar Philippidis wrote:

>> On 09/02/2018 07:57 AM, Cesar Philippidis wrote:

>>> On 09/01/2018 12:04 PM, Tom de Vries wrote:

>>>> On 08/31/2018 04:14 PM, Cesar Philippidis wrote:

>>>

>>>>> Is this patch OK for trunk?

>>>>>

>>>>

>>>> Well, how did you test this (

>>>> https://gcc.gnu.org/contribute.html#patches : "Bootstrapping and

>>>> testing. State the host and target combinations you used to do proper

>>>> testing as described above, and the results of your testing.") ?

>>>

>>> I tested the standalone nvptx compiler. I'll retest with libgomp with

>>> -misa=sm_35. Bootstrapping won't help much here, unfortunately.

>>>>> +++ b/gcc/testsuite/gcc.target/nvptx/atomic_fetch-1.c

>>>>> @@ -0,0 +1,24 @@

>>>>> +/* Test the nvptx atomic instructions for __atomic_fetch_OP for SM_35

>>>>> +   targets.  */

>>>>> +

>>>>> +/* { dg-do compile } */

>>>>> +/* { dg-options "-O2 -misa=sm_35" } */

>>>>> +

>>>>> +int

>>>>> +main()

>>>>> +{

>>>>> +  unsigned long long a = ~0;

>>>>> +  unsigned b = 0xa;

>>>>> +

>>>>> +  __atomic_fetch_add (&a, b, 0);

>>>>> +  __atomic_fetch_and (&a, b, 0);

>>>>> +  __atomic_fetch_or (&a, b, 0);

>>>>> +  __atomic_fetch_xor (&a, b, 0);

>>>>> +  

>>>>> +  return a;

>>>>> +}

>>>>> +

>>>>> +/* { dg-final { scan-assembler "atom.add.u64" } } */

>>>>> +/* { dg-final { scan-assembler "atom.b64.and" } } */

>>>>> +/* { dg-final { scan-assembler "atom.b64.or" } } */

>>>>> +/* { dg-final { scan-assembler "atom.b64.xor" } } */

>>>>> -- 2.17.1

>>>>>

>>>>

>>>> Hmm, the add.u64 vs b64.and looks odd (and the scan-assembler-not

>>>> testcase does not use this difference, so that needs to be fixed, or for

>>>> bonus points, changed into a scan-assembler testcase).

>>>>

>>>> The documentation uses "op.type", we should fix the compiler to emit

>>>> that consistently. Separate patch that fixes that pre-approved.

>>>

>>> ACK. I think there are a lot of other cases like that in the BE.

>>>

>>>> This is ok (with, as I mentioned above, the SI part split off into a

>>>> separate patch), on the condition that you test libgomp with

>>>> -foffload=-misa=sm_35.

>>

>> Adding -foffload=misa=sm_35 didn't work because the host gcc doesn't

>> support the -misa flag.

> 

> That doesn't make sense to me. For me this works without any problems.

> Have you tried a clean build?


I was incorrectly setting ALWAYS_CFLAGS to use -foffload=-misa=sm_35.
That didn't work on the host. But lappend'ing tagopt did work.

>> When I forced the nvptx BE to set TARGET_SM35 to

>> always be true, I ran into problems with SM_30 code linking against

>> SM_35 code.

> 

> I also cannot reproduce this, works for me.


I found the problem. I wasn't using a clean build. Besides, with the
tagopt change in libgomp, I didn't need to force the -misa=sm_35 flag
everywhere.

>> Therefore, I don't think this patch is ready for trunk yet.

>>> By the way, is -misa really necessary for atomic_fetch_<logic><mode>?

>> Looking at the PTX documentation I see

>> <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#changes-in-ptx-isa-version-3-1>:

>>

>> PTX ISA version 3.1 introduces the following new features:

>>

>> * Support for sm_35 target architecture.

>> * Extends atomic and reduction instructions to perform 64-bit {and, or,

>> xor} operations, and 64-bit integer {min, max} operations.

>>

>> Is there a table for which list which GPUs are compatible with which

>> instructions?

> 

> Yes, every instruction has a table in the ptx manual, and there's a "PTX

> ISA Notes" entry.

> 

> For the atom instruction in ptx isa 3.1 manual, we have "PTX ISA Notes":

> ...

> atom.global requires sm_11 or higher.

> atom.shared requires sm_12 or higher.

> 64-bit atom.global.{add,cas,exch} require sm_12 or higher.

> 64-bit atom.shared.{add,cas,exch} require sm_20 or higher.

> 64-bit atom.{and,or.xor,min,max} require sm_35 or higher.

> atom.add.f32 requires sm_20 or higher.

> Use of generic addressing requires sm_20 or higher.

> ...


Thanks!

I'll commit the attached patch shortly. x86_64 with nvptx offloading
regression testing didn't yield any new failures, nor did the standalone
nvptx testing. I'll follow up with an SImode patch later.

Cesar
[nvptx] Basic -misa support for nvptx

2018-XX-YY  Cesar Philippidis  <cesar@codesourcery.com>
	    Bernd Schmidt  <bernds_cb1@t-online.de>

	gcc/
	* config/nvptx/nvptx-opts.h: New file.
	* config/nvptx/nvptx.c (nvptx_file_start): Print the correct .target.
	* config/nvptx/nvptx.h: Include "nvptx-opts.h".
	(ASM_SPEC): Define.
	(TARGET_SM35): New macro.
	* config/nvptx/nvptx.md (atomic_fetch_<logic><mode>): Enable with the
	correct predicate.
	* config/nvptx/nvptx.opt (ptx_isa, sm_30, sm_35): New enum and its
	values.
	(misa=): New option.
	* doc/invoke.texi (Nvidia PTX Options): Document -misa.

	gcc/testsuite/
	* gcc.target/nvptx/atomic_fetch-1.c: New test.
	* gcc.target/nvptx/atomic_fetch-1.c: New test.

(cherry picked from gomp-4_0-branch r223182)
---
 gcc/config/nvptx/nvptx-opts.h                 | 30 +++++++++++++++++++
 gcc/config/nvptx/nvptx.c                      |  5 +++-
 gcc/config/nvptx/nvptx.h                      |  8 +++++
 gcc/config/nvptx/nvptx.md                     |  3 +-
 gcc/config/nvptx/nvptx.opt                    | 14 +++++++++
 gcc/doc/invoke.texi                           |  6 ++++
 .../gcc.target/nvptx/atomic-fetch-2.c         | 24 +++++++++++++++
 .../gcc.target/nvptx/atomic_fetch-1.c         | 24 +++++++++++++++
 8 files changed, 111 insertions(+), 3 deletions(-)
 create mode 100644 gcc/config/nvptx/nvptx-opts.h
 create mode 100644 gcc/testsuite/gcc.target/nvptx/atomic-fetch-2.c
 create mode 100644 gcc/testsuite/gcc.target/nvptx/atomic_fetch-1.c

diff --git a/gcc/config/nvptx/nvptx-opts.h b/gcc/config/nvptx/nvptx-opts.h
new file mode 100644
index 00000000000..55d9599917e
--- /dev/null
+++ b/gcc/config/nvptx/nvptx-opts.h
@@ -0,0 +1,30 @@
+/* Definitions for the NVPTX port needed for option handling.
+   Copyright (C) 2015-2018 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3, or (at your
+   option) any later version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+   License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef NVPTX_OPTS_H
+#define NVPTX_OPTS_H
+
+enum ptx_isa
+{
+  PTX_ISA_SM30,
+  PTX_ISA_SM35
+};
+
+#endif
+
diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index c0b0a2ec3ab..9903a273863 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -4931,7 +4931,10 @@ nvptx_file_start (void)
 {
   fputs ("// BEGIN PREAMBLE\n", asm_out_file);
   fputs ("\t.version\t3.1\n", asm_out_file);
-  fputs ("\t.target\tsm_30\n", asm_out_file);
+  if (TARGET_SM35)
+    fputs ("\t.target\tsm_35\n", asm_out_file);
+  else
+    fputs ("\t.target\tsm_30\n", asm_out_file);
   fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode));
   fputs ("// END PREAMBLE\n", asm_out_file);
 }
diff --git a/gcc/config/nvptx/nvptx.h b/gcc/config/nvptx/nvptx.h
index dfa1e9aa859..a2fe8b68b22 100644
--- a/gcc/config/nvptx/nvptx.h
+++ b/gcc/config/nvptx/nvptx.h
@@ -21,10 +21,16 @@
 #ifndef GCC_NVPTX_H
 #define GCC_NVPTX_H
 
+#ifndef NVPTX_OPTS_H
+#include "config/nvptx/nvptx-opts.h"
+#endif
+
 /* Run-time Target.  */
 
 #define STARTFILE_SPEC "%{mmainkernel:crt0.o}"
 
+#define ASM_SPEC "%{misa=*:-m %*}"
+
 #define TARGET_CPU_CPP_BUILTINS()		\
   do						\
     {						\
@@ -87,6 +93,8 @@
 #define Pmode (TARGET_ABI64 ? DImode : SImode)
 #define STACK_SIZE_MODE Pmode
 
+#define TARGET_SM35 (ptx_isa_option >= PTX_ISA_SM35)
+
 /* Registers.  Since ptx is a virtual target, we just define a few
    hard registers for special purposes and leave pseudos unallocated.
    We have to have some available hard registers, to keep gcc setup
diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md
index 2988f5dfa91..dd6032d021b 100644
--- a/gcc/config/nvptx/nvptx.md
+++ b/gcc/config/nvptx/nvptx.md
@@ -1440,7 +1440,6 @@
 (define_code_iterator any_logic [and ior xor])
 (define_code_attr logic [(and "and") (ior "or") (xor "xor")])
 
-;; Currently disabled until we add better subtarget support - requires sm_32.
 (define_insn "atomic_fetch_<logic><mode>"
   [(set (match_operand:SDIM 1 "memory_operand" "+m")
 	(unspec_volatile:SDIM
@@ -1450,7 +1449,7 @@
 	  UNSPECV_LOCK))
    (set (match_operand:SDIM 0 "nvptx_register_operand" "=R")
 	(match_dup 1))]
-  "0"
+  "TARGET_SM35"
   "%.\\tatom%A1.b%T0.<logic>\\t%0, %1, %2;"
   [(set_attr "atomic" "true")])
 
diff --git a/gcc/config/nvptx/nvptx.opt b/gcc/config/nvptx/nvptx.opt
index 04277d1d98e..8194c0324d6 100644
--- a/gcc/config/nvptx/nvptx.opt
+++ b/gcc/config/nvptx/nvptx.opt
@@ -48,3 +48,17 @@ Generate code that can keep local state uniform across all lanes.
 mgomp
 Target Report Mask(GOMP)
 Generate code for OpenMP offloading: enables -msoft-stack and -muniform-simt.
+
+Enum
+Name(ptx_isa) Type(int)
+Known PTX ISA versions (for use with the -misa= option):
+
+EnumValue
+Enum(ptx_isa) String(sm_30) Value(PTX_ISA_SM30)
+
+EnumValue
+Enum(ptx_isa) String(sm_35) Value(PTX_ISA_SM35)
+
+misa=
+Target RejectNegative ToLower Joined Enum(ptx_isa) Var(ptx_isa_option) Init(PTX_ISA_SM30)
+Specify the version of the ptx ISA to use.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index e37233d6ed4..95bc7d9e509 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -22456,6 +22456,12 @@ These options are defined for Nvidia PTX:
 @opindex m64
 Generate code for 32-bit or 64-bit ABI.
 
+@item -misa=@var{ISA-string}
+@opindex march
+Generate code for given the specified PTX ISA (e.g.@ @samp{sm_35}).  ISA
+strings must be lower-case.  Valid ISA strings include @samp{sm_30} and
+@samp{sm_35}.  The default ISA is sm_30.
+
 @item -mmainkernel
 @opindex mmainkernel
 Link in code for a __main kernel.  This is for stand-alone instead of
diff --git a/gcc/testsuite/gcc.target/nvptx/atomic-fetch-2.c b/gcc/testsuite/gcc.target/nvptx/atomic-fetch-2.c
new file mode 100644
index 00000000000..1d35a176a62
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nvptx/atomic-fetch-2.c
@@ -0,0 +1,24 @@
+/* Test the nvptx atomic instructions for __atomic_fetch_OP for SM_30
+   targets.  */
+
+/* { dg-do compile } */
+/* { dg-options "-O2 -misa=sm_30" } */
+
+int
+main()
+{
+  unsigned long long a = ~0;
+  unsigned b = 0xa;
+
+  __atomic_fetch_add (&a, b, 0);
+  __atomic_fetch_and (&a, b, 0);
+  __atomic_fetch_or (&a, b, 0);
+  __atomic_fetch_xor (&a, b, 0);
+  
+  return a;
+}
+
+/* { dg-final { scan-assembler-not "atom.b64.add" } } */
+/* { dg-final { scan-assembler-not "atom.b64.and" } } */
+/* { dg-final { scan-assembler-not "atom.b64.or" } } */
+/* { dg-final { scan-assembler-not "atom.b64.xor" } } */
diff --git a/gcc/testsuite/gcc.target/nvptx/atomic_fetch-1.c b/gcc/testsuite/gcc.target/nvptx/atomic_fetch-1.c
new file mode 100644
index 00000000000..c637caa79a0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nvptx/atomic_fetch-1.c
@@ -0,0 +1,24 @@
+/* Test the nvptx atomic instructions for __atomic_fetch_OP for SM_35
+   targets.  */
+
+/* { dg-do compile } */
+/* { dg-options "-O2 -misa=sm_35" } */
+
+int
+main()
+{
+  unsigned long long a = ~0;
+  unsigned b = 0xa;
+
+  __atomic_fetch_add (&a, b, 0);
+  __atomic_fetch_and (&a, b, 0);
+  __atomic_fetch_or (&a, b, 0);
+  __atomic_fetch_xor (&a, b, 0);
+  
+  return a;
+}
+
+/* { dg-final { scan-assembler "atom.add.u64" } } */
+/* { dg-final { scan-assembler "atom.b64.and" } } */
+/* { dg-final { scan-assembler "atom.b64.or" } } */
+/* { dg-final { scan-assembler "atom.b64.xor" } } */
-- 
2.17.1
Tom de Vries Sept. 6, 2018, 6:34 a.m. | #6
On 09/06/2018 12:24 AM, Cesar Philippidis wrote:
> I'll commit the attached patch shortly. x86_64 with nvptx offloading

> regression testing didn't yield any new failures, nor did the standalone

> nvptx testing. I'll follow up with an SImode patch later.


I'm sorry, I guess I was not clear enough here because it's obvious to
me. First apply the SImode patch (enabling the simple case), then follow
up with the DImode patch (enabling the more complex case).

Thanks,
- Tom
Tom de Vries Sept. 6, 2018, 6:50 a.m. | #7
On 09/06/2018 12:24 AM, Cesar Philippidis wrote:
>>>>> This is ok (with, as I mentioned above, the SI part split off into a

>>>>> separate patch), on the condition that you test libgomp with

>>>>> -foffload=-misa=sm_35.


>>> Adding -foffload=misa=sm_35 didn't work because the host gcc doesn't

>>> support the -misa flag.


>> That doesn't make sense to me. For me this works without any problems.

>> Have you tried a clean build?


> I was incorrectly setting ALWAYS_CFLAGS to use -foffload=-misa=sm_35.

> That didn't work on the host. 


That's the approach I used, and it worked for me without any problems.

Why do you think that approach is incorrect?

Thanks,
- Tom

Patch

Basic -misa support for nvptx

2018-XX-YY  Cesar Philippidis  <cesar@codesourcery.com>
	    Bernd Schmidt  <bernds_cb1@t-online.de>

	gcc/
	* config/nvptx/nvptx-opts.h: New file.
	* config/nvptx/nvptx.c (nvptx_file_start): Print the correct .target.
	* config/nvptx/nvptx.h: Include "nvptx-opts.h".
	(ASM_SPEC): Define.
	(TARGET_SM35): New macro.
	* config/nvptx/nvptx.md (atomic_fetch_<logic><mode>): Enable with the
	correct predicate.
	* config/nvptx/nvptx.opt (ptx_isa, sm_30, sm_35): New enum and its
	values.
	(misa=): New option.
	* doc/invoke.texi (Nvidia PTX Options): Document -misa.

	gcc/testsuite/
	* gcc.target/nvptx/atomic_fetch-1.c: New test.
	* gcc.target/nvptx/atomic_fetch-1.c: New test.


diff --git a/gcc/config/nvptx/nvptx-opts.h b/gcc/config/nvptx/nvptx-opts.h
new file mode 100644
index 00000000000..55d9599917e
--- /dev/null
+++ b/gcc/config/nvptx/nvptx-opts.h
@@ -0,0 +1,30 @@ 
+/* Definitions for the NVPTX port needed for option handling.
+   Copyright (C) 2015-2018 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3, or (at your
+   option) any later version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+   License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef NVPTX_OPTS_H
+#define NVPTX_OPTS_H
+
+enum ptx_isa
+{
+  PTX_ISA_SM30,
+  PTX_ISA_SM35
+};
+
+#endif
+
diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index c0b0a2ec3ab..9903a273863 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -4931,7 +4931,10 @@  nvptx_file_start (void)
 {
   fputs ("// BEGIN PREAMBLE\n", asm_out_file);
   fputs ("\t.version\t3.1\n", asm_out_file);
-  fputs ("\t.target\tsm_30\n", asm_out_file);
+  if (TARGET_SM35)
+    fputs ("\t.target\tsm_35\n", asm_out_file);
+  else
+    fputs ("\t.target\tsm_30\n", asm_out_file);
   fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode));
   fputs ("// END PREAMBLE\n", asm_out_file);
 }
diff --git a/gcc/config/nvptx/nvptx.h b/gcc/config/nvptx/nvptx.h
index dfa1e9aa859..a2fe8b68b22 100644
--- a/gcc/config/nvptx/nvptx.h
+++ b/gcc/config/nvptx/nvptx.h
@@ -21,10 +21,16 @@ 
 #ifndef GCC_NVPTX_H
 #define GCC_NVPTX_H
 
+#ifndef NVPTX_OPTS_H
+#include "config/nvptx/nvptx-opts.h"
+#endif
+
 /* Run-time Target.  */
 
 #define STARTFILE_SPEC "%{mmainkernel:crt0.o}"
 
+#define ASM_SPEC "%{misa=*:-m %*}"
+
 #define TARGET_CPU_CPP_BUILTINS()		\
   do						\
     {						\
@@ -87,6 +93,8 @@ 
 #define Pmode (TARGET_ABI64 ? DImode : SImode)
 #define STACK_SIZE_MODE Pmode
 
+#define TARGET_SM35 (ptx_isa_option >= PTX_ISA_SM35)
+
 /* Registers.  Since ptx is a virtual target, we just define a few
    hard registers for special purposes and leave pseudos unallocated.
    We have to have some available hard registers, to keep gcc setup
diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md
index 2988f5dfa91..ca00b1d8073 100644
--- a/gcc/config/nvptx/nvptx.md
+++ b/gcc/config/nvptx/nvptx.md
@@ -1440,7 +1440,6 @@ 
 (define_code_iterator any_logic [and ior xor])
 (define_code_attr logic [(and "and") (ior "or") (xor "xor")])
 
-;; Currently disabled until we add better subtarget support - requires sm_32.
 (define_insn "atomic_fetch_<logic><mode>"
   [(set (match_operand:SDIM 1 "memory_operand" "+m")
 	(unspec_volatile:SDIM
@@ -1450,7 +1449,7 @@ 
 	  UNSPECV_LOCK))
    (set (match_operand:SDIM 0 "nvptx_register_operand" "=R")
 	(match_dup 1))]
-  "0"
+  "<MODE>mode == SImode || TARGET_SM35"
   "%.\\tatom%A1.b%T0.<logic>\\t%0, %1, %2;"
   [(set_attr "atomic" "true")])
 
diff --git a/gcc/config/nvptx/nvptx.opt b/gcc/config/nvptx/nvptx.opt
index 04277d1d98e..8194c0324d6 100644
--- a/gcc/config/nvptx/nvptx.opt
+++ b/gcc/config/nvptx/nvptx.opt
@@ -48,3 +48,17 @@  Generate code that can keep local state uniform across all lanes.
 mgomp
 Target Report Mask(GOMP)
 Generate code for OpenMP offloading: enables -msoft-stack and -muniform-simt.
+
+Enum
+Name(ptx_isa) Type(int)
+Known PTX ISA versions (for use with the -misa= option):
+
+EnumValue
+Enum(ptx_isa) String(sm_30) Value(PTX_ISA_SM30)
+
+EnumValue
+Enum(ptx_isa) String(sm_35) Value(PTX_ISA_SM35)
+
+misa=
+Target RejectNegative ToLower Joined Enum(ptx_isa) Var(ptx_isa_option) Init(PTX_ISA_SM30)
+Specify the version of the ptx ISA to use.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index e37233d6ed4..95bc7d9e509 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -22456,6 +22456,12 @@  These options are defined for Nvidia PTX:
 @opindex m64
 Generate code for 32-bit or 64-bit ABI.
 
+@item -misa=@var{ISA-string}
+@opindex march
+Generate code for given the specified PTX ISA (e.g.@ @samp{sm_35}).  ISA
+strings must be lower-case.  Valid ISA strings include @samp{sm_30} and
+@samp{sm_35}.  The default ISA is sm_30.
+
 @item -mmainkernel
 @opindex mmainkernel
 Link in code for a __main kernel.  This is for stand-alone instead of
diff --git a/gcc/testsuite/gcc.target/nvptx/atomic-fetch-2.c b/gcc/testsuite/gcc.target/nvptx/atomic-fetch-2.c
new file mode 100644
index 00000000000..1d35a176a62
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nvptx/atomic-fetch-2.c
@@ -0,0 +1,24 @@ 
+/* Test the nvptx atomic instructions for __atomic_fetch_OP for SM_30
+   targets.  */
+
+/* { dg-do compile } */
+/* { dg-options "-O2 -misa=sm_30" } */
+
+int
+main()
+{
+  unsigned long long a = ~0;
+  unsigned b = 0xa;
+
+  __atomic_fetch_add (&a, b, 0);
+  __atomic_fetch_and (&a, b, 0);
+  __atomic_fetch_or (&a, b, 0);
+  __atomic_fetch_xor (&a, b, 0);
+  
+  return a;
+}
+
+/* { dg-final { scan-assembler-not "atom.b64.add" } } */
+/* { dg-final { scan-assembler-not "atom.b64.and" } } */
+/* { dg-final { scan-assembler-not "atom.b64.or" } } */
+/* { dg-final { scan-assembler-not "atom.b64.xor" } } */
diff --git a/gcc/testsuite/gcc.target/nvptx/atomic_fetch-1.c b/gcc/testsuite/gcc.target/nvptx/atomic_fetch-1.c
new file mode 100644
index 00000000000..c637caa79a0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nvptx/atomic_fetch-1.c
@@ -0,0 +1,24 @@ 
+/* Test the nvptx atomic instructions for __atomic_fetch_OP for SM_35
+   targets.  */
+
+/* { dg-do compile } */
+/* { dg-options "-O2 -misa=sm_35" } */
+
+int
+main()
+{
+  unsigned long long a = ~0;
+  unsigned b = 0xa;
+
+  __atomic_fetch_add (&a, b, 0);
+  __atomic_fetch_and (&a, b, 0);
+  __atomic_fetch_or (&a, b, 0);
+  __atomic_fetch_xor (&a, b, 0);
+  
+  return a;
+}
+
+/* { dg-final { scan-assembler "atom.add.u64" } } */
+/* { dg-final { scan-assembler "atom.b64.and" } } */
+/* { dg-final { scan-assembler "atom.b64.or" } } */
+/* { dg-final { scan-assembler "atom.b64.xor" } } */
-- 
2.17.1