[AARCH64] Use STLUR for atomic_store

Message ID DB6PR0801MB20060642A91016A2B20BDADFE02C0@DB6PR0801MB2006.eurprd08.prod.outlook.com

Commit Message

Matthew Malcomson Aug. 2, 2018, 4:26 p.m.
Use the STLUR instruction introduced in Armv8.4-a.
This instruction has the same store-release semantics as STLR but can take a
9-bit unscaled signed immediate offset.

Example test case:
```
#include <stdlib.h>
#include <stdatomic.h>
#include <stdint.h>

void
foo ()
{
    int32_t *atomic_vals = calloc (4, sizeof (int32_t));
    atomic_store_explicit (atomic_vals + 1, 2, memory_order_release);
}
```

Before patch generates
```
foo:
	stp	x29, x30, [sp, -16]!
	mov	x1, 4
	mov	x0, x1
	mov	x29, sp
	bl	calloc
	mov	w1, 2
	add	x0, x0, 4
	stlr	w1, [x0]
	ldp	x29, x30, [sp], 16
	ret
```

After patch generates
```
foo:
	stp	x29, x30, [sp, -16]!
	mov	x1, 4
	mov	x0, x1
	mov	x29, sp
	bl	calloc
	mov	w1, 2
	stlur	w1, [x0, 4]
	ldp	x29, x30, [sp], 16
	ret
```

Full bootstrap and regression test done on aarch64.

Ok for trunk?

gcc/
2018-07-26  Matthew Malcomson  <matthew.malcomson@arm.com>

        * config/aarch64/aarch64-protos.h
        (aarch64_offset_9bit_signed_unscaled_p): New declaration.
        * config/aarch64/aarch64.c
        (aarch64_offset_9bit_signed_unscaled_p): Rename from
        offset_9bit_signed_unscaled_p.
        * config/aarch64/aarch64.h (TARGET_ARMV8_4): Add feature macro.
        * config/aarch64/atomics.md (atomic_store<mode>): Allow offset
        and use stlur.
        * config/aarch64/constraints.md (Ust): New constraint.
        * config/aarch64/predicates.md
        (aarch64_sync_or_stlur_memory_operand): New predicate.

gcc/testsuite/
2018-07-26  Matthew Malcomson  <matthew.malcomson@arm.com>

	* gcc.target/aarch64/atomic-store.c: New.

Comments

Sudakshina Das Aug. 3, 2018, 12:18 p.m. | #1
Hi Matthew

On 02/08/18 17:26, matthew.malcomson@arm.com wrote:
> Use the STLUR instruction introduced in Armv8.4-a.
> This instruction has the same store-release semantics as STLR but can take a
> 9-bit unscaled signed immediate offset.
>
> Example test case:
> ```
> #include <stdlib.h>
> #include <stdatomic.h>
> #include <stdint.h>
>
> void
> foo ()
> {
>      int32_t *atomic_vals = calloc (4, sizeof (int32_t));
>      atomic_store_explicit (atomic_vals + 1, 2, memory_order_release);
> }
> ```
>
> Before patch generates
> ```
> foo:
> 	stp	x29, x30, [sp, -16]!
> 	mov	x1, 4
> 	mov	x0, x1
> 	mov	x29, sp
> 	bl	calloc
> 	mov	w1, 2
> 	add	x0, x0, 4
> 	stlr	w1, [x0]
> 	ldp	x29, x30, [sp], 16
> 	ret
> ```
>
> After patch generates
> ```
> foo:
> 	stp	x29, x30, [sp, -16]!
> 	mov	x1, 4
> 	mov	x0, x1
> 	mov	x29, sp
> 	bl	calloc
> 	mov	w1, 2
> 	stlur	w1, [x0, 4]
> 	ldp	x29, x30, [sp], 16
> 	ret
> ```
>
> Full bootstrap and regression test done on aarch64.
>
> Ok for trunk?
>
> gcc/
> 2018-07-26  Matthew Malcomson  <matthew.malcomson@arm.com>
>
>          * config/aarch64/aarch64-protos.h
>          (aarch64_offset_9bit_signed_unscaled_p): New declaration.
>          * config/aarch64/aarch64.c
>          (aarch64_offset_9bit_signed_unscaled_p): Rename from
>          offset_9bit_signed_unscaled_p.
>          * config/aarch64/aarch64.h (TARGET_ARMV8_4): Add feature macro.
>          * config/aarch64/atomics.md (atomic_store<mode>): Allow offset
>          and use stlur.
>          * config/aarch64/constraints.md (Ust): New constraint.
>          * config/aarch64/predicates.md
>          (aarch64_sync_or_stlur_memory_operand): New predicate.
>
> gcc/testsuite/
> 2018-07-26  Matthew Malcomson  <matthew.malcomson@arm.com>
>
> 	* gcc.target/aarch64/atomic-store.c: New.

Thank you for doing this. I am not a maintainer but I have a few nits on
this patch:

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index af5db9c595385f7586692258f750b6aceb3ed9c8..630a75bf776fcdc374aa9ffa4bb020fea3719320 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -393,6 +393,7 @@ void aarch64_split_add_offset (scalar_int_mode, rtx, rtx, rtx, rtx, rtx);
  bool aarch64_mov_operand_p (rtx, machine_mode);
...
-static inline bool
-offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
+bool
+aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
  			       poly_int64 offset)

This needs to be aligned with the first argument.
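
That is, the continuation line should line up under the first parameter, e.g.:

bool
aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
				       poly_int64 offset)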

...

@@ -5837,7 +5837,7 @@ aarch64_classify_address (struct aarch64_address_info *info,
  	     ldr/str instructions (only big endian will get here).  */
  	  if (mode == CImode)
  	    return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
-		    && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
+		    && (aarch64_offset_9bit_signed_unscaled_p (V16QImode, offset + 32)

This line is longer than 80 characters.
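
For example, the new call could be wrapped as:

-		    && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
+		    && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
+							       offset + 32)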

...

+;; STLUR instruction constraint requires Armv8.4
+(define_special_memory_constraint "Ust"
+ "@internal
+ A memory address suitable for use with an stlur instruction."
+  (and (match_operand 0 "aarch64_sync_or_stlur_memory_operand")
+       (match_test "TARGET_ARMV8_4")))
+

You are already checking for TARGET_ARMV8_4 inside
aarch64_sync_or_stlur_memory_operand. Also see my comment below for this
function.

...

+;; True if the operand is memory reference valid for one of a str or stlur
+;; operation.
+(define_predicate "aarch64_sync_or_stlur_memory_operand"
+  (ior (match_operand 0 "aarch64_sync_memory_operand")
+       (and (match_operand 0 "memory_operand")
+	    (match_code "plus" "0")
+	    (match_code "reg" "00")
+	    (match_code "const_int" "01")))
+{
+  if (aarch64_sync_memory_operand (op, mode))
+    return true;
+
+  if (!TARGET_ARMV8_4)
+    return false;
+
+  rtx mem_op = XEXP (op, 0);
+  rtx plus_op0 = XEXP (mem_op, 0);
+  rtx plus_op1 = XEXP (mem_op, 1);
+
+  if (GET_MODE (plus_op0) != DImode)
+    return false;
+
+  poly_int64 offset;
+  poly_int_rtx_p (plus_op1, &offset);
+  return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
+})
+

This predicate body mixes up the two types of operands that you want to
test, especially when looking at it from the constraint-check
perspective. I am assuming you would not want to use the non-immediate
form of stlur and instead only use it in the form:
STLUR <Wt>, [<Xn|SP>, #<simm>]
and use stlr for the no-immediate alternative. Thus the constraint does
not need to check for aarch64_sync_memory_operand.

My suggestion would be to make this operand check separate. Something
like:

+(define_predicate "aarch64_sync_or_stlur_memory_operand"
+  (ior (match_operand 0 "aarch64_sync_memory_operand")
+       (match_operand 0 "aarch64_stlur_memory_operand")))

Where you define aarch64_stlur_memory_operand as

+bool
+aarch64_stlur_memory_operand (rtx op, machine_mode mode)
+{
+  if (!TARGET_ARMV8_4)
+    return false;
+
+  rtx mem_op = XEXP (op, 0);
+  rtx plus_op0 = XEXP (mem_op, 0);
+  rtx plus_op1 = XEXP (mem_op, 1);
+
+  if (GET_MODE (plus_op0) != DImode)
+    return false;
+
+  poly_int64 offset;
+  poly_int_rtx_p (plus_op1, &offset);
+  return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
+}

and use the same for the constraint.

+(define_special_memory_constraint "Ust"
+ "@internal
+ A memory address suitable for use with an stlur instruction."
+  (match_operand 0 "aarch64_stlur_memory_operand"))

...

diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-store.c b/gcc/testsuite/gcc.target/aarch64/atomic-store.c
new file mode 100644
index 0000000000000000000000000000000000000000..0ffc88029bbfe748cd0fc3b6ff4fdd47373b0bb5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/atomic-store.c
...
+void
+foo_toolarge_offset ()
+{
+  int64_t *atomic_vals = calloc (4, sizeof (int64_t));
+  /* 9bit signed unscaled immediate => largest representable value +255.
+     32 * 8 == 256  */
+  atomic_store_explicit (atomic_vals + 32, 2, memory_order_release);
+}

Maybe it's a good idea to check for the largest representable value and
the smallest representable value? Update this test to
foo_boundary_offset () and check both sides of the boundary, as sketched
below?
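
One possible sketch (illustrative only; hypothetical function name, using
int8_t so the element offsets map directly onto the 9-bit signed unscaled
range of -256..+255):

void
foo_boundary_offset (int8_t *atomic_vals)
{
  /* Extremes of the representable range: both should emit stlur.  */
  atomic_store_explicit (atomic_vals + 255, 2, memory_order_release);
  atomic_store_explicit (atomic_vals - 256, 2, memory_order_release);
  /* Just outside the range: both should fall back to stlr.  */
  atomic_store_explicit (atomic_vals + 256, 2, memory_order_release);
  atomic_store_explicit (atomic_vals - 257, 2, memory_order_release);
}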

Thanks
Sudi
Richard Earnshaw (lists) Aug. 3, 2018, 2:56 p.m. | #2
On 02/08/18 17:26, matthew.malcomson@arm.com wrote:
> Use the STLUR instruction introduced in Armv8.4-a.
> This instruction has the same store-release semantics as STLR but can take a
> 9-bit unscaled signed immediate offset.
>
> Example test case:
> ```
> #include <stdlib.h>
> #include <stdatomic.h>
> #include <stdint.h>
>
> void
> foo ()
> {
>     int32_t *atomic_vals = calloc (4, sizeof (int32_t));
>     atomic_store_explicit (atomic_vals + 1, 2, memory_order_release);
> }
> ```
>
> Before patch generates
> ```
> foo:
> 	stp	x29, x30, [sp, -16]!
> 	mov	x1, 4
> 	mov	x0, x1
> 	mov	x29, sp
> 	bl	calloc
> 	mov	w1, 2
> 	add	x0, x0, 4
> 	stlr	w1, [x0]
> 	ldp	x29, x30, [sp], 16
> 	ret
> ```
>
> After patch generates
> ```
> foo:
> 	stp	x29, x30, [sp, -16]!
> 	mov	x1, 4
> 	mov	x0, x1
> 	mov	x29, sp
> 	bl	calloc
> 	mov	w1, 2
> 	stlur	w1, [x0, 4]
> 	ldp	x29, x30, [sp], 16
> 	ret
> ```
>
> Full bootstrap and regression test done on aarch64.
>
> Ok for trunk?
>
> gcc/
> 2018-07-26  Matthew Malcomson  <matthew.malcomson@arm.com>
>
>         * config/aarch64/aarch64-protos.h
>         (aarch64_offset_9bit_signed_unscaled_p): New declaration.
>         * config/aarch64/aarch64.c
>         (aarch64_offset_9bit_signed_unscaled_p): Rename from
>         offset_9bit_signed_unscaled_p.
>         * config/aarch64/aarch64.h (TARGET_ARMV8_4): Add feature macro.
>         * config/aarch64/atomics.md (atomic_store<mode>): Allow offset
>         and use stlur.
>         * config/aarch64/constraints.md (Ust): New constraint.
>         * config/aarch64/predicates.md
>         (aarch64_sync_or_stlur_memory_operand): New predicate.


Thinking further ahead, there were several new instructions added that
have the same basic form; not all of them are STLUR.  At some point I
expect we will want to add support for those into GCC.  So I think you
should consider a more generic name, perhaps
aarch64_sync_offset_memory_operand?

R.

Matthew Malcomson Aug. 6, 2018, 4:36 p.m. | #3
On 03/08/18 13:18, Sudakshina Das wrote:
> Thank you for doing this. I am not a maintainer but I have a few nits on
> this patch:

Thanks for the suggestions Sudi: the nits have been removed, and a new
patch is attached.

I have also removed the reference to "stlur" in the names, as Richard
suggested.

New changelog:

gcc/
2018-08-06  Matthew Malcomson  <matthew.malcomson@arm.com>

     * config/aarch64/aarch64-protos.h
     (aarch64_offset_9bit_signed_unscaled_p): New declaration.
     * config/aarch64/aarch64.c
     (aarch64_offset_9bit_signed_unscaled_p): Rename from
     offset_9bit_signed_unscaled_p.
     * config/aarch64/aarch64.h (TARGET_ARMV8_4): Add feature macro.
     * config/aarch64/atomics.md (atomic_store<mode>): Allow offset
     and use stlur.
     * config/aarch64/constraints.md (Ust): New constraint.
     * config/aarch64/predicates.md
     (aarch64_sync_or_offset_memory_operand): New predicate.
     (aarch64_armv84_offset_memory_operand): New predicate.

gcc/testsuite/
2018-08-06  Matthew Malcomson  <matthew.malcomson@arm.com>

     * gcc.target/aarch64/atomic-store.c: New.


Regards,
Matthew
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index ef95fc829b83886e2ff00e4664e31af916e99b0c..7a6254e46893fb36dc2ae57e7cfe78af67fb0e49 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -393,6 +393,7 @@ void aarch64_split_add_offset (scalar_int_mode, rtx, rtx, rtx, rtx, rtx);
 bool aarch64_mov_operand_p (rtx, machine_mode);
 rtx aarch64_reverse_mask (machine_mode, unsigned int);
 bool aarch64_offset_7bit_signed_scaled_p (machine_mode, poly_int64);
+bool aarch64_offset_9bit_signed_unscaled_p (machine_mode, poly_int64);
 char *aarch64_output_sve_cnt_immediate (const char *, const char *, rtx);
 char *aarch64_output_sve_addvl_addpl (rtx, rtx, rtx);
 char *aarch64_output_sve_inc_dec_immediate (const char *, rtx);
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index c1218503bab19323eee1cca8b7e4bea8fbfcf573..328512e11f4230e24223bc51e55bdca8b31f6a20 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -237,6 +237,9 @@ extern unsigned aarch64_architecture_version;
 /* ARMv8.3-A features.  */
 #define TARGET_ARMV8_3	(AARCH64_ISA_V8_3)
 
+/* ARMv8.4-A features.  */
+#define TARGET_ARMV8_4	(AARCH64_ISA_V8_4)
+
 /* Make sure this is always defined so we don't have to check for ifdefs
    but rather use normal ifs.  */
 #ifndef TARGET_FIX_ERR_A53_835769_DEFAULT
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 13b5448aca88555222481f0955237b6fdcbb38b9..b9f3759ee67bbcf8a536cb5c334162922ae4cfd0 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -4470,9 +4470,9 @@ aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
 
 /* Return true if OFFSET is a signed 9-bit value.  */
 
-static inline bool
-offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
-			       poly_int64 offset)
+bool
+aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
+				       poly_int64 offset)
 {
   HOST_WIDE_INT const_offset;
   return (offset.is_constant (&const_offset)
@@ -5747,7 +5747,7 @@ aarch64_classify_address (struct aarch64_address_info *info,
 	     instruction memory accesses.  */
 	  if (mode == TImode || mode == TFmode)
 	    return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
-		    && (offset_9bit_signed_unscaled_p (mode, offset)
+		    && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
 			|| offset_12bit_unsigned_scaled_p (mode, offset)));
 
 	  /* A 7bit offset check because OImode will emit a ldp/stp
@@ -5761,7 +5761,8 @@ aarch64_classify_address (struct aarch64_address_info *info,
 	     ldr/str instructions (only big endian will get here).  */
 	  if (mode == CImode)
 	    return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
-		    && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
+		    && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
+							       offset + 32)
 			|| offset_12bit_unsigned_scaled_p (V16QImode,
 							   offset + 32)));
 
@@ -5801,7 +5802,7 @@ aarch64_classify_address (struct aarch64_address_info *info,
 		     || known_eq (GET_MODE_SIZE (mode), 16))
 		    && aarch64_offset_7bit_signed_scaled_p (mode, offset));
 	  else
-	    return (offset_9bit_signed_unscaled_p (mode, offset)
+	    return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
 		    || offset_12bit_unsigned_scaled_p (mode, offset));
 	}
 
@@ -5854,7 +5855,7 @@ aarch64_classify_address (struct aarch64_address_info *info,
 	   */
 	  if (mode == TImode || mode == TFmode)
 	    return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
-		    && offset_9bit_signed_unscaled_p (mode, offset));
+		    && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
 
 	  if (load_store_pair_p)
 	    return ((known_eq (GET_MODE_SIZE (mode), 4)
@@ -5862,7 +5863,7 @@ aarch64_classify_address (struct aarch64_address_info *info,
 		     || known_eq (GET_MODE_SIZE (mode), 16))
 		    && aarch64_offset_7bit_signed_scaled_p (mode, offset));
 	  else
-	    return offset_9bit_signed_unscaled_p (mode, offset);
+	    return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
 	}
       return false;
 
diff --git a/gcc/config/aarch64/atomics.md b/gcc/config/aarch64/atomics.md
index 36c06756a1f94cadae097b3aad654fbeba1cf2f3..41b9845db00fccb3781d91cb3b95680b5c51eb11 100644
--- a/gcc/config/aarch64/atomics.md
+++ b/gcc/config/aarch64/atomics.md
@@ -481,9 +481,9 @@
 )
 
 (define_insn "atomic_store<mode>"
-  [(set (match_operand:ALLI 0 "aarch64_sync_memory_operand" "=Q")
+  [(set (match_operand:ALLI 0 "aarch64_sync_or_offset_memory_operand" "=Q,Ust")
     (unspec_volatile:ALLI
-      [(match_operand:ALLI 1 "general_operand" "rZ")
+      [(match_operand:ALLI 1 "general_operand" "rZ,rZ")
        (match_operand:SI 2 "const_int_operand")]			;; model
       UNSPECV_STL))]
   ""
@@ -491,8 +491,10 @@
     enum memmodel model = memmodel_from_int (INTVAL (operands[2]));
     if (is_mm_relaxed (model) || is_mm_consume (model) || is_mm_acquire (model))
       return "str<atomic_sfx>\t%<w>1, %0";
-    else
+    else if (which_alternative == 0)
       return "stlr<atomic_sfx>\t%<w>1, %0";
+    else
+      return "stlur<atomic_sfx>\t%<w>1, %0";
   }
 )
 
diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
index 72cacdabdac52dcb40b480f7a5bfbf4997c742d8..97e4aa6be606c7134efa433489615767bd847b5e 100644
--- a/gcc/config/aarch64/constraints.md
+++ b/gcc/config/aarch64/constraints.md
@@ -218,6 +218,11 @@
  (and (match_code "mem")
       (match_test "REG_P (XEXP (op, 0))")))
 
+(define_special_memory_constraint "Ust"
+ "@internal
+ A memory address with 9bit unscaled offset for an Armv8.4-a instruction."
+  (match_operand 0 "aarch64_armv84_offset_memory_operand"))
+
 (define_memory_constraint "Ump"
   "@internal
   A memory address suitable for a load/store pair operation."
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index d8f377b9603e76a29dd92f95e9905121eaf7b800..04f2aee3ed891a678ee471d59f34ad085eb9fed9 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -359,6 +359,34 @@
   (and (match_operand 0 "memory_operand")
        (match_code "reg" "0")))
 
+(define_predicate "aarch64_armv84_offset_memory_operand"
+ (and (match_operand 0 "memory_operand")
+      (match_code "plus" "0")
+      (match_code "reg" "00")
+      (match_code "const_int" "01"))
+{
+  if (!TARGET_ARMV8_4)
+    return false;
+
+  rtx mem_op = XEXP (op, 0);
+  rtx plus_op0 = XEXP (mem_op, 0);
+  rtx plus_op1 = XEXP (mem_op, 1);
+
+  if (GET_MODE (plus_op0) != DImode)
+    return false;
+
+  poly_int64 offset;
+  poly_int_rtx_p (plus_op1, &offset);
+  return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
+})
+
+;; True if the operand is memory reference valid for one of a str or stlur
+;; operation.
+(define_predicate "aarch64_sync_or_offset_memory_operand"
+  (ior (match_operand 0 "aarch64_sync_memory_operand")
+       (match_operand 0 "aarch64_armv84_offset_memory_operand")))
+
+
 ;; Predicates for parallel expanders based on mode.
 (define_special_predicate "vect_par_cnst_hi_half"
   (match_code "parallel")
diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-store.c b/gcc/testsuite/gcc.target/aarch64/atomic-store.c
new file mode 100644
index 0000000000000000000000000000000000000000..0ffc88029bbfe748cd0fc3b6ff4fdd47373b0bb5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/atomic-store.c
@@ -0,0 +1,72 @@
+/* { dg-do compile } */
+/* { dg-options "-march=armv8.4-a -O2" } */
+
+#include <stdlib.h>
+#include <stdatomic.h>
+#include <stdint.h>
+
+#define STORE_TESTS(size) \
+  void \
+  foo##size () \
+{ \
+  int##size##_t *atomic_vals = calloc (4, sizeof (int##size##_t)); \
+  atomic_store_explicit (atomic_vals, 2, memory_order_relaxed); \
+  atomic_store_explicit (atomic_vals, 2, memory_order_release); \
+  atomic_store_explicit ((atomic_vals + 1), 2, memory_order_release); \
+  atomic_store ((atomic_vals + 2), 2); \
+  atomic_store_explicit ((atomic_vals + 3), 2, memory_order_relaxed); \
+}
+
+STORE_TESTS (8);
+/* { dg-final { scan-assembler-times "strb\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlrb\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+, 1\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+, 2\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "strb\tw\[0-9\]+, \\\[x\[0-9\]+, 3\\\]" 1 } } */
+
+STORE_TESTS (16);
+/* { dg-final { scan-assembler-times "strh\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlrh\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlurh\tw\[0-9\]+, \\\[x\[0-9\]+, 2\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlurh\tw\[0-9\]+, \\\[x\[0-9\]+, 4\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "strh\tw\[0-9\]+, \\\[x\[0-9\]+, 6\\\]" 1 } } */
+
+STORE_TESTS (32);
+/* { dg-final { scan-assembler-times "str\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlr\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlur\tw\[0-9\]+, \\\[x\[0-9\]+, 4\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlur\tw\[0-9\]+, \\\[x\[0-9\]+, 8\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "str\tw\[0-9\]+, \\\[x\[0-9\]+, 12\\\]" 1 } } */
+
+STORE_TESTS (64);
+/* { dg-final { scan-assembler-times "str\tx\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlur\tx\[0-9\]+, \\\[x\[0-9\]+, 8\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlur\tx\[0-9\]+, \\\[x\[0-9\]+, 16\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "str\tx\[0-9\]+, \\\[x\[0-9\]+, 24\\\]" 1 } } */
+
+void
+foo_toolarge_offset ()
+{
+  int64_t *atomic_vals = calloc (4, sizeof (int64_t));
+  /* 9bit signed unscaled immediate => largest representable value +255.
+     32 * 8 == 256  */
+  atomic_store_explicit (atomic_vals + 32, 2, memory_order_release);
+}
+
+void
+foo_negative (int8_t *atomic_vals)
+{
+  atomic_store_explicit (atomic_vals - 2, 2, memory_order_release);
+}
+/* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+, -2\\\]" 1 } } */
+
+#pragma GCC target ("arch=armv8.3-a")
+void
+foo_older_arch ()
+{
+  int64_t *atomic_vals = calloc (4, sizeof (int64_t));
+  atomic_store_explicit (atomic_vals + 2, 2, memory_order_release);
+}
+
+/* Three times, one for each of the three above functions.  */
+/* { dg-final { scan-assembler-times "stlr\tx\[0-9\]+, \\\[x\[0-9\]+\\\]" 3 } } */
Matthew Malcomson Aug. 7, 2018, 3:46 p.m. | #4
Hello everyone,

This is an updated patch where I've made what was a predicate into a 
pure C function based on some feedback outside the mailing list.

Ok for trunk?

Matthew

gcc/
2018-08-07  Matthew Malcomson  <matthew.malcomson@arm.com>

         * config/aarch64/aarch64-protos.h
         (aarch64_armv8_4_offset_memory_operand): New declaration.
         * config/aarch64/aarch64.c
         (aarch64_armv8_4_offset_memory_operand): New.
         * config/aarch64/aarch64.h (TARGET_ARMV8_4): Add feature macro.
         * config/aarch64/atomics.md (atomic_store<mode>): Allow offset
         and use stlur.
         * config/aarch64/constraints.md (Ust): New constraint.
        * config/aarch64/predicates.md
         (aarch64_sync_or_offset_memory_operand): New predicate.

gcc/testsuite/
2018-08-07  Matthew Malcomson  <matthew.malcomson@arm.com>

     * gcc.target/aarch64/atomic-store.c: New.



###############     Attachment also inlined for ease of reply    ###############


diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index ef95fc829b83886e2ff00e4664e31af916e99b0c..8a1c1eac75ad486777804cec9c313c49e129cc4d 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -393,6 +393,7 @@ void aarch64_split_add_offset (scalar_int_mode, rtx, rtx, rtx, rtx, rtx);
 bool aarch64_mov_operand_p (rtx, machine_mode);
 rtx aarch64_reverse_mask (machine_mode, unsigned int);
 bool aarch64_offset_7bit_signed_scaled_p (machine_mode, poly_int64);
+bool aarch64_armv8_4_offset_memory_operand (rtx, machine_mode);
 char *aarch64_output_sve_cnt_immediate (const char *, const char *, rtx);
 char *aarch64_output_sve_addvl_addpl (rtx, rtx, rtx);
 char *aarch64_output_sve_inc_dec_immediate (const char *, rtx);
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index c1218503bab19323eee1cca8b7e4bea8fbfcf573..328512e11f4230e24223bc51e55bdca8b31f6a20 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -237,6 +237,9 @@ extern unsigned aarch64_architecture_version;
 /* ARMv8.3-A features.  */
 #define TARGET_ARMV8_3	(AARCH64_ISA_V8_3)
 
+/* ARMv8.4-A features.  */
+#define TARGET_ARMV8_4	(AARCH64_ISA_V8_4)
+
 /* Make sure this is always defined so we don't have to check for ifdefs
    but rather use normal ifs.  */
 #ifndef TARGET_FIX_ERR_A53_835769_DEFAULT
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 13b5448aca88555222481f0955237b6fdcbb38b9..607c4f8fc4786857db8f4c2848df18035ef42495 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -4501,6 +4501,38 @@ offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
 	  && IN_RANGE (multiple, 0, 4095));
 }
 
+/* Return true if the rtx describes a memory operand consisting of a DImode
+   register offset with a 9 bit signed unscaled constant and we're targeting
+   Armv8.4.
+   This function created to test for a case where the STLUR instruction will be
+   used.  */
+bool
+aarch64_armv8_4_offset_memory_operand (rtx op, machine_mode mode)
+{
+  if (!TARGET_ARMV8_4)
+    return false;
+
+  if (!MEM_P (op))
+    return false;
+  rtx mem_op = XEXP (op, 0);
+
+  if (GET_CODE (mem_op) != PLUS)
+    return false;
+  rtx plus_op0 = XEXP (mem_op, 0);
+  rtx plus_op1 = XEXP (mem_op, 1);
+
+  /* STLUR instruction requires DImode register.  */
+  if (GET_MODE (plus_op0) != DImode
+      || !REG_P (plus_op0))
+    return false;
+
+  poly_int64 offset;
+  if (!poly_int_rtx_p (plus_op1, &offset))
+    return false;
+
+  return offset_9bit_signed_unscaled_p (mode, offset);
+}
+
 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS.  */
 
 static sbitmap
diff --git a/gcc/config/aarch64/atomics.md b/gcc/config/aarch64/atomics.md
index 36c06756a1f94cadae097b3aad654fbeba1cf2f3..41b9845db00fccb3781d91cb3b95680b5c51eb11 100644
--- a/gcc/config/aarch64/atomics.md
+++ b/gcc/config/aarch64/atomics.md
@@ -481,9 +481,9 @@
 )
 
 (define_insn "atomic_store<mode>"
-  [(set (match_operand:ALLI 0 "aarch64_sync_memory_operand" "=Q")
+  [(set (match_operand:ALLI 0 "aarch64_sync_or_offset_memory_operand" "=Q,Ust")
     (unspec_volatile:ALLI
-      [(match_operand:ALLI 1 "general_operand" "rZ")
+      [(match_operand:ALLI 1 "general_operand" "rZ,rZ")
        (match_operand:SI 2 "const_int_operand")]			;; model
       UNSPECV_STL))]
   ""
@@ -491,8 +491,10 @@
     enum memmodel model = memmodel_from_int (INTVAL (operands[2]));
     if (is_mm_relaxed (model) || is_mm_consume (model) || is_mm_acquire (model))
       return "str<atomic_sfx>\t%<w>1, %0";
-    else
+    else if (which_alternative == 0)
       return "stlr<atomic_sfx>\t%<w>1, %0";
+    else
+      return "stlur<atomic_sfx>\t%<w>1, %0";
   }
 )
 
diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
index 72cacdabdac52dcb40b480f7a5bfbf4997c742d8..40cc2f0143221aa2cd9ee0e5abb79640467ab03e 100644
--- a/gcc/config/aarch64/constraints.md
+++ b/gcc/config/aarch64/constraints.md
@@ -218,6 +218,11 @@
  (and (match_code "mem")
       (match_test "REG_P (XEXP (op, 0))")))
 
+(define_special_memory_constraint "Ust"
+ "@internal
+ A memory address with 9bit unscaled offset for an Armv8.4-a instruction."
+  (match_test "aarch64_armv8_4_offset_memory_operand (op, mode)"))
+
 (define_memory_constraint "Ump"
   "@internal
   A memory address suitable for a load/store pair operation."
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index d8f377b9603e76a29dd92f95e9905121eaf7b800..c36ca9f1c2dfdaec0f7bf0a363294001828a32d0 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -359,6 +359,14 @@
   (and (match_operand 0 "memory_operand")
        (match_code "reg" "0")))
 
+;; True if the operand is memory reference valid for one of a stlr or stlur
+;; operation.
+(define_predicate "aarch64_sync_or_offset_memory_operand"
+  (ior (match_operand 0 "aarch64_sync_memory_operand")
+       (and (match_operand 0 "memory_operand")
+	    (match_test "aarch64_armv8_4_offset_memory_operand (op, mode)"))))
+
+
 ;; Predicates for parallel expanders based on mode.
 (define_special_predicate "vect_par_cnst_hi_half"
   (match_code "parallel")
diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-store.c b/gcc/testsuite/gcc.target/aarch64/atomic-store.c
new file mode 100644
index 0000000000000000000000000000000000000000..7a9a9ff854a670977f307358e80ff60b8b329037
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/atomic-store.c
@@ -0,0 +1,74 @@
+/* { dg-do compile } */
+/* { dg-options "-march=armv8.4-a -O2" } */
+
+#include <stdlib.h>
+#include <stdatomic.h>
+#include <stdint.h>
+
+#define STORE_TESTS(size) \
+  void \
+  foo##size () \
+{ \
+  int##size##_t *atomic_vals = calloc (4, sizeof (int##size##_t)); \
+  atomic_store_explicit (atomic_vals, 2, memory_order_relaxed); \
+  atomic_store_explicit (atomic_vals, 2, memory_order_release); \
+  atomic_store_explicit ((atomic_vals + 1), 2, memory_order_release); \
+  atomic_store ((atomic_vals + 2), 2); \
+  atomic_store_explicit ((atomic_vals + 3), 2, memory_order_relaxed); \
+}
+
+STORE_TESTS (8);
+/* { dg-final { scan-assembler-times "strb\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlrb\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+, 1\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+, 2\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "strb\tw\[0-9\]+, \\\[x\[0-9\]+, 3\\\]" 1 } } */
+
+STORE_TESTS (16);
+/* { dg-final { scan-assembler-times "strh\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlrh\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlurh\tw\[0-9\]+, \\\[x\[0-9\]+, 2\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlurh\tw\[0-9\]+, \\\[x\[0-9\]+, 4\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "strh\tw\[0-9\]+, \\\[x\[0-9\]+, 6\\\]" 1 } } */
+
+STORE_TESTS (32);
+/* { dg-final { scan-assembler-times "str\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlr\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlur\tw\[0-9\]+, \\\[x\[0-9\]+, 4\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlur\tw\[0-9\]+, \\\[x\[0-9\]+, 8\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "str\tw\[0-9\]+, \\\[x\[0-9\]+, 12\\\]" 1 } } */
+
+STORE_TESTS (64);
+/* { dg-final { scan-assembler-times "str\tx\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlur\tx\[0-9\]+, \\\[x\[0-9\]+, 8\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlur\tx\[0-9\]+, \\\[x\[0-9\]+, 16\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "str\tx\[0-9\]+, \\\[x\[0-9\]+, 24\\\]" 1 } } */
+
+void
+foo_toolarge_offset ()
+{
+  int64_t *atomic_vals = calloc (4, sizeof (int64_t));
+  /* 9bit signed unscaled immediate =>
+	largest representable value +255.
+	smallest representable value -256.  */
+  atomic_store_explicit (atomic_vals + 32, 2, memory_order_release);
+  atomic_store_explicit (atomic_vals - 33, 2, memory_order_release);
+}
+
+void
+foo_negative (int8_t *atomic_vals)
+{
+  atomic_store_explicit (atomic_vals - 2, 2, memory_order_release);
+}
+/* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+, -2\\\]" 1 } } */
+
+#pragma GCC target ("arch=armv8.3-a")
+void
+foo_older_arch ()
+{
+  int64_t *atomic_vals = calloc (4, sizeof (int64_t));
+  atomic_store_explicit (atomic_vals + 2, 2, memory_order_release);
+}
+
+/* Four times: one for foo64, two for foo_toolarge_offset, and one for
+   foo_older_arch.  */
+/* { dg-final { scan-assembler-times "stlr\tx\[0-9\]+, \\\[x\[0-9\]+\\\]" 4 } } */

Patch

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index af5db9c595385f7586692258f750b6aceb3ed9c8..630a75bf776fcdc374aa9ffa4bb020fea3719320 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -393,6 +393,7 @@  void aarch64_split_add_offset (scalar_int_mode, rtx, rtx, rtx, rtx, rtx);
 bool aarch64_mov_operand_p (rtx, machine_mode);
 rtx aarch64_reverse_mask (machine_mode, unsigned int);
 bool aarch64_offset_7bit_signed_scaled_p (machine_mode, poly_int64);
+bool aarch64_offset_9bit_signed_unscaled_p (machine_mode, poly_int64);
 char *aarch64_output_sve_cnt_immediate (const char *, const char *, rtx);
 char *aarch64_output_sve_addvl_addpl (rtx, rtx, rtx);
 char *aarch64_output_sve_inc_dec_immediate (const char *, rtx);
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index c1218503bab19323eee1cca8b7e4bea8fbfcf573..328512e11f4230e24223bc51e55bdca8b31f6a20 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -237,6 +237,9 @@  extern unsigned aarch64_architecture_version;
 /* ARMv8.3-A features.  */
 #define TARGET_ARMV8_3	(AARCH64_ISA_V8_3)
 
+/* ARMv8.4-A features.  */
+#define TARGET_ARMV8_4	(AARCH64_ISA_V8_4)
+
 /* Make sure this is always defined so we don't have to check for ifdefs
    but rather use normal ifs.  */
 #ifndef TARGET_FIX_ERR_A53_835769_DEFAULT
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index fa01475aa9ee579b6a3b2526295b622157120660..8560150d552b4f830335ccd420a0749f01d4bb6a 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -4546,8 +4546,8 @@  aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
 
 /* Return true if OFFSET is a signed 9-bit value.  */
 
-static inline bool
-offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
+bool
+aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
 			       poly_int64 offset)
 {
   HOST_WIDE_INT const_offset;
@@ -5823,7 +5823,7 @@  aarch64_classify_address (struct aarch64_address_info *info,
 	     instruction memory accesses.  */
 	  if (mode == TImode || mode == TFmode)
 	    return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
-		    && (offset_9bit_signed_unscaled_p (mode, offset)
+		    && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
 			|| offset_12bit_unsigned_scaled_p (mode, offset)));
 
 	  /* A 7bit offset check because OImode will emit a ldp/stp
@@ -5837,7 +5837,7 @@  aarch64_classify_address (struct aarch64_address_info *info,
 	     ldr/str instructions (only big endian will get here).  */
 	  if (mode == CImode)
 	    return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
-		    && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
+		    && (aarch64_offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
 			|| offset_12bit_unsigned_scaled_p (V16QImode,
 							   offset + 32)));
 
@@ -5877,7 +5877,7 @@  aarch64_classify_address (struct aarch64_address_info *info,
 		     || known_eq (GET_MODE_SIZE (mode), 16))
 		    && aarch64_offset_7bit_signed_scaled_p (mode, offset));
 	  else
-	    return (offset_9bit_signed_unscaled_p (mode, offset)
+	    return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
 		    || offset_12bit_unsigned_scaled_p (mode, offset));
 	}
 
@@ -5930,7 +5930,7 @@  aarch64_classify_address (struct aarch64_address_info *info,
 	   */
 	  if (mode == TImode || mode == TFmode)
 	    return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
-		    && offset_9bit_signed_unscaled_p (mode, offset));
+		    && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
 
 	  if (load_store_pair_p)
 	    return ((known_eq (GET_MODE_SIZE (mode), 4)
@@ -5938,7 +5938,7 @@  aarch64_classify_address (struct aarch64_address_info *info,
 		     || known_eq (GET_MODE_SIZE (mode), 16))
 		    && aarch64_offset_7bit_signed_scaled_p (mode, offset));
 	  else
-	    return offset_9bit_signed_unscaled_p (mode, offset);
+	    return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
 	}
       return false;
 
diff --git a/gcc/config/aarch64/atomics.md b/gcc/config/aarch64/atomics.md
index cd8c44e20ab0900481373193ec4b7fda86aadfe5..95d96261dcb1173088ff9debd6cb5aae3da010aa 100644
--- a/gcc/config/aarch64/atomics.md
+++ b/gcc/config/aarch64/atomics.md
@@ -481,9 +481,9 @@ 
 )
 
 (define_insn "atomic_store<mode>"
-  [(set (match_operand:ALLI 0 "aarch64_sync_memory_operand" "=Q")
+  [(set (match_operand:ALLI 0 "aarch64_sync_or_stlur_memory_operand" "=Q,Ust")
     (unspec_volatile:ALLI
-      [(match_operand:ALLI 1 "general_operand" "rZ")
+      [(match_operand:ALLI 1 "general_operand" "rZ,rZ")
        (match_operand:SI 2 "const_int_operand")]			;; model
       UNSPECV_STL))]
   ""
@@ -491,11 +491,14 @@ 
     enum memmodel model = memmodel_from_int (INTVAL (operands[2]));
     if (is_mm_relaxed (model) || is_mm_consume (model) || is_mm_acquire (model))
       return "str<atomic_sfx>\t%<w>1, %0";
-    else
+    else if (which_alternative == 0)
       return "stlr<atomic_sfx>\t%<w>1, %0";
+    else
+      return "stlur<atomic_sfx>\t%<w>1, %0";
   }
 )
 
+
 (define_insn "aarch64_load_exclusive<mode>"
   [(set (match_operand:SI 0 "register_operand" "=r")
     (zero_extend:SI
diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
index 72cacdabdac52dcb40b480f7a5bfbf4997c742d8..2b2192d8d0a15ef7157915b2c045e953067bce63 100644
--- a/gcc/config/aarch64/constraints.md
+++ b/gcc/config/aarch64/constraints.md
@@ -218,6 +218,13 @@ 
  (and (match_code "mem")
       (match_test "REG_P (XEXP (op, 0))")))
 
+;; STLUR instruction constraint requires Armv8.4
+(define_special_memory_constraint "Ust"
+ "@internal
+ A memory address suitable for use with an stlur instruction."
+  (and (match_operand 0 "aarch64_sync_or_stlur_memory_operand")
+       (match_test "TARGET_ARMV8_4")))
+
 (define_memory_constraint "Ump"
   "@internal
   A memory address suitable for a load/store pair operation."
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index d8f377b9603e76a29dd92f95e9905121eaf7b800..1df2f914ebc87841671c0ebb097d4603e3cecd41 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -359,6 +359,33 @@ 
   (and (match_operand 0 "memory_operand")
        (match_code "reg" "0")))
 
+;; True if the operand is memory reference valid for one of a str or stlur
+;; operation.
+(define_predicate "aarch64_sync_or_stlur_memory_operand"
+  (ior (match_operand 0 "aarch64_sync_memory_operand")
+       (and (match_operand 0 "memory_operand")
+	    (match_code "plus" "0")
+	    (match_code "reg" "00")
+	    (match_code "const_int" "01")))
+{
+  if (aarch64_sync_memory_operand (op, mode))
+    return true;
+
+  if (!TARGET_ARMV8_4)
+    return false;
+
+  rtx mem_op = XEXP (op, 0);
+  rtx plus_op0 = XEXP (mem_op, 0);
+  rtx plus_op1 = XEXP (mem_op, 1);
+
+  if (GET_MODE (plus_op0) != DImode)
+    return false;
+
+  poly_int64 offset;
+  poly_int_rtx_p (plus_op1, &offset);
+  return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
+})
+
 ;; Predicates for parallel expanders based on mode.
 (define_special_predicate "vect_par_cnst_hi_half"
   (match_code "parallel")
diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-store.c b/gcc/testsuite/gcc.target/aarch64/atomic-store.c
new file mode 100644
index 0000000000000000000000000000000000000000..0ffc88029bbfe748cd0fc3b6ff4fdd47373b0bb5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/atomic-store.c
@@ -0,0 +1,72 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=armv8.4-a -O2" } */
+
+#include <stdlib.h>
+#include <stdatomic.h>
+#include <stdint.h>
+
+#define STORE_TESTS(size) \
+  void \
+  foo##size () \
+{ \
+  int##size##_t *atomic_vals = calloc (4, sizeof (int##size##_t)); \
+  atomic_store_explicit (atomic_vals, 2, memory_order_relaxed); \
+  atomic_store_explicit (atomic_vals, 2, memory_order_release); \
+  atomic_store_explicit ((atomic_vals + 1), 2, memory_order_release); \
+  atomic_store ((atomic_vals + 2), 2); \
+  atomic_store_explicit ((atomic_vals + 3), 2, memory_order_relaxed); \
+}
+
+STORE_TESTS (8);
+/* { dg-final { scan-assembler-times "strb\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlrb\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+, 1\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+, 2\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "strb\tw\[0-9\]+, \\\[x\[0-9\]+, 3\\\]" 1 } } */
+
+STORE_TESTS (16);
+/* { dg-final { scan-assembler-times "strh\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlrh\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlurh\tw\[0-9\]+, \\\[x\[0-9\]+, 2\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlurh\tw\[0-9\]+, \\\[x\[0-9\]+, 4\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "strh\tw\[0-9\]+, \\\[x\[0-9\]+, 6\\\]" 1 } } */
+
+STORE_TESTS (32);
+/* { dg-final { scan-assembler-times "str\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlr\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlur\tw\[0-9\]+, \\\[x\[0-9\]+, 4\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlur\tw\[0-9\]+, \\\[x\[0-9\]+, 8\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "str\tw\[0-9\]+, \\\[x\[0-9\]+, 12\\\]" 1 } } */
+
+STORE_TESTS (64);
+/* { dg-final { scan-assembler-times "str\tx\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlur\tx\[0-9\]+, \\\[x\[0-9\]+, 8\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "stlur\tx\[0-9\]+, \\\[x\[0-9\]+, 16\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "str\tx\[0-9\]+, \\\[x\[0-9\]+, 24\\\]" 1 } } */
+
+void
+foo_toolarge_offset ()
+{
+  int64_t *atomic_vals = calloc (4, sizeof (int64_t));
+  /* 9bit signed unscaled immediate => largest representable value +255.
+     32 * 8 == 256  */
+  atomic_store_explicit (atomic_vals + 32, 2, memory_order_release);
+}
+
+void
+foo_negative (int8_t *atomic_vals)
+{
+  atomic_store_explicit (atomic_vals - 2, 2, memory_order_release);
+}
+/* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+, -2\\\]" 1 } } */
+
+#pragma GCC target ("arch=armv8.3-a")
+void
+foo_older_arch ()
+{
+  int64_t *atomic_vals = calloc (4, sizeof (int64_t));
+  atomic_store_explicit (atomic_vals + 2, 2, memory_order_release);
+}
+
+/* Three times, one for each of the three above functions.  */
+/* { dg-final { scan-assembler-times "stlr\tx\[0-9\]+, \\\[x\[0-9\]+\\\]" 3 } } */