[AARCH64] Neon vld1_*_x3, vst1_*_x2 and vst1_*_x3 intrinsics

Message ID CAAdirjxgB2dzBN973aesZCBVsv-N20dkrzC8Yoer0Jtb+zZ5ZA@mail.gmail.com
State New
Headers show
Series
  • [AARCH64] Neon vld1_*_x3, vst1_*_x2 and vst1_*_x3 intrinsics
Related show

Commit Message

Sameera Deshpande Dec. 14, 2017, 8:29 a.m.
Hi!

Please find attached the patch implementing vld1_*_x3, vst1_*_x2 and
vst1_*_x3 intrinsics as defined by Neon document.

Ok for trunk?

- Thanks and regards,
  Sameera D.

gcc/Changelog:

2017-11-14  Sameera Deshpande  <sameera.deshpande@linaro.org>


        * config/aarch64/aarch64-simd-builtins.def (ld1x3): New.
        (st1x2): Likewise.
        (st1x3): Likewise.
        * config/aarch64/aarch64-simd.md
(aarch64_ld1x3<VALLDIF:mode>): New pattern.
        (aarch64_ld1_x3_<mode>): Likewise
        (aarch64_st1x2<VALLDIF:mode>): Likewise
        (aarch64_st1_x2_<mode>): Likewise
        (aarch64_st1x3<VALLDIF:mode>): Likewise
        (aarch64_st1_x3_<mode>): Likewise
        * config/aarch64/arm_neon.h (vld1_u8_x3): New function.
        (vld1_s8_x3): Likewise.
        (vld1_u16_x3): Likewise.
        (vld1_s16_x3): Likewise.
        (vld1_u32_x3): Likewise.
        (vld1_s32_x3): Likewise.
        (vld1_u64_x3): Likewise.
        (vld1_s64_x3): Likewise.
        (vld1_fp16_x3): Likewise.
        (vld1_f32_x3): Likewise.
        (vld1_f64_x3): Likewise.
        (vld1_p8_x3): Likewise.
        (vld1_p16_x3): Likewise.
        (vld1_p64_x3): Likewise.
        (vld1q_u8_x3): Likewise.
        (vld1q_s8_x3): Likewise.
        (vld1q_u16_x3): Likewise.
        (vld1q_s16_x3): Likewise.
        (vld1q_u32_x3): Likewise.
        (vld1q_s32_x3): Likewise.
        (vld1q_u64_x3): Likewise.
        (vld1q_s64_x3): Likewise.
        (vld1q_f16_x3): Likewise.
        (vld1q_f32_x3): Likewise.
        (vld1q_f64_x3): Likewise.
        (vld1q_p8_x3): Likewise.
        (vld1q_p16_x3): Likewise.
        (vld1q_p64_x3): Likewise.
        (vst1_s64_x2): Likewise.
        (vst1_u64_x2): Likewise.
        (vst1_f64_x2): Likewise.
        (vst1_s8_x2): Likewise.
        (vst1_p8_x2): Likewise.
        (vst1_s16_x2): Likewise.
        (vst1_p16_x2): Likewise.
        (vst1_s32_x2): Likewise.
        (vst1_u8_x2): Likewise.
        (vst1_u16_x2): Likewise.
        (vst1_u32_x2): Likewise.
        (vst1_f16_x2): Likewise.
        (vst1_f32_x2): Likewise.
        (vst1_p64_x2): Likewise.
        (vst1q_s8_x2): Likewise.
        (vst1q_p8_x2): Likewise.
        (vst1q_s16_x2): Likewise.
        (vst1q_p16_x2): Likewise.
        (vst1q_s32_x2): Likewise.
        (vst1q_s64_x2): Likewise.
        (vst1q_u8_x2): Likewise.
        (vst1q_u16_x2): Likewise.
        (vst1q_u32_x2): Likewise.
        (vst1q_u64_x2): Likewise.
        (vst1q_f16_x2): Likewise.
        (vst1q_f32_x2): Likewise.
        (vst1q_f64_x2): Likewise.
        (vst1q_p64_x2): Likewise.
        (vst1_s64_x3): Likewise.
        (vst1_u64_x3): Likewise.
        (vst1_f64_x3): Likewise.
        (vst1_s8_x3): Likewise.
        (vst1_p8_x3): Likewise.
        (vst1_s16_x3): Likewise.
        (vst1_p16_x3): Likewise.
        (vst1_s32_x3): Likewise.
        (vst1_u8_x3): Likewise.
        (vst1_u16_x3): Likewise.
        (vst1_u32_x3): Likewise.
        (vst1_f16_x3): Likewise.
        (vst1_f32_x3): Likewise.
        (vst1_p64_x3): Likewise.
        (vst1q_s8_x3): Likewise.
        (vst1q_p8_x3): Likewise.
        (vst1q_s16_x3): Likewise.
        (vst1q_p16_x3): Likewise.
        (vst1q_s32_x3): Likewise.
        (vst1q_s64_x3): Likewise.
        (vst1q_u8_x3): Likewise.
        (vst1q_u16_x3): Likewise.
        (vst1q_u32_x3): Likewise.
        (vst1q_u64_x3): Likewise.
        (vst1q_f16_x3): Likewise.
        (vst1q_f32_x3): Likewise.
        (vst1q_f64_x3): Likewise.
        (vst1q_p64_x3): Likewise.

Comments

Christophe Lyon Dec. 14, 2017, 4:47 p.m. | #1
2017-12-14 9:29 GMT+01:00 Sameera Deshpande <sameera.deshpande@linaro.org>:
> Hi!

>

> Please find attached the patch implementing vld1_*_x3, vst1_*_x2 and

> vst1_*_x3 intrinsics as defined by Neon document.

>

> Ok for trunk?

>

> - Thanks and regards,

>   Sameera D.

>

> gcc/Changelog:

>

> 2017-11-14  Sameera Deshpande  <sameera.deshpande@linaro.org>

>

>

>         * config/aarch64/aarch64-simd-builtins.def (ld1x3): New.

>         (st1x2): Likewise.

>         (st1x3): Likewise.

>         * config/aarch64/aarch64-simd.md

> (aarch64_ld1x3<VALLDIF:mode>): New pattern.

>         (aarch64_ld1_x3_<mode>): Likewise

>         (aarch64_st1x2<VALLDIF:mode>): Likewise

>         (aarch64_st1_x2_<mode>): Likewise

>         (aarch64_st1x3<VALLDIF:mode>): Likewise

>         (aarch64_st1_x3_<mode>): Likewise

>         * config/aarch64/arm_neon.h (vld1_u8_x3): New function.

>         (vld1_s8_x3): Likewise.

>         (vld1_u16_x3): Likewise.

>         (vld1_s16_x3): Likewise.

>         (vld1_u32_x3): Likewise.

>         (vld1_s32_x3): Likewise.

>         (vld1_u64_x3): Likewise.

>         (vld1_s64_x3): Likewise.

>         (vld1_fp16_x3): Likewise.

>         (vld1_f32_x3): Likewise.

>         (vld1_f64_x3): Likewise.

>         (vld1_p8_x3): Likewise.

>         (vld1_p16_x3): Likewise.

>         (vld1_p64_x3): Likewise.

>         (vld1q_u8_x3): Likewise.

>         (vld1q_s8_x3): Likewise.

>         (vld1q_u16_x3): Likewise.

>         (vld1q_s16_x3): Likewise.

>         (vld1q_u32_x3): Likewise.

>         (vld1q_s32_x3): Likewise.

>         (vld1q_u64_x3): Likewise.

>         (vld1q_s64_x3): Likewise.

>         (vld1q_f16_x3): Likewise.

>         (vld1q_f32_x3): Likewise.

>         (vld1q_f64_x3): Likewise.

>         (vld1q_p8_x3): Likewise.

>         (vld1q_p16_x3): Likewise.

>         (vld1q_p64_x3): Likewise.

>         (vst1_s64_x2): Likewise.

>         (vst1_u64_x2): Likewise.

>         (vst1_f64_x2): Likewise.

>         (vst1_s8_x2): Likewise.

>         (vst1_p8_x2): Likewise.

>         (vst1_s16_x2): Likewise.

>         (vst1_p16_x2): Likewise.

>         (vst1_s32_x2): Likewise.

>         (vst1_u8_x2): Likewise.

>         (vst1_u16_x2): Likewise.

>         (vst1_u32_x2): Likewise.

>         (vst1_f16_x2): Likewise.

>         (vst1_f32_x2): Likewise.

>         (vst1_p64_x2): Likewise.

>         (vst1q_s8_x2): Likewise.

>         (vst1q_p8_x2): Likewise.

>         (vst1q_s16_x2): Likewise.

>         (vst1q_p16_x2): Likewise.

>         (vst1q_s32_x2): Likewise.

>         (vst1q_s64_x2): Likewise.

>         (vst1q_u8_x2): Likewise.

>         (vst1q_u16_x2): Likewise.

>         (vst1q_u32_x2): Likewise.

>         (vst1q_u64_x2): Likewise.

>         (vst1q_f16_x2): Likewise.

>         (vst1q_f32_x2): Likewise.

>         (vst1q_f64_x2): Likewise.

>         (vst1q_p64_x2): Likewise.

>         (vst1_s64_x3): Likewise.

>         (vst1_u64_x3): Likewise.

>         (vst1_f64_x3): Likewise.

>         (vst1_s8_x3): Likewise.

>         (vst1_p8_x3): Likewise.

>         (vst1_s16_x3): Likewise.

>         (vst1_p16_x3): Likewise.

>         (vst1_s32_x3): Likewise.

>         (vst1_u8_x3): Likewise.

>         (vst1_u16_x3): Likewise.

>         (vst1_u32_x3): Likewise.

>         (vst1_f16_x3): Likewise.

>         (vst1_f32_x3): Likewise.

>         (vst1_p64_x3): Likewise.

>         (vst1q_s8_x3): Likewise.

>         (vst1q_p8_x3): Likewise.

>         (vst1q_s16_x3): Likewise.

>         (vst1q_p16_x3): Likewise.

>         (vst1q_s32_x3): Likewise.

>         (vst1q_s64_x3): Likewise.

>         (vst1q_u8_x3): Likewise.

>         (vst1q_u16_x3): Likewise.

>         (vst1q_u32_x3): Likewise.

>         (vst1q_u64_x3): Likewise.

>         (vst1q_f16_x3): Likewise.

>         (vst1q_f32_x3): Likewise.

>         (vst1q_f64_x3): Likewise.

>         (vst1q_p64_x3): Likewise.


Hi,
I'm not a maintainer, but I suspect you should add some tests.

Christophe
Sameera Deshpande April 6, 2018, 10:15 a.m. | #2
Hi Christophe,

Please find attached the updated patch with testcases.

Ok for trunk?

- Thanks and regards,
  Sameera D.

2017-12-14 22:17 GMT+05:30 Christophe Lyon <christophe.lyon@linaro.org>:
> 2017-12-14 9:29 GMT+01:00 Sameera Deshpande <sameera.deshpande@linaro.org>:

>> Hi!

>>

>> Please find attached the patch implementing vld1_*_x3, vst1_*_x2 and

>> vst1_*_x3 intrinsics as defined by Neon document.

>>

>> Ok for trunk?

>>

>> - Thanks and regards,

>>   Sameera D.

>>

>> gcc/Changelog:

>>

>> 2017-11-14  Sameera Deshpande  <sameera.deshpande@linaro.org>

>>

>>

>>         * config/aarch64/aarch64-simd-builtins.def (ld1x3): New.

>>         (st1x2): Likewise.

>>         (st1x3): Likewise.

>>         * config/aarch64/aarch64-simd.md

>> (aarch64_ld1x3<VALLDIF:mode>): New pattern.

>>         (aarch64_ld1_x3_<mode>): Likewise

>>         (aarch64_st1x2<VALLDIF:mode>): Likewise

>>         (aarch64_st1_x2_<mode>): Likewise

>>         (aarch64_st1x3<VALLDIF:mode>): Likewise

>>         (aarch64_st1_x3_<mode>): Likewise

>>         * config/aarch64/arm_neon.h (vld1_u8_x3): New function.

>>         (vld1_s8_x3): Likewise.

>>         (vld1_u16_x3): Likewise.

>>         (vld1_s16_x3): Likewise.

>>         (vld1_u32_x3): Likewise.

>>         (vld1_s32_x3): Likewise.

>>         (vld1_u64_x3): Likewise.

>>         (vld1_s64_x3): Likewise.

>>         (vld1_fp16_x3): Likewise.

>>         (vld1_f32_x3): Likewise.

>>         (vld1_f64_x3): Likewise.

>>         (vld1_p8_x3): Likewise.

>>         (vld1_p16_x3): Likewise.

>>         (vld1_p64_x3): Likewise.

>>         (vld1q_u8_x3): Likewise.

>>         (vld1q_s8_x3): Likewise.

>>         (vld1q_u16_x3): Likewise.

>>         (vld1q_s16_x3): Likewise.

>>         (vld1q_u32_x3): Likewise.

>>         (vld1q_s32_x3): Likewise.

>>         (vld1q_u64_x3): Likewise.

>>         (vld1q_s64_x3): Likewise.

>>         (vld1q_f16_x3): Likewise.

>>         (vld1q_f32_x3): Likewise.

>>         (vld1q_f64_x3): Likewise.

>>         (vld1q_p8_x3): Likewise.

>>         (vld1q_p16_x3): Likewise.

>>         (vld1q_p64_x3): Likewise.

>>         (vst1_s64_x2): Likewise.

>>         (vst1_u64_x2): Likewise.

>>         (vst1_f64_x2): Likewise.

>>         (vst1_s8_x2): Likewise.

>>         (vst1_p8_x2): Likewise.

>>         (vst1_s16_x2): Likewise.

>>         (vst1_p16_x2): Likewise.

>>         (vst1_s32_x2): Likewise.

>>         (vst1_u8_x2): Likewise.

>>         (vst1_u16_x2): Likewise.

>>         (vst1_u32_x2): Likewise.

>>         (vst1_f16_x2): Likewise.

>>         (vst1_f32_x2): Likewise.

>>         (vst1_p64_x2): Likewise.

>>         (vst1q_s8_x2): Likewise.

>>         (vst1q_p8_x2): Likewise.

>>         (vst1q_s16_x2): Likewise.

>>         (vst1q_p16_x2): Likewise.

>>         (vst1q_s32_x2): Likewise.

>>         (vst1q_s64_x2): Likewise.

>>         (vst1q_u8_x2): Likewise.

>>         (vst1q_u16_x2): Likewise.

>>         (vst1q_u32_x2): Likewise.

>>         (vst1q_u64_x2): Likewise.

>>         (vst1q_f16_x2): Likewise.

>>         (vst1q_f32_x2): Likewise.

>>         (vst1q_f64_x2): Likewise.

>>         (vst1q_p64_x2): Likewise.

>>         (vst1_s64_x3): Likewise.

>>         (vst1_u64_x3): Likewise.

>>         (vst1_f64_x3): Likewise.

>>         (vst1_s8_x3): Likewise.

>>         (vst1_p8_x3): Likewise.

>>         (vst1_s16_x3): Likewise.

>>         (vst1_p16_x3): Likewise.

>>         (vst1_s32_x3): Likewise.

>>         (vst1_u8_x3): Likewise.

>>         (vst1_u16_x3): Likewise.

>>         (vst1_u32_x3): Likewise.

>>         (vst1_f16_x3): Likewise.

>>         (vst1_f32_x3): Likewise.

>>         (vst1_p64_x3): Likewise.

>>         (vst1q_s8_x3): Likewise.

>>         (vst1q_p8_x3): Likewise.

>>         (vst1q_s16_x3): Likewise.

>>         (vst1q_p16_x3): Likewise.

>>         (vst1q_s32_x3): Likewise.

>>         (vst1q_s64_x3): Likewise.

>>         (vst1q_u8_x3): Likewise.

>>         (vst1q_u16_x3): Likewise.

>>         (vst1q_u32_x3): Likewise.

>>         (vst1q_u64_x3): Likewise.

>>         (vst1q_f16_x3): Likewise.

>>         (vst1q_f32_x3): Likewise.

>>         (vst1q_f64_x3): Likewise.

>>         (vst1q_p64_x3): Likewise.

>

> Hi,

> I'm not a maintainer, but I suspect you should add some tests.

>

> Christophe




-- 
- Thanks and regards,
  Sameera D.
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index b383f24..2fd072a 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -445,6 +445,15 @@
   BUILTIN_VALL_F16 (STORE1, st1, 0)
   VAR1(STORE1P, st1, 0, v2di)
 
+  /* Implemented by aarch64_ld1x3<VALLDIF:mode>.  */
+  BUILTIN_VALLDIF (LOADSTRUCT, ld1x3, 0)
+
+  /* Implemented by aarch64_st1x2<VALLDIF:mode>.  */
+  BUILTIN_VALLDIF (STORESTRUCT, st1x2, 0)
+
+  /* Implemented by aarch64_st1x3<VALLDIF:mode>.  */
+  BUILTIN_VALLDIF (STORESTRUCT, st1x3, 0)
+
   /* Implemented by fma<mode>4.  */
   BUILTIN_VHSDF (TERNOP, fma, 4)
   VAR1 (TERNOP, fma, 4, hf)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 3d1f6a0..e197a67 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -5047,6 +5047,70 @@
     }
 })
 
+
+(define_expand "aarch64_ld1x3<VALLDIF:mode>"
+  [(match_operand:CI 0 "register_operand" "=w")
+   (match_operand:DI 1 "register_operand" "r")
+   (unspec:VALLDIF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+  "TARGET_SIMD"
+{
+  rtx mem = gen_rtx_MEM (CImode, operands[1]);
+  emit_insn (gen_aarch64_ld1_x3_<VALLDIF:mode> (operands[0], mem));
+  DONE;
+})
+
+(define_insn "aarch64_ld1_x3_<mode>"
+  [(set (match_operand:CI 0 "register_operand" "=w")
+        (unspec:CI 
+	  [(match_operand:CI 1 "aarch64_simd_struct_operand" "Utv")
+	   (unspec:VALLDIF [(const_int 3)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_LD1))]
+  "TARGET_SIMD"
+  "ld1\\t{%S0.<Vtype> - %U0.<Vtype>}, %1"
+  [(set_attr "type" "neon_load1_3reg<q>")]
+)
+
+(define_expand "aarch64_st1x2<VALLDIF:mode>"
+  [(match_operand:DI 0 "register_operand" "")
+   (match_operand:OI 1 "register_operand" "")
+   (unspec:VALLDIF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+  "TARGET_SIMD"
+{
+  rtx mem = gen_rtx_MEM (OImode, operands[0]);
+  emit_insn (gen_aarch64_st1_x2_<VALLDIF:mode> (mem, operands[1]));
+  DONE;
+})
+
+(define_insn "aarch64_st1_x2_<mode>"
+   [(set (match_operand:OI 0 "aarch64_simd_struct_operand" "=Utv")
+	 (unspec:OI
+	  [(match_operand:OI 1 "register_operand" "w")
+          (unspec:VALLDIF [(const_int 2)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_ST1))]
+  "TARGET_SIMD"
+  "st1\\t{%S1.<Vtype> - %T1.<Vtype>}, %0"
+  [(set_attr "type" "neon_store1_2reg<q>")]
+)
+
+(define_expand "aarch64_st1x3<VALLDIF:mode>"
+  [(match_operand:DI 0 "register_operand" "")
+   (match_operand:CI 1 "register_operand" "")
+   (unspec:VALLDIF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+  "TARGET_SIMD"
+{
+  rtx mem = gen_rtx_MEM (CImode, operands[0]);
+  emit_insn (gen_aarch64_st1_x3_<VALLDIF:mode> (mem, operands[1]));
+  DONE;
+})
+
+(define_insn "aarch64_st1_x3_<mode>"
+   [(set (match_operand:CI 0 "aarch64_simd_struct_operand" "=Utv") 
+	(unspec:CI
+         [(match_operand:CI 1 "register_operand" "w")
+	  (unspec:VALLDIF [(const_int 3)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_ST1))]
+  "TARGET_SIMD"
+  "st1\\t{%S1.<Vtype> - %U1.<Vtype>}, %0"
+  [(set_attr "type" "neon_store1_3reg<q>")]
+)
+
 (define_insn "*aarch64_mov<mode>"
   [(set (match_operand:VSTRUCT 0 "aarch64_simd_nonimmediate_operand" "=w,Utv,w")
 	(match_operand:VSTRUCT 1 "aarch64_simd_general_operand" " w,w,Utv"))]
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index c45c29a..6ac7099 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -17145,6 +17145,374 @@ vld1_u64 (const uint64_t *a)
   return (uint64x1_t) {*a};
 }
 
+/* vld1x3  */
+
+__extension__ extern __inline uint8x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_u8_x3 (const uint8_t *__a)
+{
+  uint8x8x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = (__builtin_aarch64_simd_ci)__builtin_aarch64_ld1x3v8qi ((const __builtin_aarch64_simd_qi *) __a);
+  __i.val[0] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi  (__o, 0);
+  __i.val[1] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi  (__o, 1);
+  __i.val[2] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline int8x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_s8_x3 (const uint8_t *__a)
+{
+  int8x8x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v8qi ((const __builtin_aarch64_simd_qi *) __a);
+  __i.val[0] = (int8x8_t) __builtin_aarch64_get_dregciv8qi  (__o, 0);
+  __i.val[1] = (int8x8_t) __builtin_aarch64_get_dregciv8qi  (__o, 1);
+  __i.val[2] = (int8x8_t) __builtin_aarch64_get_dregciv8qi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline uint16x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_u16_x3 (const uint16_t *__a)
+{
+  uint16x4x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v4hi ((const __builtin_aarch64_simd_hi *) __a);
+  __i.val[0] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi  (__o, 0);
+  __i.val[1] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi  (__o, 1);
+  __i.val[2] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline int16x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_s16_x3 (const int16_t *__a)
+{
+  int16x4x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v4hi ((const __builtin_aarch64_simd_hi *) __a);
+  __i.val[0] = (int16x4_t) __builtin_aarch64_get_dregciv4hi  (__o, 0);
+  __i.val[1] = (int16x4_t) __builtin_aarch64_get_dregciv4hi  (__o, 1);
+  __i.val[2] = (int16x4_t) __builtin_aarch64_get_dregciv4hi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline uint32x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_u32_x3 (const uint32_t *__a)
+{
+  uint32x2x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v2si ((const __builtin_aarch64_simd_si *) __a);
+  __i.val[0] = (uint32x2_t) __builtin_aarch64_get_dregciv2si  (__o, 0);
+  __i.val[1] = (uint32x2_t) __builtin_aarch64_get_dregciv2si  (__o, 1);
+  __i.val[2] = (uint32x2_t) __builtin_aarch64_get_dregciv2si  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline int32x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_s32_x3 (const uint32_t *__a)
+{
+  int32x2x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v2si ((const __builtin_aarch64_simd_si *) __a);
+  __i.val[0] = (int32x2_t) __builtin_aarch64_get_dregciv2si  (__o, 0);
+  __i.val[1] = (int32x2_t) __builtin_aarch64_get_dregciv2si  (__o, 1);
+  __i.val[2] = (int32x2_t) __builtin_aarch64_get_dregciv2si  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline uint64x1x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_u64_x3 (const uint64_t *__a)
+{
+  uint64x1x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3di ((const __builtin_aarch64_simd_di *) __a);
+  __i.val[0] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 0);
+  __i.val[1] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 1);
+  __i.val[2] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline int64x1x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_s64_x3 (const int64_t *__a)
+{
+  int64x1x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3di ((const __builtin_aarch64_simd_di *) __a);
+  __i.val[0] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 0);
+  __i.val[1] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 1);
+  __i.val[2] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 2);
+
+  return __i;
+}
+
+__extension__ extern __inline float16x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_f16_x3 (const float16_t *__a)
+{
+  float16x4x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v4hf ((const __builtin_aarch64_simd_hf *) __a);
+  __i.val[0] = (float16x4_t) __builtin_aarch64_get_dregciv4hf  (__o, 0);
+  __i.val[1] = (float16x4_t) __builtin_aarch64_get_dregciv4hf  (__o, 1);
+  __i.val[2] = (float16x4_t) __builtin_aarch64_get_dregciv4hf  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline float32x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_f32_x3 (const float32_t *__a)
+{
+  float32x2x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v2sf ((const __builtin_aarch64_simd_sf *) __a);
+  __i.val[0] = (float32x2_t) __builtin_aarch64_get_dregciv2sf  (__o, 0);
+  __i.val[1] = (float32x2_t) __builtin_aarch64_get_dregciv2sf  (__o, 1);
+  __i.val[2] = (float32x2_t) __builtin_aarch64_get_dregciv2sf  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline float64x1x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_f64_x3 (const float64_t *__a)
+{
+  float64x1x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3df ((const __builtin_aarch64_simd_df *) __a);
+  __i.val[0] = (float64x1_t) __builtin_aarch64_get_dregcidi (__o, 0);
+  __i.val[1] = (float64x1_t) __builtin_aarch64_get_dregcidi (__o, 1);
+  __i.val[2] = (float64x1_t) __builtin_aarch64_get_dregcidi (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline poly8x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_p8_x3 (const poly8_t *__a)
+{
+  poly8x8x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v8qi ((const __builtin_aarch64_simd_qi *) __a);
+  __i.val[0] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi  (__o, 0);
+  __i.val[1] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi  (__o, 1);
+  __i.val[2] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline poly16x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_p16_x3 (const poly16_t *__a)
+{
+  poly16x4x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v4hi ((const __builtin_aarch64_simd_hi *) __a);
+  __i.val[0] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi  (__o, 0);
+  __i.val[1] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi  (__o, 1);
+  __i.val[2] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline poly64x1x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_p64_x3 (const poly64_t *__a)
+{
+  poly64x1x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3di ((const __builtin_aarch64_simd_di *) __a);
+  __i.val[0] = (poly64x1_t) __builtin_aarch64_get_dregcidi (__o, 0);
+  __i.val[1] = (poly64x1_t) __builtin_aarch64_get_dregcidi (__o, 1);
+  __i.val[2] = (poly64x1_t) __builtin_aarch64_get_dregcidi (__o, 2);
+
+return __i;
+}
+
+__extension__ extern __inline uint8x16x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_u8_x3 (const uint8_t *__a)
+{
+  uint8x16x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v16qi ((const __builtin_aarch64_simd_qi *) __a);
+  __i.val[0] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi  (__o, 0);
+  __i.val[1] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi  (__o, 1);
+  __i.val[2] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline int8x16x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_s8_x3 (const int8_t *__a)
+{
+  int8x16x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v16qi ((const __builtin_aarch64_simd_qi *) __a);
+  __i.val[0] = (int8x16_t) __builtin_aarch64_get_qregciv16qi  (__o, 0);
+  __i.val[1] = (int8x16_t) __builtin_aarch64_get_qregciv16qi  (__o, 1);
+  __i.val[2] = (int8x16_t) __builtin_aarch64_get_qregciv16qi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline uint16x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_u16_x3 (const uint16_t *__a)
+{
+  uint16x8x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v8hi ((const __builtin_aarch64_simd_hi *) __a);
+  __i.val[0] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi  (__o, 0);
+  __i.val[1] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi  (__o, 1);
+  __i.val[2] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline int16x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_s16_x3 (const int16_t *__a)
+{
+  int16x8x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v8hi ((const __builtin_aarch64_simd_hi *) __a);
+  __i.val[0] = (int16x8_t) __builtin_aarch64_get_qregciv8hi  (__o, 0);
+  __i.val[1] = (int16x8_t) __builtin_aarch64_get_qregciv8hi  (__o, 1);
+  __i.val[2] = (int16x8_t) __builtin_aarch64_get_qregciv8hi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline uint32x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_u32_x3 (const uint32_t *__a)
+{
+  uint32x4x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v4si ((const __builtin_aarch64_simd_si *) __a);
+  __i.val[0] = (uint32x4_t) __builtin_aarch64_get_qregciv4si  (__o, 0);
+  __i.val[1] = (uint32x4_t) __builtin_aarch64_get_qregciv4si  (__o, 1);
+  __i.val[2] = (uint32x4_t) __builtin_aarch64_get_qregciv4si  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline int32x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_s32_x3 (const int32_t *__a)
+{
+  int32x4x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v4si ((const __builtin_aarch64_simd_si *) __a);
+  __i.val[0] = (int32x4_t) __builtin_aarch64_get_qregciv4si  (__o, 0);
+  __i.val[1] = (int32x4_t) __builtin_aarch64_get_qregciv4si  (__o, 1);
+  __i.val[2] = (int32x4_t) __builtin_aarch64_get_qregciv4si  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline uint64x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_u64_x3 (const uint64_t *__a)
+{
+  uint64x2x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v2di ((const __builtin_aarch64_simd_di *) __a);
+  __i.val[0] = (uint64x2_t) __builtin_aarch64_get_qregciv2di  (__o, 0);
+  __i.val[1] = (uint64x2_t) __builtin_aarch64_get_qregciv2di  (__o, 1);
+  __i.val[2] = (uint64x2_t) __builtin_aarch64_get_qregciv2di  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline int64x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_s64_x3 (const int64_t *__a)
+{
+  int64x2x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v2di ((const __builtin_aarch64_simd_di *) __a);
+  __i.val[0] = (int64x2_t) __builtin_aarch64_get_qregciv2di  (__o, 0);
+  __i.val[1] = (int64x2_t) __builtin_aarch64_get_qregciv2di  (__o, 1);
+  __i.val[2] = (int64x2_t) __builtin_aarch64_get_qregciv2di  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline float16x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_f16_x3 (const float16_t *__a)
+{
+  float16x8x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v8hf ((const __builtin_aarch64_simd_hf *) __a);
+  __i.val[0] = (float16x8_t) __builtin_aarch64_get_qregciv8hf  (__o, 0);
+  __i.val[1] = (float16x8_t) __builtin_aarch64_get_qregciv8hf  (__o, 1);
+  __i.val[2] = (float16x8_t) __builtin_aarch64_get_qregciv8hf  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline float32x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_f32_x3 (const float32_t *__a)
+{
+  float32x4x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v4sf ((const __builtin_aarch64_simd_sf *) __a);
+  __i.val[0] = (float32x4_t) __builtin_aarch64_get_qregciv4sf  (__o, 0);
+  __i.val[1] = (float32x4_t) __builtin_aarch64_get_qregciv4sf  (__o, 1);
+  __i.val[2] = (float32x4_t) __builtin_aarch64_get_qregciv4sf  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline float64x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_f64_x3 (const float64_t *__a)
+{
+  float64x2x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v2df ((const __builtin_aarch64_simd_df *) __a);
+  __i.val[0] = (float64x2_t) __builtin_aarch64_get_qregciv2df  (__o, 0);
+  __i.val[1] = (float64x2_t) __builtin_aarch64_get_qregciv2df  (__o, 1);
+  __i.val[2] = (float64x2_t) __builtin_aarch64_get_qregciv2df  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline poly8x16x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_p8_x3 (const poly8_t *__a)
+{
+  poly8x16x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v8hi ((const __builtin_aarch64_simd_hi *) __a);
+  __i.val[0] = (poly8x16_t) __builtin_aarch64_get_qregciv8hi  (__o, 0);
+  __i.val[1] = (poly8x16_t) __builtin_aarch64_get_qregciv8hi  (__o, 1);
+  __i.val[2] = (poly8x16_t) __builtin_aarch64_get_qregciv8hi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline poly16x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_p16_x3 (const poly16_t *__a)
+{
+  poly16x8x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v8hi ((const __builtin_aarch64_simd_hi *) __a);
+  __i.val[0] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi  (__o, 0);
+  __i.val[1] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi  (__o, 1);
+  __i.val[2] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline poly64x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_p64_x3 (const poly64_t *__a)
+{
+  poly64x2x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v2di ((const __builtin_aarch64_simd_di *) __a);
+  __i.val[0] = (poly64x2_t) __builtin_aarch64_get_qregciv2di  (__o, 0);
+  __i.val[1] = (poly64x2_t) __builtin_aarch64_get_qregciv2di  (__o, 1);
+  __i.val[2] = (poly64x2_t) __builtin_aarch64_get_qregciv2di  (__o, 2);
+  return __i;
+}
+
 /* vld1q */
 
 __extension__ extern __inline float16x8_t
@@ -27497,6 +27865,706 @@ vst1q_lane_u64 (uint64_t *__a, uint64x2_t __b, const int __lane)
   *__a = __aarch64_vget_lane_any (__b, __lane);
 }
 
+/* vst1x2 */
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s64_x2 (int64_t * __a, int64x1x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  int64x2x2_t temp;
+  temp.val[0] = vcombine_s64 (val.val[0], vcreate_s64 (__AARCH64_INT64_C (0)));
+  temp.val[1] = vcombine_s64 (val.val[1], vcreate_s64 (__AARCH64_INT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u64_x2 (uint64_t * __a, uint64x1x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  uint64x2x2_t temp;
+  temp.val[0] = vcombine_u64 (val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_u64 (val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_f64_x2 (float64_t * __a, float64x1x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  float64x2x2_t temp;
+  temp.val[0] = vcombine_f64 (val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_f64 (val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2df ((__builtin_aarch64_simd_df *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s8_x2 (int8_t * __a, int8x8x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  int8x16x2_t temp;
+  temp.val[0] = vcombine_s8 (val.val[0], vcreate_s8 (__AARCH64_INT64_C (0)));
+  temp.val[1] = vcombine_s8 (val.val[1], vcreate_s8 (__AARCH64_INT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_p8_x2 (poly8_t * __a, poly8x8x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  poly8x16x2_t temp;
+  temp.val[0] = vcombine_p8 (val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_p8 (val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s16_x2 (int16_t * __a, int16x4x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  int16x8x2_t temp;
+  temp.val[0] = vcombine_s16 (val.val[0], vcreate_s16 (__AARCH64_INT64_C (0)));
+  temp.val[1] = vcombine_s16 (val.val[1], vcreate_s16 (__AARCH64_INT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_p16_x2 (poly16_t * __a, poly16x4x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  poly16x8x2_t temp;
+  temp.val[0] = vcombine_p16 (val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_p16 (val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s32_x2 (int32_t * __a, int32x2x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  int32x4x2_t temp;
+  temp.val[0] = vcombine_s32 (val.val[0], vcreate_s32 (__AARCH64_INT64_C (0)));
+  temp.val[1] = vcombine_s32 (val.val[1], vcreate_s32 (__AARCH64_INT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2v2si ((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u8_x2 (uint8_t * __a, uint8x8x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  uint8x16x2_t temp;
+  temp.val[0] = vcombine_u8 (val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_u8 (val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u16_x2 (uint16_t * __a, uint16x4x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  uint16x8x2_t temp;
+  temp.val[0] = vcombine_u16 (val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_u16 (val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u32_x2 (uint32_t * __a, uint32x2x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  uint32x4x2_t temp;
+  temp.val[0] = vcombine_u32 (val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_u32 (val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2v2si ((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_f16_x2 (float16_t * __a, float16x4x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  float16x8x2_t temp;
+  temp.val[0] = vcombine_f16 (val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_f16 (val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv8hf (__o, temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv8hf (__o, temp.val[1], 1);
+  __builtin_aarch64_st1x2v4hf (__a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_f32_x2 (float32_t * __a, float32x2x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  float32x4x2_t temp;
+  temp.val[0] = vcombine_f32 (val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_f32 (val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2v2sf ((__builtin_aarch64_simd_sf *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_p64_x2 (poly64_t * __a, poly64x1x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  poly64x2x2_t temp;
+  temp.val[0] = vcombine_p64 (val.val[0], vcreate_p64 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_p64 (val.val[1], vcreate_p64 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv2di_ssps (__o,
+					       (poly64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2di_ssps (__o,
+					       (poly64x2_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s8_x2 (int8_t * __a, int8x16x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_p8_x2 (poly8_t * __a, poly8x16x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s16_x2 (int16_t * __a, int16x8x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_p16_x2 (poly16_t * __a, poly16x8x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s32_x2 (int32_t * __a, int32x4x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v4si ((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s64_x2 (int64_t * __a, int64x2x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u8_x2 (uint8_t * __a, uint8x16x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u16_x2 (uint16_t * __a, uint16x8x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u32_x2 (uint32_t * __a, uint32x4x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v4si ((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u64_x2 (uint64_t * __a, uint64x2x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_f16_x2 (float16_t * __a, float16x8x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv8hf (__o, val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv8hf (__o, val.val[1], 1);
+  __builtin_aarch64_st1x2v8hf (__a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_f32_x2 (float32_t * __a, float32x4x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v4sf ((__builtin_aarch64_simd_sf *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_f64_x2 (float64_t * __a, float64x2x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v2df ((__builtin_aarch64_simd_df *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_p64_x2 (poly64_t * __a, poly64x2x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv2di_ssps (__o,
+					       (poly64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2di_ssps (__o,
+					       (poly64x2_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+/* vst1x3 */
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s64_x3 (int64_t * __a, int64x1x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  int64x2x3_t temp;
+  temp.val[0] = vcombine_s64 (val.val[0], vcreate_s64 (__AARCH64_INT64_C (0)));
+  temp.val[1] = vcombine_s64 (val.val[1], vcreate_s64 (__AARCH64_INT64_C (0)));
+  temp.val[2] = vcombine_s64 (val.val[2], vcreate_s64 (__AARCH64_INT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u64_x3 (uint64_t * __a, uint64x1x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  uint64x2x3_t temp;
+  temp.val[0] = vcombine_u64 (val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_u64 (val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_u64 (val.val[2], vcreate_u64 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_f64_x3 (float64_t * __a, float64x1x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  float64x2x3_t temp;
+  temp.val[0] = vcombine_f64 (val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_f64 (val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_f64 (val.val[2], vcreate_f64 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3df ((__builtin_aarch64_simd_df *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s8_x3 (int8_t * __a, int8x8x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  int8x16x3_t temp;
+  temp.val[0] = vcombine_s8 (val.val[0], vcreate_s8 (__AARCH64_INT64_C (0)));
+  temp.val[1] = vcombine_s8 (val.val[1], vcreate_s8 (__AARCH64_INT64_C (0)));
+  temp.val[2] = vcombine_s8 (val.val[2], vcreate_s8 (__AARCH64_INT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_p8_x3 (poly8_t * __a, poly8x8x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  poly8x16x3_t temp;
+  temp.val[0] = vcombine_p8 (val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_p8 (val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_p8 (val.val[2], vcreate_p8 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s16_x3 (int16_t * __a, int16x4x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  int16x8x3_t temp;
+  temp.val[0] = vcombine_s16 (val.val[0], vcreate_s16 (__AARCH64_INT64_C (0)));
+  temp.val[1] = vcombine_s16 (val.val[1], vcreate_s16 (__AARCH64_INT64_C (0)));
+  temp.val[2] = vcombine_s16 (val.val[2], vcreate_s16 (__AARCH64_INT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_p16_x3 (poly16_t * __a, poly16x4x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  poly16x8x3_t temp;
+  temp.val[0] = vcombine_p16 (val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_p16 (val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_p16 (val.val[2], vcreate_p16 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s32_x3 (int32_t * __a, int32x2x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  int32x4x3_t temp;
+  temp.val[0] = vcombine_s32 (val.val[0], vcreate_s32 (__AARCH64_INT64_C (0)));
+  temp.val[1] = vcombine_s32 (val.val[1], vcreate_s32 (__AARCH64_INT64_C (0)));
+  temp.val[2] = vcombine_s32 (val.val[2], vcreate_s32 (__AARCH64_INT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3v2si ((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u8_x3 (uint8_t * __a, uint8x8x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  uint8x16x3_t temp;
+  temp.val[0] = vcombine_u8 (val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_u8 (val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_u8 (val.val[2], vcreate_u8 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u16_x3 (uint16_t * __a, uint16x4x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  uint16x8x3_t temp;
+  temp.val[0] = vcombine_u16 (val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_u16 (val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_u16 (val.val[2], vcreate_u16 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u32_x3 (uint32_t * __a, uint32x2x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  uint32x4x3_t temp;
+  temp.val[0] = vcombine_u32 (val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_u32 (val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_u32 (val.val[2], vcreate_u32 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3v2si ((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_f16_x3 (float16_t * __a, float16x4x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  float16x8x3_t temp;
+  temp.val[0] = vcombine_f16 (val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_f16 (val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_f16 (val.val[2], vcreate_f16 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3v4hf ((__builtin_aarch64_simd_hf *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_f32_x3 (float32_t * __a, float32x2x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  float32x4x3_t temp;
+  temp.val[0] = vcombine_f32 (val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_f32 (val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_f32 (val.val[2], vcreate_f32 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3v2sf ((__builtin_aarch64_simd_sf *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_p64_x3 (poly64_t * __a, poly64x1x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  poly64x2x3_t temp;
+  temp.val[0] = vcombine_p64 (val.val[0], vcreate_p64 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_p64 (val.val[1], vcreate_p64 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_p64 (val.val[2], vcreate_p64 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
+					       (poly64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
+					       (poly64x2_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
+					       (poly64x2_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s8_x3 (int8_t * __a, int8x16x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_p8_x3 (poly8_t * __a, poly8x16x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s16_x3 (int16_t * __a, int16x8x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_p16_x3 (poly16_t * __a, poly16x8x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s32_x3 (int32_t * __a, int32x4x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v4si ((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s64_x3 (int64_t * __a, int64x2x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u8_x3 (uint8_t * __a, uint8x16x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u16_x3 (uint16_t * __a, uint16x8x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u32_x3 (uint32_t * __a, uint32x4x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v4si ((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u64_x3 (uint64_t * __a, uint64x2x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_f16_x3 (float16_t * __a, float16x8x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v8hf ((__builtin_aarch64_simd_hf *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_f32_x3 (float32_t * __a, float32x4x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v4sf ((__builtin_aarch64_simd_sf *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_f64_x3 (float64_t * __a, float64x2x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v2df ((__builtin_aarch64_simd_df *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_p64_x3 (poly64_t * __a, poly64x2x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
+					       (poly64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
+					       (poly64x2_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
+					       (poly64x2_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
 /* vstn */
 
 __extension__ extern __inline void
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1x3.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1x3.c
new file mode 100644
index 0000000..21bce41
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1x3.c
@@ -0,0 +1,79 @@
+/* { dg-do run } */
+/* { dg-options "-O3" } */
+
+#include <arm_neon.h>
+
+extern void abort (void);
+
+#define TESTMETH(BASE, ELTS, SUFFIX)	\
+int __attribute__ ((noinline))			\
+test_vld##SUFFIX##_x3 ()			\
+{						\
+  BASE##_t data[ELTS * 3];			\
+  BASE##_t temp[ELTS * 3];			\
+  BASE##x##ELTS##x##3##_t vectors;		\
+  int i,j;					\
+  for (i = 0; i < ELTS * 3; i++)		\
+    data [i] = (BASE##_t) 3*i;		\
+  asm volatile ("" : : : "memory");		\
+  vectors = vld1##SUFFIX##_x3 (data);		\
+  vst1##SUFFIX (temp, vectors.val[0]);		\
+  vst1##SUFFIX (&temp[ELTS], vectors.val[1]);	\
+  vst1##SUFFIX (&temp[ELTS * 2], vectors.val[2]);	\
+  asm volatile ("" : : : "memory");		\
+  for (j = 0; j < ELTS * 3; j++)		\
+    if (temp[j] != data[j])			\
+      return 1;					\
+  return 0;					\
+}
+
+#define VARIANTS_1(VARIANT)	\
+VARIANT (uint8, 8, _u8)		\
+VARIANT (uint16, 4, _u16)	\
+VARIANT (uint32, 2, _u32)	\
+VARIANT (uint64, 1, _u64)	\
+VARIANT (int8, 8, _s8)		\
+VARIANT (int16, 4, _s16)	\
+VARIANT (int32, 2, _s32)	\
+VARIANT (int64, 1, _s64)	\
+VARIANT (poly8, 8, _p8)		\
+VARIANT (poly16, 4, _p16)	\
+VARIANT (float16, 4, _f16)	\
+VARIANT (float32, 2, _f32)	\
+VARIANT (uint8, 16, q_u8)	\
+VARIANT (uint16, 8, q_u16)	\
+VARIANT (uint32, 4, q_u32)	\
+VARIANT (uint64, 2, q_u64)	\
+VARIANT (int8, 16, q_s8)	\
+VARIANT (int16, 8, q_s16)	\
+VARIANT (int32, 4, q_s32)	\
+VARIANT (int64, 2, q_s64)	\
+VARIANT (poly8, 16, q_p8)	\
+VARIANT (poly16, 8, q_p16)	\
+VARIANT (float16, 8, q_f16)	\
+VARIANT (float32, 4, q_f32)
+
+#ifdef __aarch64__
+#define VARIANTS(VARIANT) VARIANTS_1(VARIANT)	\
+VARIANT (float64, 1, _f64)			\
+VARIANT (float64, 2, q_f64)
+#else
+#define VARIANTS(VARIANT) VARIANTS_1(VARIANT)
+#endif
+
+
+/* Tests of vld1_x3 and vld1q_x3.  */
+VARIANTS (TESTMETH)
+
+#define CHECK(BASE, ELTS, SUFFIX)	\
+  if (test_vld##SUFFIX##_x3 () != 0)	\
+    abort ();
+
+int
+main (int argc, char **argv)
+{
+  VARIANTS (CHECK)
+
+  return 0;
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1x2.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1x2.c
new file mode 100644
index 0000000..d7fa879
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1x2.c
@@ -0,0 +1,77 @@
+/* { dg-do run } */
+/* { dg-options "-O3" } */
+
+#include <arm_neon.h>
+
+extern void abort (void);
+
+#define TESTMETH(BASE, ELTS, SUFFIX)	\
+int __attribute__ ((noinline))			\
+test_vst1##SUFFIX##_x2 ()			\
+{						\
+  BASE##_t data[ELTS * 2];			\
+  BASE##_t temp[ELTS * 2];			\
+  BASE##x##ELTS##x##2##_t vectors;		\
+  int i,j;					\
+  for (i = 0; i < ELTS * 2; i++)		\
+    data [i] = (BASE##_t) 2*i;		\
+  asm volatile ("" : : : "memory");		\
+  vectors.val[0] = vld1##SUFFIX (data);		\
+  vectors.val[1] = vld1##SUFFIX (&data[ELTS]);	\
+  vst1##SUFFIX##_x2 (temp, vectors);		\
+  asm volatile ("" : : : "memory");		\
+  for (j = 0; j < ELTS * 2; j++)		\
+    if (temp[j] != data[j])			\
+      return 1;					\
+  return 0;					\
+}
+
+#define VARIANTS_1(VARIANT)	\
+VARIANT (uint8, 8, _u8)		\
+VARIANT (uint16, 4, _u16)	\
+VARIANT (uint32, 2, _u32)	\
+VARIANT (uint64, 1, _u64)	\
+VARIANT (int8, 8, _s8)		\
+VARIANT (int16, 4, _s16)	\
+VARIANT (int32, 2, _s32)	\
+VARIANT (int64, 1, _s64)	\
+VARIANT (poly8, 8, _p8)		\
+VARIANT (poly16, 4, _p16)	\
+VARIANT (float16, 4, _f16)	\
+VARIANT (float32, 2, _f32)	\
+VARIANT (uint8, 16, q_u8)	\
+VARIANT (uint16, 8, q_u16)	\
+VARIANT (uint32, 4, q_u32)	\
+VARIANT (uint64, 2, q_u64)	\
+VARIANT (int8, 16, q_s8)	\
+VARIANT (int16, 8, q_s16)	\
+VARIANT (int32, 4, q_s32)	\
+VARIANT (int64, 2, q_s64)	\
+VARIANT (poly8, 16, q_p8)	\
+VARIANT (poly16, 8, q_p16)	\
+VARIANT (float16, 8, q_f16)	\
+VARIANT (float32, 4, q_f32)
+
+#ifdef __aarch64__
+#define VARIANTS(VARIANT) VARIANTS_1(VARIANT)	\
+VARIANT (float64, 1, _f64)			\
+VARIANT (float64, 2, q_f64)
+#else
+#define VARIANTS(VARIANT) VARIANTS_1(VARIANT)
+#endif
+
+/* Tests of vst1_x2 and vst1q_x2.  */
+VARIANTS (TESTMETH)
+
+#define CHECK(BASE, ELTS, SUFFIX)	\
+  if (test_vst1##SUFFIX##_x2 () != 0)	\
+    abort ();
+
+int
+main (int argc, char **argv)
+{
+  VARIANTS (CHECK)
+
+  return 0;
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1x3.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1x3.c
new file mode 100644
index 0000000..b0df29c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1x3.c
@@ -0,0 +1,78 @@
+/* { dg-do run } */
+/* { dg-options "-O3" } */
+
+#include <arm_neon.h>
+
+extern void abort (void);
+
+#define TESTMETH(BASE, ELTS, SUFFIX)	\
+int __attribute__ ((noinline))			\
+test_vst1##SUFFIX##_x3 ()			\
+{						\
+  BASE##_t data[ELTS * 3];			\
+  BASE##_t temp[ELTS * 3];			\
+  BASE##x##ELTS##x##3##_t vectors;		\
+  int i,j;					\
+  for (i = 0; i < ELTS * 3; i++)		\
+    data [i] = (BASE##_t) 3*i;		\
+  asm volatile ("" : : : "memory");		\
+  vectors.val[0] = vld1##SUFFIX (data);		\
+  vectors.val[1] = vld1##SUFFIX (&data[ELTS]);	\
+  vectors.val[2] = vld1##SUFFIX (&data[ELTS * 2]);	\
+  vst1##SUFFIX##_x3 (temp, vectors);		\
+  asm volatile ("" : : : "memory");		\
+  for (j = 0; j < ELTS * 3; j++)		\
+    if (temp[j] != data[j])			\
+      return 1;					\
+  return 0;					\
+}
+
+#define VARIANTS_1(VARIANT)	\
+VARIANT (uint8, 8, _u8)		\
+VARIANT (uint16, 4, _u16)	\
+VARIANT (uint32, 2, _u32)	\
+VARIANT (uint64, 1, _u64)	\
+VARIANT (int8, 8, _s8)		\
+VARIANT (int16, 4, _s16)	\
+VARIANT (int32, 2, _s32)	\
+VARIANT (int64, 1, _s64)	\
+VARIANT (poly8, 8, _p8)		\
+VARIANT (poly16, 4, _p16)	\
+VARIANT (float16, 4, _f16)	\
+VARIANT (float32, 2, _f32)	\
+VARIANT (uint8, 16, q_u8)	\
+VARIANT (uint16, 8, q_u16)	\
+VARIANT (uint32, 4, q_u32)	\
+VARIANT (uint64, 2, q_u64)	\
+VARIANT (int8, 16, q_s8)	\
+VARIANT (int16, 8, q_s16)	\
+VARIANT (int32, 4, q_s32)	\
+VARIANT (int64, 2, q_s64)	\
+VARIANT (poly8, 16, q_p8)	\
+VARIANT (poly16, 8, q_p16)	\
+VARIANT (float16, 8, q_f16)	\
+VARIANT (float32, 4, q_f32)
+
+#ifdef __aarch64__
+#define VARIANTS(VARIANT) VARIANTS_1(VARIANT)	\
+VARIANT (float64, 1, _f64)			\
+VARIANT (float64, 2, q_f64)
+#else
+#define VARIANTS(VARIANT) VARIANTS_1(VARIANT)
+#endif
+
+/* Tests of vst1_x3 and vst1q_x3.  */
+VARIANTS (TESTMETH)
+
+#define CHECK(BASE, ELTS, SUFFIX)	\
+  if (test_vst1##SUFFIX##_x3 () != 0)	\
+    abort ();
+
+int
+main (int argc, char **argv)
+{
+  VARIANTS (CHECK)
+
+  return 0;
+}
+
Christophe Lyon April 6, 2018, 7:55 p.m. | #3
Hi,

2018-04-06 12:15 GMT+02:00 Sameera Deshpande <sameera.deshpande@linaro.org>:
> Hi Christophe,

>

> Please find attached the updated patch with testcases.

>

> Ok for trunk?


Thanks for the update.

Since the new intrinsics are only available on aarch64, you want to
prevent the tests from running on arm.
Indeed gcc.target/aarch64/advsimd-intrinsics/ is shared between the two targets.
There are several examples on how to do that in that directory.

I have also noticed that the tests fail at execution on aarch64_be.

I didn't look at the patch in details.

Christophe


>

> - Thanks and regards,

>   Sameera D.

>

> 2017-12-14 22:17 GMT+05:30 Christophe Lyon <christophe.lyon@linaro.org>:

>> 2017-12-14 9:29 GMT+01:00 Sameera Deshpande <sameera.deshpande@linaro.org>:

>>> Hi!

>>>

>>> Please find attached the patch implementing vld1_*_x3, vst1_*_x2 and

>>> vst1_*_x3 intrinsics as defined by Neon document.

>>>

>>> Ok for trunk?

>>>

>>> - Thanks and regards,

>>>   Sameera D.

>>>

>>> gcc/Changelog:

>>>

>>> 2017-11-14  Sameera Deshpande  <sameera.deshpande@linaro.org>

>>>

>>>

>>>         * config/aarch64/aarch64-simd-builtins.def (ld1x3): New.

>>>         (st1x2): Likewise.

>>>         (st1x3): Likewise.

>>>         * config/aarch64/aarch64-simd.md

>>> (aarch64_ld1x3<VALLDIF:mode>): New pattern.

>>>         (aarch64_ld1_x3_<mode>): Likewise

>>>         (aarch64_st1x2<VALLDIF:mode>): Likewise

>>>         (aarch64_st1_x2_<mode>): Likewise

>>>         (aarch64_st1x3<VALLDIF:mode>): Likewise

>>>         (aarch64_st1_x3_<mode>): Likewise

>>>         * config/aarch64/arm_neon.h (vld1_u8_x3): New function.

>>>         (vld1_s8_x3): Likewise.

>>>         (vld1_u16_x3): Likewise.

>>>         (vld1_s16_x3): Likewise.

>>>         (vld1_u32_x3): Likewise.

>>>         (vld1_s32_x3): Likewise.

>>>         (vld1_u64_x3): Likewise.

>>>         (vld1_s64_x3): Likewise.

>>>         (vld1_fp16_x3): Likewise.

>>>         (vld1_f32_x3): Likewise.

>>>         (vld1_f64_x3): Likewise.

>>>         (vld1_p8_x3): Likewise.

>>>         (vld1_p16_x3): Likewise.

>>>         (vld1_p64_x3): Likewise.

>>>         (vld1q_u8_x3): Likewise.

>>>         (vld1q_s8_x3): Likewise.

>>>         (vld1q_u16_x3): Likewise.

>>>         (vld1q_s16_x3): Likewise.

>>>         (vld1q_u32_x3): Likewise.

>>>         (vld1q_s32_x3): Likewise.

>>>         (vld1q_u64_x3): Likewise.

>>>         (vld1q_s64_x3): Likewise.

>>>         (vld1q_f16_x3): Likewise.

>>>         (vld1q_f32_x3): Likewise.

>>>         (vld1q_f64_x3): Likewise.

>>>         (vld1q_p8_x3): Likewise.

>>>         (vld1q_p16_x3): Likewise.

>>>         (vld1q_p64_x3): Likewise.

>>>         (vst1_s64_x2): Likewise.

>>>         (vst1_u64_x2): Likewise.

>>>         (vst1_f64_x2): Likewise.patchurl=http://people.linaro.org/~christophe.lyon/armv8_2-fp16-scalar-2.patch3

patchname=armv8_2-fp16-scalar-2.patch3
refrev=259064
email_to=christophe.lyon@linaro.org

>>>         (vst1_s8_x2): Likewise.

>>>         (vst1_p8_x2): Likewise.

>>>         (vst1_s16_x2): Likewise.

>>>         (vst1_p16_x2): Likewise.

>>>         (vst1_s32_x2): Likewise.

>>>         (vst1_u8_x2): Likewise.

>>>         (vst1_u16_x2): Likewise.

>>>         (vst1_u32_x2): Likewise.

>>>         (vst1_f16_x2): Likewise.

>>>         (vst1_f32_x2): Likewise.

>>>         (vst1_p64_x2): Likewise.

>>>         (vst1q_s8_x2): Likewise.

>>>         (vst1q_p8_x2): Likewise.

>>>         (vst1q_s16_x2): Likewise.

>>>         (vst1q_p16_x2): Likewise.

>>>         (vst1q_s32_x2): Likewise.

>>>         (vst1q_s64_x2): Likewise.

>>>         (vst1q_u8_x2): Likewise.

>>>         (vst1q_u16_x2): Likewise.

>>>         (vst1q_u32_x2): Likewise.

>>>         (vst1q_u64_x2): Likewise.

>>>         (vst1q_f16_x2): Likewise.

>>>         (vst1q_f32_x2): Likewise.

>>>         (vst1q_f64_x2): Likewise.

>>>         (vst1q_p64_x2): Likewise.

>>>         (vst1_s64_x3): Likewise.

>>>         (vst1_u64_x3): Likewise.

>>>         (vst1_f64_x3): Likewise.

>>>         (vst1_s8_x3): Likewise.

>>>         (vst1_p8_x3): Likewise.

>>>         (vst1_s16_x3): Likewise.

>>>         (vst1_p16_x3): Likewise.

>>>         (vst1_s32_x3): Likewise.

>>>         (vst1_u8_x3): Likewise.

>>>         (vst1_u16_x3): Likewise.

>>>         (vst1_u32_x3): Likewise.

>>>         (vst1_f16_x3): Likewise.

>>>         (vst1_f32_x3): Likewise.

>>>         (vst1_p64_x3): Likewise.

>>>         (vst1q_s8_x3): Likewise.

>>>         (vst1q_p8_x3): Likewise.

>>>         (vst1q_s16_x3): Likewise.

>>>         (vst1q_p16_x3): Likewise.

>>>         (vst1q_s32_x3): Likewise.

>>>         (vst1q_s64_x3): Likewise.

>>>         (vst1q_u8_x3): Likewise.

>>>         (vst1q_u16_x3): Likewise.

>>>         (vst1q_u32_x3): Likewise.

>>>         (vst1q_u64_x3): Likewise.

>>>         (vst1q_f16_x3): Likewise.

>>>         (vst1q_f32_x3): Likewise.

>>>         (vst1q_f64_x3): Likewise.

>>>         (vst1q_p64_x3): Likewise.

>>

>> Hi,

>> I'm not a maintainer, but I suspect you should add some tests.

>>

>> Christophe

>

>

>

> --

> - Thanks and regards,

>   Sameera D.
Sameera Deshpande April 10, 2018, 10:20 a.m. | #4
On 7 April 2018 at 01:25, Christophe Lyon <christophe.lyon@linaro.org> wrote:
> Hi,

>

> 2018-04-06 12:15 GMT+02:00 Sameera Deshpande <sameera.deshpande@linaro.org>:

>> Hi Christophe,

>>

>> Please find attached the updated patch with testcases.

>>

>> Ok for trunk?

>

> Thanks for the update.

>

> Since the new intrinsics are only available on aarch64, you want to

> prevent the tests from running on arm.

> Indeed gcc.target/aarch64/advsimd-intrinsics/ is shared between the two targets.

> There are several examples on how to do that in that directory.

>

> I have also noticed that the tests fail at execution on aarch64_be.

>

> I didn't look at the patch in details.

>

> Christophe

>

>

>>

>> - Thanks and regards,

>>   Sameera D.

>>

>> 2017-12-14 22:17 GMT+05:30 Christophe Lyon <christophe.lyon@linaro.org>:

>>> 2017-12-14 9:29 GMT+01:00 Sameera Deshpande <sameera.deshpande@linaro.org>:

>>>> Hi!

>>>>

>>>> Please find attached the patch implementing vld1_*_x3, vst1_*_x2 and

>>>> vst1_*_x3 intrinsics as defined by Neon document.

>>>>

>>>> Ok for trunk?

>>>>

>>>> - Thanks and regards,

>>>>   Sameera D.

>>>>

>>>> gcc/Changelog:

>>>>

>>>> 2017-11-14  Sameera Deshpande  <sameera.deshpande@linaro.org>

>>>>

>>>>

>>>>         * config/aarch64/aarch64-simd-builtins.def (ld1x3): New.

>>>>         (st1x2): Likewise.

>>>>         (st1x3): Likewise.

>>>>         * config/aarch64/aarch64-simd.md

>>>> (aarch64_ld1x3<VALLDIF:mode>): New pattern.

>>>>         (aarch64_ld1_x3_<mode>): Likewise

>>>>         (aarch64_st1x2<VALLDIF:mode>): Likewise

>>>>         (aarch64_st1_x2_<mode>): Likewise

>>>>         (aarch64_st1x3<VALLDIF:mode>): Likewise

>>>>         (aarch64_st1_x3_<mode>): Likewise

>>>>         * config/aarch64/arm_neon.h (vld1_u8_x3): New function.

>>>>         (vld1_s8_x3): Likewise.

>>>>         (vld1_u16_x3): Likewise.

>>>>         (vld1_s16_x3): Likewise.

>>>>         (vld1_u32_x3): Likewise.

>>>>         (vld1_s32_x3): Likewise.

>>>>         (vld1_u64_x3): Likewise.

>>>>         (vld1_s64_x3): Likewise.

>>>>         (vld1_fp16_x3): Likewise.

>>>>         (vld1_f32_x3): Likewise.

>>>>         (vld1_f64_x3): Likewise.

>>>>         (vld1_p8_x3): Likewise.

>>>>         (vld1_p16_x3): Likewise.

>>>>         (vld1_p64_x3): Likewise.

>>>>         (vld1q_u8_x3): Likewise.

>>>>         (vld1q_s8_x3): Likewise.

>>>>         (vld1q_u16_x3): Likewise.

>>>>         (vld1q_s16_x3): Likewise.

>>>>         (vld1q_u32_x3): Likewise.

>>>>         (vld1q_s32_x3): Likewise.

>>>>         (vld1q_u64_x3): Likewise.

>>>>         (vld1q_s64_x3): Likewise.

>>>>         (vld1q_f16_x3): Likewise.

>>>>         (vld1q_f32_x3): Likewise.

>>>>         (vld1q_f64_x3): Likewise.

>>>>         (vld1q_p8_x3): Likewise.

>>>>         (vld1q_p16_x3): Likewise.

>>>>         (vld1q_p64_x3): Likewise.

>>>>         (vst1_s64_x2): Likewise.

>>>>         (vst1_u64_x2): Likewise.

>>>>         (vst1_f64_x2): Likewise.patchurl=http://people.linaro.org/~christophe.lyon/armv8_2-fp16-scalar-2.patch3

> patchname=armv8_2-fp16-scalar-2.patch3

> refrev=259064

> email_to=christophe.lyon@linaro.org

>

>>>>         (vst1_s8_x2): Likewise.

>>>>         (vst1_p8_x2): Likewise.

>>>>         (vst1_s16_x2): Likewise.

>>>>         (vst1_p16_x2): Likewise.

>>>>         (vst1_s32_x2): Likewise.

>>>>         (vst1_u8_x2): Likewise.

>>>>         (vst1_u16_x2): Likewise.

>>>>         (vst1_u32_x2): Likewise.

>>>>         (vst1_f16_x2): Likewise.

>>>>         (vst1_f32_x2): Likewise.

>>>>         (vst1_p64_x2): Likewise.

>>>>         (vst1q_s8_x2): Likewise.

>>>>         (vst1q_p8_x2): Likewise.

>>>>         (vst1q_s16_x2): Likewise.

>>>>         (vst1q_p16_x2): Likewise.

>>>>         (vst1q_s32_x2): Likewise.

>>>>         (vst1q_s64_x2): Likewise.

>>>>         (vst1q_u8_x2): Likewise.

>>>>         (vst1q_u16_x2): Likewise.

>>>>         (vst1q_u32_x2): Likewise.

>>>>         (vst1q_u64_x2): Likewise.

>>>>         (vst1q_f16_x2): Likewise.

>>>>         (vst1q_f32_x2): Likewise.

>>>>         (vst1q_f64_x2): Likewise.

>>>>         (vst1q_p64_x2): Likewise.

>>>>         (vst1_s64_x3): Likewise.

>>>>         (vst1_u64_x3): Likewise.

>>>>         (vst1_f64_x3): Likewise.

>>>>         (vst1_s8_x3): Likewise.

>>>>         (vst1_p8_x3): Likewise.

>>>>         (vst1_s16_x3): Likewise.

>>>>         (vst1_p16_x3): Likewise.

>>>>         (vst1_s32_x3): Likewise.

>>>>         (vst1_u8_x3): Likewise.

>>>>         (vst1_u16_x3): Likewise.

>>>>         (vst1_u32_x3): Likewise.

>>>>         (vst1_f16_x3): Likewise.

>>>>         (vst1_f32_x3): Likewise.

>>>>         (vst1_p64_x3): Likewise.

>>>>         (vst1q_s8_x3): Likewise.

>>>>         (vst1q_p8_x3): Likewise.

>>>>         (vst1q_s16_x3): Likewise.

>>>>         (vst1q_p16_x3): Likewise.

>>>>         (vst1q_s32_x3): Likewise.

>>>>         (vst1q_s64_x3): Likewise.

>>>>         (vst1q_u8_x3): Likewise.

>>>>         (vst1q_u16_x3): Likewise.

>>>>         (vst1q_u32_x3): Likewise.

>>>>         (vst1q_u64_x3): Likewise.

>>>>         (vst1q_f16_x3): Likewise.

>>>>         (vst1q_f32_x3): Likewise.

>>>>         (vst1q_f64_x3): Likewise.

>>>>         (vst1q_p64_x3): Likewise.

>>>

>>> Hi,

>>> I'm not a maintainer, but I suspect you should add some tests.

>>>

>>> Christophe

>>

>>

>>

>> --

>> - Thanks and regards,

>>   Sameera D.


Hi Christophe,

Please find attached the updated patch. Similar to the testcase
vld1x2.c, I have updated the testcases to mark them XFAIL for ARM, as
the intrinsics are not implemented yet. I have also added required
target to be little endian.

Ok for thrunk?

- Thanks and regards,
  Sameera D.

gcc/Changelog:

2018-04-10  Sameera Deshpande  <sameera.deshpande@linaro.org>


        * config/aarch64/aarch64-simd-builtins.def (ld1x3): New.
        (st1x2): Likewise.
        (st1x3): Likewise.
        * config/aarch64/aarch64-simd.md
        (aarch64_ld1x3<VALLDIF:mode>): New pattern.
        (aarch64_ld1_x3_<mode>): Likewise
        (aarch64_st1x2<VALLDIF:mode>): Likewise
        (aarch64_st1_x2_<mode>): Likewise
        (aarch64_st1x3<VALLDIF:mode>): Likewise
        (aarch64_st1_x3_<mode>): Likewise
        * config/aarch64/arm_neon.h (vld1_u8_x3): New function.
        (vld1_s8_x3): Likewise.
        (vld1_u16_x3): Likewise.
        (vld1_s16_x3): Likewise.
        (vld1_u32_x3): Likewise.
        (vld1_s32_x3): Likewise.
        (vld1_u64_x3): Likewise.
        (vld1_s64_x3): Likewise.
        (vld1_f16_x3): Likewise.
        (vld1_f32_x3): Likewise.
        (vld1_f64_x3): Likewise.
        (vld1_p8_x3): Likewise.
        (vld1_p16_x3): Likewise.
        (vld1_p64_x3): Likewise.
        (vld1q_u8_x3): Likewise.
        (vld1q_s8_x3): Likewise.
        (vld1q_u16_x3): Likewise.
        (vld1q_s16_x3): Likewise.
        (vld1q_u32_x3): Likewise.
        (vld1q_s32_x3): Likewise.
        (vld1q_u64_x3): Likewise.
        (vld1q_s64_x3): Likewise.
        (vld1q_f16_x3): Likewise.
        (vld1q_f32_x3): Likewise.
        (vld1q_f64_x3): Likewise.
        (vld1q_p8_x3): Likewise.
        (vld1q_p16_x3): Likewise.
        (vld1q_p64_x3): Likewise.
        (vst1_s64_x2): Likewise.
        (vst1_u64_x2): Likewise.
        (vst1_f64_x2): Likewise.
        (vst1_s8_x2): Likewise.
        (vst1_p8_x2): Likewise.
        (vst1_s16_x2): Likewise.
        (vst1_p16_x2): Likewise.
        (vst1_s32_x2): Likewise.
        (vst1_u8_x2): Likewise.
        (vst1_u16_x2): Likewise.
        (vst1_u32_x2): Likewise.
        (vst1_f16_x2): Likewise.
        (vst1_f32_x2): Likewise.
        (vst1_p64_x2): Likewise.
        (vst1q_s8_x2): Likewise.
        (vst1q_p8_x2): Likewise.
        (vst1q_s16_x2): Likewise.
        (vst1q_p16_x2): Likewise.
        (vst1q_s32_x2): Likewise.
        (vst1q_s64_x2): Likewise.
        (vst1q_u8_x2): Likewise.
        (vst1q_u16_x2): Likewise.
        (vst1q_u32_x2): Likewise.
        (vst1q_u64_x2): Likewise.
        (vst1q_f16_x2): Likewise.
        (vst1q_f32_x2): Likewise.
        (vst1q_f64_x2): Likewise.
        (vst1q_p64_x2): Likewise.
        (vst1_s64_x3): Likewise.
        (vst1_u64_x3): Likewise.
        (vst1_f64_x3): Likewise.
        (vst1_s8_x3): Likewise.
        (vst1_p8_x3): Likewise.
        (vst1_s16_x3): Likewise.
        (vst1_p16_x3): Likewise.
        (vst1_s32_x3): Likewise.
        (vst1_u8_x3): Likewise.
        (vst1_u16_x3): Likewise.
        (vst1_u32_x3): Likewise.
        (vst1_f16_x3): Likewise.
        (vst1_f32_x3): Likewise.
        (vst1_p64_x3): Likewise.
        (vst1q_s8_x3): Likewise.
        (vst1q_p8_x3): Likewise.
        (vst1q_s16_x3): Likewise.
        (vst1q_p16_x3): Likewise.
        (vst1q_s32_x3): Likewise.
        (vst1q_s64_x3): Likewise.
        (vst1q_u8_x3): Likewise.
        (vst1q_u16_x3): Likewise.
        (vst1q_u32_x3): Likewise.
        (vst1q_u64_x3): Likewise.
        (vst1q_f16_x3): Likewise.
        (vst1q_f32_x3): Likewise.
        (vst1q_f64_x3): Likewise.
        (vst1q_p64_x3): Likewise.

gcc/testsuite/Changelog:

2018-04-10  Sameera Deshpande  <sameera.deshpande@linaro.org>

        * gcc.target/aarch64/advsimd-intrinsics/vld1x3.c: New test for
vld1x3 intrinsics for aarch64_little_endian.
        * gcc.target/aarch64/advsimd-intrinsics/vst1x2.c: New test for
vst1x2 intrinsics for aarch64_little_endian.
        * gcc.target/aarch64/advsimd-intrinsics/vst1x3.c: New test for
vst1x3 intrinsics for aarch64_little_endian.
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index b383f24..2fd072a 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -445,6 +445,15 @@
   BUILTIN_VALL_F16 (STORE1, st1, 0)
   VAR1(STORE1P, st1, 0, v2di)
 
+  /* Implemented by aarch64_ld1x3<VALLDIF:mode>.  */
+  BUILTIN_VALLDIF (LOADSTRUCT, ld1x3, 0)
+
+  /* Implemented by aarch64_st1x2<VALLDIF:mode>.  */
+  BUILTIN_VALLDIF (STORESTRUCT, st1x2, 0)
+
+  /* Implemented by aarch64_st1x3<VALLDIF:mode>.  */
+  BUILTIN_VALLDIF (STORESTRUCT, st1x3, 0)
+
   /* Implemented by fma<mode>4.  */
   BUILTIN_VHSDF (TERNOP, fma, 4)
   VAR1 (TERNOP, fma, 4, hf)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 3d1f6a0..e197a67 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -5047,6 +5047,70 @@
     }
 })
 
+
+(define_expand "aarch64_ld1x3<VALLDIF:mode>"
+  [(match_operand:CI 0 "register_operand" "=w")
+   (match_operand:DI 1 "register_operand" "r")
+   (unspec:VALLDIF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+  "TARGET_SIMD"
+{
+  rtx mem = gen_rtx_MEM (CImode, operands[1]);
+  emit_insn (gen_aarch64_ld1_x3_<VALLDIF:mode> (operands[0], mem));
+  DONE;
+})
+
+(define_insn "aarch64_ld1_x3_<mode>"
+  [(set (match_operand:CI 0 "register_operand" "=w")
+        (unspec:CI 
+	  [(match_operand:CI 1 "aarch64_simd_struct_operand" "Utv")
+	   (unspec:VALLDIF [(const_int 3)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_LD1))]
+  "TARGET_SIMD"
+  "ld1\\t{%S0.<Vtype> - %U0.<Vtype>}, %1"
+  [(set_attr "type" "neon_load1_3reg<q>")]
+)
+
+(define_expand "aarch64_st1x2<VALLDIF:mode>"
+  [(match_operand:DI 0 "register_operand" "")
+   (match_operand:OI 1 "register_operand" "")
+   (unspec:VALLDIF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+  "TARGET_SIMD"
+{
+  rtx mem = gen_rtx_MEM (OImode, operands[0]);
+  emit_insn (gen_aarch64_st1_x2_<VALLDIF:mode> (mem, operands[1]));
+  DONE;
+})
+
+(define_insn "aarch64_st1_x2_<mode>"
+   [(set (match_operand:OI 0 "aarch64_simd_struct_operand" "=Utv")
+	 (unspec:OI
+	  [(match_operand:OI 1 "register_operand" "w")
+          (unspec:VALLDIF [(const_int 2)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_ST1))]
+  "TARGET_SIMD"
+  "st1\\t{%S1.<Vtype> - %T1.<Vtype>}, %0"
+  [(set_attr "type" "neon_store1_2reg<q>")]
+)
+
+(define_expand "aarch64_st1x3<VALLDIF:mode>"
+  [(match_operand:DI 0 "register_operand" "")
+   (match_operand:CI 1 "register_operand" "")
+   (unspec:VALLDIF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+  "TARGET_SIMD"
+{
+  rtx mem = gen_rtx_MEM (CImode, operands[0]);
+  emit_insn (gen_aarch64_st1_x3_<VALLDIF:mode> (mem, operands[1]));
+  DONE;
+})
+
+(define_insn "aarch64_st1_x3_<mode>"
+   [(set (match_operand:CI 0 "aarch64_simd_struct_operand" "=Utv") 
+	(unspec:CI
+         [(match_operand:CI 1 "register_operand" "w")
+	  (unspec:VALLDIF [(const_int 3)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_ST1))]
+  "TARGET_SIMD"
+  "st1\\t{%S1.<Vtype> - %U1.<Vtype>}, %0"
+  [(set_attr "type" "neon_store1_3reg<q>")]
+)
+
 (define_insn "*aarch64_mov<mode>"
   [(set (match_operand:VSTRUCT 0 "aarch64_simd_nonimmediate_operand" "=w,Utv,w")
 	(match_operand:VSTRUCT 1 "aarch64_simd_general_operand" " w,w,Utv"))]
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index c45c29a..6ac7099 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -17145,6 +17145,374 @@ vld1_u64 (const uint64_t *a)
   return (uint64x1_t) {*a};
 }
 
+/* vld1x3  */
+
+__extension__ extern __inline uint8x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_u8_x3 (const uint8_t *__a)
+{
+  uint8x8x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = (__builtin_aarch64_simd_ci)__builtin_aarch64_ld1x3v8qi ((const __builtin_aarch64_simd_qi *) __a);
+  __i.val[0] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi  (__o, 0);
+  __i.val[1] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi  (__o, 1);
+  __i.val[2] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline int8x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_s8_x3 (const uint8_t *__a)
+{
+  int8x8x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v8qi ((const __builtin_aarch64_simd_qi *) __a);
+  __i.val[0] = (int8x8_t) __builtin_aarch64_get_dregciv8qi  (__o, 0);
+  __i.val[1] = (int8x8_t) __builtin_aarch64_get_dregciv8qi  (__o, 1);
+  __i.val[2] = (int8x8_t) __builtin_aarch64_get_dregciv8qi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline uint16x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_u16_x3 (const uint16_t *__a)
+{
+  uint16x4x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v4hi ((const __builtin_aarch64_simd_hi *) __a);
+  __i.val[0] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi  (__o, 0);
+  __i.val[1] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi  (__o, 1);
+  __i.val[2] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline int16x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_s16_x3 (const int16_t *__a)
+{
+  int16x4x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v4hi ((const __builtin_aarch64_simd_hi *) __a);
+  __i.val[0] = (int16x4_t) __builtin_aarch64_get_dregciv4hi  (__o, 0);
+  __i.val[1] = (int16x4_t) __builtin_aarch64_get_dregciv4hi  (__o, 1);
+  __i.val[2] = (int16x4_t) __builtin_aarch64_get_dregciv4hi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline uint32x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_u32_x3 (const uint32_t *__a)
+{
+  uint32x2x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v2si ((const __builtin_aarch64_simd_si *) __a);
+  __i.val[0] = (uint32x2_t) __builtin_aarch64_get_dregciv2si  (__o, 0);
+  __i.val[1] = (uint32x2_t) __builtin_aarch64_get_dregciv2si  (__o, 1);
+  __i.val[2] = (uint32x2_t) __builtin_aarch64_get_dregciv2si  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline int32x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_s32_x3 (const uint32_t *__a)
+{
+  int32x2x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v2si ((const __builtin_aarch64_simd_si *) __a);
+  __i.val[0] = (int32x2_t) __builtin_aarch64_get_dregciv2si  (__o, 0);
+  __i.val[1] = (int32x2_t) __builtin_aarch64_get_dregciv2si  (__o, 1);
+  __i.val[2] = (int32x2_t) __builtin_aarch64_get_dregciv2si  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline uint64x1x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_u64_x3 (const uint64_t *__a)
+{
+  uint64x1x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3di ((const __builtin_aarch64_simd_di *) __a);
+  __i.val[0] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 0);
+  __i.val[1] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 1);
+  __i.val[2] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline int64x1x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_s64_x3 (const int64_t *__a)
+{
+  int64x1x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3di ((const __builtin_aarch64_simd_di *) __a);
+  __i.val[0] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 0);
+  __i.val[1] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 1);
+  __i.val[2] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 2);
+
+  return __i;
+}
+
+__extension__ extern __inline float16x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_f16_x3 (const float16_t *__a)
+{
+  float16x4x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v4hf ((const __builtin_aarch64_simd_hf *) __a);
+  __i.val[0] = (float16x4_t) __builtin_aarch64_get_dregciv4hf  (__o, 0);
+  __i.val[1] = (float16x4_t) __builtin_aarch64_get_dregciv4hf  (__o, 1);
+  __i.val[2] = (float16x4_t) __builtin_aarch64_get_dregciv4hf  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline float32x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_f32_x3 (const float32_t *__a)
+{
+  float32x2x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v2sf ((const __builtin_aarch64_simd_sf *) __a);
+  __i.val[0] = (float32x2_t) __builtin_aarch64_get_dregciv2sf  (__o, 0);
+  __i.val[1] = (float32x2_t) __builtin_aarch64_get_dregciv2sf  (__o, 1);
+  __i.val[2] = (float32x2_t) __builtin_aarch64_get_dregciv2sf  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline float64x1x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_f64_x3 (const float64_t *__a)
+{
+  float64x1x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3df ((const __builtin_aarch64_simd_df *) __a);
+  __i.val[0] = (float64x1_t) __builtin_aarch64_get_dregcidi (__o, 0);
+  __i.val[1] = (float64x1_t) __builtin_aarch64_get_dregcidi (__o, 1);
+  __i.val[2] = (float64x1_t) __builtin_aarch64_get_dregcidi (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline poly8x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_p8_x3 (const poly8_t *__a)
+{
+  poly8x8x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v8qi ((const __builtin_aarch64_simd_qi *) __a);
+  __i.val[0] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi  (__o, 0);
+  __i.val[1] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi  (__o, 1);
+  __i.val[2] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline poly16x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_p16_x3 (const poly16_t *__a)
+{
+  poly16x4x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v4hi ((const __builtin_aarch64_simd_hi *) __a);
+  __i.val[0] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi  (__o, 0);
+  __i.val[1] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi  (__o, 1);
+  __i.val[2] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline poly64x1x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_p64_x3 (const poly64_t *__a)
+{
+  poly64x1x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3di ((const __builtin_aarch64_simd_di *) __a);
+  __i.val[0] = (poly64x1_t) __builtin_aarch64_get_dregcidi (__o, 0);
+  __i.val[1] = (poly64x1_t) __builtin_aarch64_get_dregcidi (__o, 1);
+  __i.val[2] = (poly64x1_t) __builtin_aarch64_get_dregcidi (__o, 2);
+
+return __i;
+}
+
+__extension__ extern __inline uint8x16x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_u8_x3 (const uint8_t *__a)
+{
+  uint8x16x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v16qi ((const __builtin_aarch64_simd_qi *) __a);
+  __i.val[0] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi  (__o, 0);
+  __i.val[1] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi  (__o, 1);
+  __i.val[2] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline int8x16x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_s8_x3 (const int8_t *__a)
+{
+  int8x16x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v16qi ((const __builtin_aarch64_simd_qi *) __a);
+  __i.val[0] = (int8x16_t) __builtin_aarch64_get_qregciv16qi  (__o, 0);
+  __i.val[1] = (int8x16_t) __builtin_aarch64_get_qregciv16qi  (__o, 1);
+  __i.val[2] = (int8x16_t) __builtin_aarch64_get_qregciv16qi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline uint16x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_u16_x3 (const uint16_t *__a)
+{
+  uint16x8x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v8hi ((const __builtin_aarch64_simd_hi *) __a);
+  __i.val[0] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi  (__o, 0);
+  __i.val[1] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi  (__o, 1);
+  __i.val[2] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline int16x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_s16_x3 (const int16_t *__a)
+{
+  int16x8x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v8hi ((const __builtin_aarch64_simd_hi *) __a);
+  __i.val[0] = (int16x8_t) __builtin_aarch64_get_qregciv8hi  (__o, 0);
+  __i.val[1] = (int16x8_t) __builtin_aarch64_get_qregciv8hi  (__o, 1);
+  __i.val[2] = (int16x8_t) __builtin_aarch64_get_qregciv8hi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline uint32x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_u32_x3 (const uint32_t *__a)
+{
+  uint32x4x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v4si ((const __builtin_aarch64_simd_si *) __a);
+  __i.val[0] = (uint32x4_t) __builtin_aarch64_get_qregciv4si  (__o, 0);
+  __i.val[1] = (uint32x4_t) __builtin_aarch64_get_qregciv4si  (__o, 1);
+  __i.val[2] = (uint32x4_t) __builtin_aarch64_get_qregciv4si  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline int32x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_s32_x3 (const int32_t *__a)
+{
+  int32x4x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v4si ((const __builtin_aarch64_simd_si *) __a);
+  __i.val[0] = (int32x4_t) __builtin_aarch64_get_qregciv4si  (__o, 0);
+  __i.val[1] = (int32x4_t) __builtin_aarch64_get_qregciv4si  (__o, 1);
+  __i.val[2] = (int32x4_t) __builtin_aarch64_get_qregciv4si  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline uint64x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_u64_x3 (const uint64_t *__a)
+{
+  uint64x2x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v2di ((const __builtin_aarch64_simd_di *) __a);
+  __i.val[0] = (uint64x2_t) __builtin_aarch64_get_qregciv2di  (__o, 0);
+  __i.val[1] = (uint64x2_t) __builtin_aarch64_get_qregciv2di  (__o, 1);
+  __i.val[2] = (uint64x2_t) __builtin_aarch64_get_qregciv2di  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline int64x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_s64_x3 (const int64_t *__a)
+{
+  int64x2x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v2di ((const __builtin_aarch64_simd_di *) __a);
+  __i.val[0] = (int64x2_t) __builtin_aarch64_get_qregciv2di  (__o, 0);
+  __i.val[1] = (int64x2_t) __builtin_aarch64_get_qregciv2di  (__o, 1);
+  __i.val[2] = (int64x2_t) __builtin_aarch64_get_qregciv2di  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline float16x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_f16_x3 (const float16_t *__a)
+{
+  float16x8x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v8hf ((const __builtin_aarch64_simd_hf *) __a);
+  __i.val[0] = (float16x8_t) __builtin_aarch64_get_qregciv8hf  (__o, 0);
+  __i.val[1] = (float16x8_t) __builtin_aarch64_get_qregciv8hf  (__o, 1);
+  __i.val[2] = (float16x8_t) __builtin_aarch64_get_qregciv8hf  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline float32x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_f32_x3 (const float32_t *__a)
+{
+  float32x4x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v4sf ((const __builtin_aarch64_simd_sf *) __a);
+  __i.val[0] = (float32x4_t) __builtin_aarch64_get_qregciv4sf  (__o, 0);
+  __i.val[1] = (float32x4_t) __builtin_aarch64_get_qregciv4sf  (__o, 1);
+  __i.val[2] = (float32x4_t) __builtin_aarch64_get_qregciv4sf  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline float64x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_f64_x3 (const float64_t *__a)
+{
+  float64x2x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v2df ((const __builtin_aarch64_simd_df *) __a);
+  __i.val[0] = (float64x2_t) __builtin_aarch64_get_qregciv2df  (__o, 0);
+  __i.val[1] = (float64x2_t) __builtin_aarch64_get_qregciv2df  (__o, 1);
+  __i.val[2] = (float64x2_t) __builtin_aarch64_get_qregciv2df  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline poly8x16x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_p8_x3 (const poly8_t *__a)
+{
+  poly8x16x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v8hi ((const __builtin_aarch64_simd_hi *) __a);
+  __i.val[0] = (poly8x16_t) __builtin_aarch64_get_qregciv8hi  (__o, 0);
+  __i.val[1] = (poly8x16_t) __builtin_aarch64_get_qregciv8hi  (__o, 1);
+  __i.val[2] = (poly8x16_t) __builtin_aarch64_get_qregciv8hi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline poly16x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_p16_x3 (const poly16_t *__a)
+{
+  poly16x8x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v8hi ((const __builtin_aarch64_simd_hi *) __a);
+  __i.val[0] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi  (__o, 0);
+  __i.val[1] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi  (__o, 1);
+  __i.val[2] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline poly64x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_p64_x3 (const poly64_t *__a)
+{
+  poly64x2x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v2di ((const __builtin_aarch64_simd_di *) __a);
+  __i.val[0] = (poly64x2_t) __builtin_aarch64_get_qregciv2di  (__o, 0);
+  __i.val[1] = (poly64x2_t) __builtin_aarch64_get_qregciv2di  (__o, 1);
+  __i.val[2] = (poly64x2_t) __builtin_aarch64_get_qregciv2di  (__o, 2);
+  return __i;
+}
+
 /* vld1q */
 
 __extension__ extern __inline float16x8_t
@@ -27497,6 +27865,706 @@ vst1q_lane_u64 (uint64_t *__a, uint64x2_t __b, const int __lane)
   *__a = __aarch64_vget_lane_any (__b, __lane);
 }
 
+/* vst1x2 */
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s64_x2 (int64_t * __a, int64x1x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  int64x2x2_t temp;
+  temp.val[0] = vcombine_s64 (val.val[0], vcreate_s64 (__AARCH64_INT64_C (0)));
+  temp.val[1] = vcombine_s64 (val.val[1], vcreate_s64 (__AARCH64_INT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u64_x2 (uint64_t * __a, uint64x1x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  uint64x2x2_t temp;
+  temp.val[0] = vcombine_u64 (val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_u64 (val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_f64_x2 (float64_t * __a, float64x1x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  float64x2x2_t temp;
+  temp.val[0] = vcombine_f64 (val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_f64 (val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2df ((__builtin_aarch64_simd_df *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s8_x2 (int8_t * __a, int8x8x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  int8x16x2_t temp;
+  temp.val[0] = vcombine_s8 (val.val[0], vcreate_s8 (__AARCH64_INT64_C (0)));
+  temp.val[1] = vcombine_s8 (val.val[1], vcreate_s8 (__AARCH64_INT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_p8_x2 (poly8_t * __a, poly8x8x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  poly8x16x2_t temp;
+  temp.val[0] = vcombine_p8 (val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_p8 (val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s16_x2 (int16_t * __a, int16x4x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  int16x8x2_t temp;
+  temp.val[0] = vcombine_s16 (val.val[0], vcreate_s16 (__AARCH64_INT64_C (0)));
+  temp.val[1] = vcombine_s16 (val.val[1], vcreate_s16 (__AARCH64_INT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_p16_x2 (poly16_t * __a, poly16x4x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  poly16x8x2_t temp;
+  temp.val[0] = vcombine_p16 (val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_p16 (val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s32_x2 (int32_t * __a, int32x2x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  int32x4x2_t temp;
+  temp.val[0] = vcombine_s32 (val.val[0], vcreate_s32 (__AARCH64_INT64_C (0)));
+  temp.val[1] = vcombine_s32 (val.val[1], vcreate_s32 (__AARCH64_INT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2v2si ((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u8_x2 (uint8_t * __a, uint8x8x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  uint8x16x2_t temp;
+  temp.val[0] = vcombine_u8 (val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_u8 (val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u16_x2 (uint16_t * __a, uint16x4x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  uint16x8x2_t temp;
+  temp.val[0] = vcombine_u16 (val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_u16 (val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u32_x2 (uint32_t * __a, uint32x2x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  uint32x4x2_t temp;
+  temp.val[0] = vcombine_u32 (val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_u32 (val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2v2si ((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_f16_x2 (float16_t * __a, float16x4x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  float16x8x2_t temp;
+  temp.val[0] = vcombine_f16 (val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_f16 (val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv8hf (__o, temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv8hf (__o, temp.val[1], 1);
+  __builtin_aarch64_st1x2v4hf (__a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_f32_x2 (float32_t * __a, float32x2x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  float32x4x2_t temp;
+  temp.val[0] = vcombine_f32 (val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_f32 (val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2v2sf ((__builtin_aarch64_simd_sf *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_p64_x2 (poly64_t * __a, poly64x1x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  poly64x2x2_t temp;
+  temp.val[0] = vcombine_p64 (val.val[0], vcreate_p64 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_p64 (val.val[1], vcreate_p64 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv2di_ssps (__o,
+					       (poly64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2di_ssps (__o,
+					       (poly64x2_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s8_x2 (int8_t * __a, int8x16x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_p8_x2 (poly8_t * __a, poly8x16x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s16_x2 (int16_t * __a, int16x8x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_p16_x2 (poly16_t * __a, poly16x8x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s32_x2 (int32_t * __a, int32x4x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v4si ((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s64_x2 (int64_t * __a, int64x2x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u8_x2 (uint8_t * __a, uint8x16x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u16_x2 (uint16_t * __a, uint16x8x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u32_x2 (uint32_t * __a, uint32x4x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v4si ((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u64_x2 (uint64_t * __a, uint64x2x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_f16_x2 (float16_t * __a, float16x8x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv8hf (__o, val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv8hf (__o, val.val[1], 1);
+  __builtin_aarch64_st1x2v8hf (__a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_f32_x2 (float32_t * __a, float32x4x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v4sf ((__builtin_aarch64_simd_sf *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_f64_x2 (float64_t * __a, float64x2x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v2df ((__builtin_aarch64_simd_df *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_p64_x2 (poly64_t * __a, poly64x2x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv2di_ssps (__o,
+					       (poly64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2di_ssps (__o,
+					       (poly64x2_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+/* vst1x3 */
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s64_x3 (int64_t * __a, int64x1x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  int64x2x3_t temp;
+  temp.val[0] = vcombine_s64 (val.val[0], vcreate_s64 (__AARCH64_INT64_C (0)));
+  temp.val[1] = vcombine_s64 (val.val[1], vcreate_s64 (__AARCH64_INT64_C (0)));
+  temp.val[2] = vcombine_s64 (val.val[2], vcreate_s64 (__AARCH64_INT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u64_x3 (uint64_t * __a, uint64x1x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  uint64x2x3_t temp;
+  temp.val[0] = vcombine_u64 (val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_u64 (val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_u64 (val.val[2], vcreate_u64 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_f64_x3 (float64_t * __a, float64x1x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  float64x2x3_t temp;
+  temp.val[0] = vcombine_f64 (val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_f64 (val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_f64 (val.val[2], vcreate_f64 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3df ((__builtin_aarch64_simd_df *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s8_x3 (int8_t * __a, int8x8x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  int8x16x3_t temp;
+  temp.val[0] = vcombine_s8 (val.val[0], vcreate_s8 (__AARCH64_INT64_C (0)));
+  temp.val[1] = vcombine_s8 (val.val[1], vcreate_s8 (__AARCH64_INT64_C (0)));
+  temp.val[2] = vcombine_s8 (val.val[2], vcreate_s8 (__AARCH64_INT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_p8_x3 (poly8_t * __a, poly8x8x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  poly8x16x3_t temp;
+  temp.val[0] = vcombine_p8 (val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_p8 (val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_p8 (val.val[2], vcreate_p8 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s16_x3 (int16_t * __a, int16x4x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  int16x8x3_t temp;
+  temp.val[0] = vcombine_s16 (val.val[0], vcreate_s16 (__AARCH64_INT64_C (0)));
+  temp.val[1] = vcombine_s16 (val.val[1], vcreate_s16 (__AARCH64_INT64_C (0)));
+  temp.val[2] = vcombine_s16 (val.val[2], vcreate_s16 (__AARCH64_INT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_p16_x3 (poly16_t * __a, poly16x4x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  poly16x8x3_t temp;
+  temp.val[0] = vcombine_p16 (val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_p16 (val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_p16 (val.val[2], vcreate_p16 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s32_x3 (int32_t * __a, int32x2x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  int32x4x3_t temp;
+  temp.val[0] = vcombine_s32 (val.val[0], vcreate_s32 (__AARCH64_INT64_C (0)));
+  temp.val[1] = vcombine_s32 (val.val[1], vcreate_s32 (__AARCH64_INT64_C (0)));
+  temp.val[2] = vcombine_s32 (val.val[2], vcreate_s32 (__AARCH64_INT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3v2si ((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u8_x3 (uint8_t * __a, uint8x8x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  uint8x16x3_t temp;
+  temp.val[0] = vcombine_u8 (val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_u8 (val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_u8 (val.val[2], vcreate_u8 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u16_x3 (uint16_t * __a, uint16x4x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  uint16x8x3_t temp;
+  temp.val[0] = vcombine_u16 (val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_u16 (val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_u16 (val.val[2], vcreate_u16 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u32_x3 (uint32_t * __a, uint32x2x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  uint32x4x3_t temp;
+  temp.val[0] = vcombine_u32 (val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_u32 (val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_u32 (val.val[2], vcreate_u32 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3v2si ((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_f16_x3 (float16_t * __a, float16x4x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  float16x8x3_t temp;
+  temp.val[0] = vcombine_f16 (val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_f16 (val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_f16 (val.val[2], vcreate_f16 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3v4hf ((__builtin_aarch64_simd_hf *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_f32_x3 (float32_t * __a, float32x2x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  float32x4x3_t temp;
+  temp.val[0] = vcombine_f32 (val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_f32 (val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_f32 (val.val[2], vcreate_f32 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3v2sf ((__builtin_aarch64_simd_sf *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_p64_x3 (poly64_t * __a, poly64x1x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  poly64x2x3_t temp;
+  temp.val[0] = vcombine_p64 (val.val[0], vcreate_p64 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_p64 (val.val[1], vcreate_p64 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_p64 (val.val[2], vcreate_p64 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
+					       (poly64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
+					       (poly64x2_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
+					       (poly64x2_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s8_x3 (int8_t * __a, int8x16x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_p8_x3 (poly8_t * __a, poly8x16x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s16_x3 (int16_t * __a, int16x8x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_p16_x3 (poly16_t * __a, poly16x8x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s32_x3 (int32_t * __a, int32x4x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v4si ((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s64_x3 (int64_t * __a, int64x2x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u8_x3 (uint8_t * __a, uint8x16x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u16_x3 (uint16_t * __a, uint16x8x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u32_x3 (uint32_t * __a, uint32x4x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v4si ((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u64_x3 (uint64_t * __a, uint64x2x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_f16_x3 (float16_t * __a, float16x8x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v8hf ((__builtin_aarch64_simd_hf *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_f32_x3 (float32_t * __a, float32x4x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v4sf ((__builtin_aarch64_simd_sf *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_f64_x3 (float64_t * __a, float64x2x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v2df ((__builtin_aarch64_simd_df *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_p64_x3 (poly64_t * __a, poly64x2x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
+					       (poly64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
+					       (poly64x2_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
+					       (poly64x2_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
 /* vstn */
 
 __extension__ extern __inline void
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1x3.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1x3.c
new file mode 100644
index 0000000..c37c72c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1x3.c
@@ -0,0 +1,82 @@
+/* We haven't implemented these intrinsics for arm yet.  */
+/* { dg-xfail-if "" { arm*-*-* } } */
+/* { dg-require-effective-target aarch64_little_endian } */
+/* { dg-do run } */
+/* { dg-options "-O3" } */
+
+#include <arm_neon.h>
+
+extern void abort (void);
+
+#define TESTMETH(BASE, ELTS, SUFFIX)	\
+int __attribute__ ((noinline))			\
+test_vld##SUFFIX##_x3 ()			\
+{						\
+  BASE##_t data[ELTS * 3];			\
+  BASE##_t temp[ELTS * 3];			\
+  BASE##x##ELTS##x##3##_t vectors;		\
+  int i,j;					\
+  for (i = 0; i < ELTS * 3; i++)		\
+    data [i] = (BASE##_t) 3*i;		\
+  asm volatile ("" : : : "memory");		\
+  vectors = vld1##SUFFIX##_x3 (data);		\
+  vst1##SUFFIX (temp, vectors.val[0]);		\
+  vst1##SUFFIX (&temp[ELTS], vectors.val[1]);	\
+  vst1##SUFFIX (&temp[ELTS * 2], vectors.val[2]);	\
+  asm volatile ("" : : : "memory");		\
+  for (j = 0; j < ELTS * 3; j++)		\
+    if (temp[j] != data[j]) 			\
+      return 1;					\
+  return 0;					\
+}
+
+#define VARIANTS_1(VARIANT)	\
+VARIANT (uint8, 8, _u8)		\
+VARIANT (uint16, 4, _u16)	\
+VARIANT (uint32, 2, _u32)	\
+VARIANT (uint64, 1, _u64)	\
+VARIANT (int8, 8, _s8)		\
+VARIANT (int16, 4, _s16)	\
+VARIANT (int32, 2, _s32)	\
+VARIANT (int64, 1, _s64)	\
+VARIANT (poly8, 8, _p8)		\
+VARIANT (poly16, 4, _p16)	\
+VARIANT (float16, 4, _f16)	\
+VARIANT (float32, 2, _f32)	\
+VARIANT (uint8, 16, q_u8)	\
+VARIANT (uint16, 8, q_u16)	\
+VARIANT (uint32, 4, q_u32)	\
+VARIANT (uint64, 2, q_u64)	\
+VARIANT (int8, 16, q_s8)	\
+VARIANT (int16, 8, q_s16)	\
+VARIANT (int32, 4, q_s32)	\
+VARIANT (int64, 2, q_s64)	\
+VARIANT (poly8, 16, q_p8)	\
+VARIANT (poly16, 8, q_p16)	\
+VARIANT (float16, 8, q_f16)	\
+VARIANT (float32, 4, q_f32)
+
+#ifdef __aarch64__
+#define VARIANTS(VARIANT) VARIANTS_1(VARIANT)	\
+VARIANT (float64, 1, _f64)			\
+VARIANT (float64, 2, q_f64)
+#else
+#define VARIANTS(VARIANT) VARIANTS_1(VARIANT)
+#endif
+
+
+/* Tests of vld1_x3 and vld1q_x3.  */
+VARIANTS (TESTMETH)
+
+#define CHECK(BASE, ELTS, SUFFIX)	\
+  if (test_vld##SUFFIX##_x3 () != 0)	\
+    abort ();
+
+int
+main (int argc, char **argv)
+{
+  VARIANTS (CHECK)
+
+  return 0;
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1x2.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1x2.c
new file mode 100644
index 0000000..3b6797c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1x2.c
@@ -0,0 +1,80 @@
+/* We haven't implemented these intrinsics for arm yet.  */
+/* { dg-xfail-if "" { arm*-*-* } } */
+/* { dg-require-effective-target aarch64_little_endian } */
+/* { dg-do run } */
+/* { dg-options "-O3" } */
+
+#include <arm_neon.h>
+
+extern void abort (void);
+
+#define TESTMETH(BASE, ELTS, SUFFIX)	\
+int __attribute__ ((noinline))			\
+test_vst1##SUFFIX##_x2 ()			\
+{						\
+  BASE##_t data[ELTS * 2];			\
+  BASE##_t temp[ELTS * 2];			\
+  BASE##x##ELTS##x##2##_t vectors;		\
+  int i,j;					\
+  for (i = 0; i < ELTS * 2; i++)		\
+    data [i] = (BASE##_t) 2*i;		\
+  asm volatile ("" : : : "memory");		\
+  vectors.val[0] = vld1##SUFFIX (data);		\
+  vectors.val[1] = vld1##SUFFIX (&data[ELTS]);	\
+  vst1##SUFFIX##_x2 (temp, vectors);		\
+  asm volatile ("" : : : "memory");		\
+  for (j = 0; j < ELTS * 2; j++)		\
+    if (temp[j] != data[j])			\
+      return 1;					\
+  return 0;					\
+}
+
+#define VARIANTS_1(VARIANT)	\
+VARIANT (uint8, 8, _u8)		\
+VARIANT (uint16, 4, _u16)	\
+VARIANT (uint32, 2, _u32)	\
+VARIANT (uint64, 1, _u64)	\
+VARIANT (int8, 8, _s8)		\
+VARIANT (int16, 4, _s16)	\
+VARIANT (int32, 2, _s32)	\
+VARIANT (int64, 1, _s64)	\
+VARIANT (poly8, 8, _p8)		\
+VARIANT (poly16, 4, _p16)	\
+VARIANT (float16, 4, _f16)	\
+VARIANT (float32, 2, _f32)	\
+VARIANT (uint8, 16, q_u8)	\
+VARIANT (uint16, 8, q_u16)	\
+VARIANT (uint32, 4, q_u32)	\
+VARIANT (uint64, 2, q_u64)	\
+VARIANT (int8, 16, q_s8)	\
+VARIANT (int16, 8, q_s16)	\
+VARIANT (int32, 4, q_s32)	\
+VARIANT (int64, 2, q_s64)	\
+VARIANT (poly8, 16, q_p8)	\
+VARIANT (poly16, 8, q_p16)	\
+VARIANT (float16, 8, q_f16)	\
+VARIANT (float32, 4, q_f32)
+
+#ifdef __aarch64__
+#define VARIANTS(VARIANT) VARIANTS_1(VARIANT)	\
+VARIANT (float64, 1, _f64)			\
+VARIANT (float64, 2, q_f64)
+#else
+#define VARIANTS(VARIANT) VARIANTS_1(VARIANT)
+#endif
+
+/* Tests of vst1_x2 and vst1q_x2.  */
+VARIANTS (TESTMETH)
+
+#define CHECK(BASE, ELTS, SUFFIX)	\
+  if (test_vst1##SUFFIX##_x2 () != 0)	\
+    abort ();
+
+int
+main (int argc, char **argv)
+{
+  VARIANTS (CHECK)
+
+  return 0;
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1x3.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1x3.c
new file mode 100644
index 0000000..1709115
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1x3.c
@@ -0,0 +1,81 @@
+/* We haven't implemented these intrinsics for arm yet.  */
+/* { dg-xfail-if "" { arm*-*-* } } */
+/* { dg-require-effective-target aarch64_little_endian } */
+/* { dg-do run } */
+/* { dg-options "-O3" } */
+
+#include <arm_neon.h>
+
+extern void abort (void);
+
+#define TESTMETH(BASE, ELTS, SUFFIX)	\
+int __attribute__ ((noinline))			\
+test_vst1##SUFFIX##_x3 ()			\
+{						\
+  BASE##_t data[ELTS * 3];			\
+  BASE##_t temp[ELTS * 3];			\
+  BASE##x##ELTS##x##3##_t vectors;		\
+  int i,j;					\
+  for (i = 0; i < ELTS * 3; i++)		\
+    data [i] = (BASE##_t) 3*i;		\
+  asm volatile ("" : : : "memory");		\
+  vectors.val[0] = vld1##SUFFIX (data);		\
+  vectors.val[1] = vld1##SUFFIX (&data[ELTS]);	\
+  vectors.val[2] = vld1##SUFFIX (&data[ELTS * 2]);	\
+  vst1##SUFFIX##_x3 (temp, vectors);		\
+  asm volatile ("" : : : "memory");		\
+  for (j = 0; j < ELTS * 3; j++)		\
+    if (temp[j] != data[j])			\
+      return 1;					\
+  return 0;					\
+}
+
+#define VARIANTS_1(VARIANT)	\
+VARIANT (uint8, 8, _u8)		\
+VARIANT (uint16, 4, _u16)	\
+VARIANT (uint32, 2, _u32)	\
+VARIANT (uint64, 1, _u64)	\
+VARIANT (int8, 8, _s8)		\
+VARIANT (int16, 4, _s16)	\
+VARIANT (int32, 2, _s32)	\
+VARIANT (int64, 1, _s64)	\
+VARIANT (poly8, 8, _p8)		\
+VARIANT (poly16, 4, _p16)	\
+VARIANT (float16, 4, _f16)	\
+VARIANT (float32, 2, _f32)	\
+VARIANT (uint8, 16, q_u8)	\
+VARIANT (uint16, 8, q_u16)	\
+VARIANT (uint32, 4, q_u32)	\
+VARIANT (uint64, 2, q_u64)	\
+VARIANT (int8, 16, q_s8)	\
+VARIANT (int16, 8, q_s16)	\
+VARIANT (int32, 4, q_s32)	\
+VARIANT (int64, 2, q_s64)	\
+VARIANT (poly8, 16, q_p8)	\
+VARIANT (poly16, 8, q_p16)	\
+VARIANT (float16, 8, q_f16)	\
+VARIANT (float32, 4, q_f32)
+
+#ifdef __aarch64__
+#define VARIANTS(VARIANT) VARIANTS_1(VARIANT)	\
+VARIANT (float64, 1, _f64)			\
+VARIANT (float64, 2, q_f64)
+#else
+#define VARIANTS(VARIANT) VARIANTS_1(VARIANT)
+#endif
+
+/* Tests of vst1_x3 and vst1q_x3.  */
+VARIANTS (TESTMETH)
+
+#define CHECK(BASE, ELTS, SUFFIX)	\
+  if (test_vst1##SUFFIX##_x3 () != 0)	\
+    abort ();
+
+int
+main (int argc, char **argv)
+{
+  VARIANTS (CHECK)
+
+  return 0;
+}
+
Sudakshina Das April 10, 2018, 2:37 p.m. | #5
Hi Sameera

On 10/04/18 11:20, Sameera Deshpande wrote:
> On 7 April 2018 at 01:25, Christophe Lyon <christophe.lyon@linaro.org> wrote:

>> Hi,

>>

>> 2018-04-06 12:15 GMT+02:00 Sameera Deshpande <sameera.deshpande@linaro.org>:

>>> Hi Christophe,

>>>

>>> Please find attached the updated patch with testcases.

>>>

>>> Ok for trunk?

>>

>> Thanks for the update.

>>

>> Since the new intrinsics are only available on aarch64, you want to

>> prevent the tests from running on arm.

>> Indeed gcc.target/aarch64/advsimd-intrinsics/ is shared between the two targets.

>> There are several examples on how to do that in that directory.

>>

>> I have also noticed that the tests fail at execution on aarch64_be.

>>

>> I didn't look at the patch in details.

>>

>> Christophe

>>

>>

>>>

>>> - Thanks and regards,

>>>    Sameera D.

>>>

>>> 2017-12-14 22:17 GMT+05:30 Christophe Lyon <christophe.lyon@linaro.org>:

>>>> 2017-12-14 9:29 GMT+01:00 Sameera Deshpande <sameera.deshpande@linaro.org>:

>>>>> Hi!

>>>>>

>>>>> Please find attached the patch implementing vld1_*_x3, vst1_*_x2 and

>>>>> vst1_*_x3 intrinsics as defined by Neon document.

>>>>>

>>>>> Ok for trunk?

>>>>>

>>>>> - Thanks and regards,

>>>>>    Sameera D.

>>>>>

>>>>> gcc/Changelog:

>>>>>

>>>>> 2017-11-14  Sameera Deshpande  <sameera.deshpande@linaro.org>

>>>>>

>>>>>

>>>>>          * config/aarch64/aarch64-simd-builtins.def (ld1x3): New.

>>>>>          (st1x2): Likewise.

>>>>>          (st1x3): Likewise.

>>>>>          * config/aarch64/aarch64-simd.md

>>>>> (aarch64_ld1x3<VALLDIF:mode>): New pattern.

>>>>>          (aarch64_ld1_x3_<mode>): Likewise

>>>>>          (aarch64_st1x2<VALLDIF:mode>): Likewise

>>>>>          (aarch64_st1_x2_<mode>): Likewise

>>>>>          (aarch64_st1x3<VALLDIF:mode>): Likewise

>>>>>          (aarch64_st1_x3_<mode>): Likewise

>>>>>          * config/aarch64/arm_neon.h (vld1_u8_x3): New function.

>>>>>          (vld1_s8_x3): Likewise.

>>>>>          (vld1_u16_x3): Likewise.

>>>>>          (vld1_s16_x3): Likewise.

>>>>>          (vld1_u32_x3): Likewise.

>>>>>          (vld1_s32_x3): Likewise.

>>>>>          (vld1_u64_x3): Likewise.

>>>>>          (vld1_s64_x3): Likewise.

>>>>>          (vld1_fp16_x3): Likewise.

>>>>>          (vld1_f32_x3): Likewise.

>>>>>          (vld1_f64_x3): Likewise.

>>>>>          (vld1_p8_x3): Likewise.

>>>>>          (vld1_p16_x3): Likewise.

>>>>>          (vld1_p64_x3): Likewise.

>>>>>          (vld1q_u8_x3): Likewise.

>>>>>          (vld1q_s8_x3): Likewise.

>>>>>          (vld1q_u16_x3): Likewise.

>>>>>          (vld1q_s16_x3): Likewise.

>>>>>          (vld1q_u32_x3): Likewise.

>>>>>          (vld1q_s32_x3): Likewise.

>>>>>          (vld1q_u64_x3): Likewise.

>>>>>          (vld1q_s64_x3): Likewise.

>>>>>          (vld1q_f16_x3): Likewise.

>>>>>          (vld1q_f32_x3): Likewise.

>>>>>          (vld1q_f64_x3): Likewise.

>>>>>          (vld1q_p8_x3): Likewise.

>>>>>          (vld1q_p16_x3): Likewise.

>>>>>          (vld1q_p64_x3): Likewise.

>>>>>          (vst1_s64_x2): Likewise.

>>>>>          (vst1_u64_x2): Likewise.

>>>>>          (vst1_f64_x2): Likewise.patchurl=http://people.linaro.org/~christophe.lyon/armv8_2-fp16-scalar-2.patch3

>> patchname=armv8_2-fp16-scalar-2.patch3

>> refrev=259064

>> email_to=christophe.lyon@linaro.org

>>

>>>>>          (vst1_s8_x2): Likewise.

>>>>>          (vst1_p8_x2): Likewise.

>>>>>          (vst1_s16_x2): Likewise.

>>>>>          (vst1_p16_x2): Likewise.

>>>>>          (vst1_s32_x2): Likewise.

>>>>>          (vst1_u8_x2): Likewise.

>>>>>          (vst1_u16_x2): Likewise.

>>>>>          (vst1_u32_x2): Likewise.

>>>>>          (vst1_f16_x2): Likewise.

>>>>>          (vst1_f32_x2): Likewise.

>>>>>          (vst1_p64_x2): Likewise.

>>>>>          (vst1q_s8_x2): Likewise.

>>>>>          (vst1q_p8_x2): Likewise.

>>>>>          (vst1q_s16_x2): Likewise.

>>>>>          (vst1q_p16_x2): Likewise.

>>>>>          (vst1q_s32_x2): Likewise.

>>>>>          (vst1q_s64_x2): Likewise.

>>>>>          (vst1q_u8_x2): Likewise.

>>>>>          (vst1q_u16_x2): Likewise.

>>>>>          (vst1q_u32_x2): Likewise.

>>>>>          (vst1q_u64_x2): Likewise.

>>>>>          (vst1q_f16_x2): Likewise.

>>>>>          (vst1q_f32_x2): Likewise.

>>>>>          (vst1q_f64_x2): Likewise.

>>>>>          (vst1q_p64_x2): Likewise.

>>>>>          (vst1_s64_x3): Likewise.

>>>>>          (vst1_u64_x3): Likewise.

>>>>>          (vst1_f64_x3): Likewise.

>>>>>          (vst1_s8_x3): Likewise.

>>>>>          (vst1_p8_x3): Likewise.

>>>>>          (vst1_s16_x3): Likewise.

>>>>>          (vst1_p16_x3): Likewise.

>>>>>          (vst1_s32_x3): Likewise.

>>>>>          (vst1_u8_x3): Likewise.

>>>>>          (vst1_u16_x3): Likewise.

>>>>>          (vst1_u32_x3): Likewise.

>>>>>          (vst1_f16_x3): Likewise.

>>>>>          (vst1_f32_x3): Likewise.

>>>>>          (vst1_p64_x3): Likewise.

>>>>>          (vst1q_s8_x3): Likewise.

>>>>>          (vst1q_p8_x3): Likewise.

>>>>>          (vst1q_s16_x3): Likewise.

>>>>>          (vst1q_p16_x3): Likewise.

>>>>>          (vst1q_s32_x3): Likewise.

>>>>>          (vst1q_s64_x3): Likewise.

>>>>>          (vst1q_u8_x3): Likewise.

>>>>>          (vst1q_u16_x3): Likewise.

>>>>>          (vst1q_u32_x3): Likewise.

>>>>>          (vst1q_u64_x3): Likewise.

>>>>>          (vst1q_f16_x3): Likewise.

>>>>>          (vst1q_f32_x3): Likewise.

>>>>>          (vst1q_f64_x3): Likewise.

>>>>>          (vst1q_p64_x3): Likewise.

>>>>

>>>> Hi,

>>>> I'm not a maintainer, but I suspect you should add some tests.

>>>>

>>>> Christophe

>>>

>>>

>>>

>>> --

>>> - Thanks and regards,

>>>    Sameera D.

> 

> Hi Christophe,

> 

> Please find attached the updated patch. Similar to the testcase

> vld1x2.c, I have updated the testcases to mark them XFAIL for ARM, as

> the intrinsics are not implemented yet. I have also added required

> target to be little endian.


I am not a maintainer either. Shouldn't these intrinsics be supported
even for big endian?

 From your patch:

> diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1x3.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1x3.c

new file mode 100644
index 0000000..c37c72c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1x3.c
@@ -0,0 +1,82 @@
+/* We haven't implemented these intrinsics for arm yet.  */
+/* { dg-xfail-if "" { arm*-*-* } } */
+/* { dg-require-effective-target aarch64_little_endian } */

According to 
https://gcc.gnu.org/onlinedocs/gccint/Directives.html#Skip-a-test-for-some-targets 
this must follow the dg-do directive.

Also I think the require-effective-target directive will only allow
the test to run on aarch64-*-*-* so the xfail on arm-*-*-* is kind
of not helpful. Maybe something like:

/* { dg-require-effective-target aarch64_little_endian { target { 
aarch64*-*-* } } } */

So that the check is not performed on arm-*-*-* and the test runs
and fails and then the xfail makes more sense.

In case the big endian version is also expected to be implemented
in the future, something like:

/* { dg-xfail-if "" { arm*-*-* || aarch64_big_endian } } */

or

/* { dg-do run { xfail { arm*-*-* || aarch64_big_endian } } } */

would be simpler. (PS: I haven't tested any of these directive myself).


Thanks
Sudi


+/* { dg-do run } */
+/* { dg-options "-O3" } */


> Ok for thrunk?

> 

> - Thanks and regards,

>    Sameera D.

> 

> gcc/Changelog:

> 

> 2018-04-10  Sameera Deshpande  <sameera.deshpande@linaro.org>

> 

> 

>          * config/aarch64/aarch64-simd-builtins.def (ld1x3): New.

>          (st1x2): Likewise.

>          (st1x3): Likewise.

>          * config/aarch64/aarch64-simd.md

>          (aarch64_ld1x3<VALLDIF:mode>): New pattern.

>          (aarch64_ld1_x3_<mode>): Likewise

>          (aarch64_st1x2<VALLDIF:mode>): Likewise

>          (aarch64_st1_x2_<mode>): Likewise

>          (aarch64_st1x3<VALLDIF:mode>): Likewise

>          (aarch64_st1_x3_<mode>): Likewise

>          * config/aarch64/arm_neon.h (vld1_u8_x3): New function.

>          (vld1_s8_x3): Likewise.

>          (vld1_u16_x3): Likewise.

>          (vld1_s16_x3): Likewise.

>          (vld1_u32_x3): Likewise.

>          (vld1_s32_x3): Likewise.

>          (vld1_u64_x3): Likewise.

>          (vld1_s64_x3): Likewise.

>          (vld1_f16_x3): Likewise.

>          (vld1_f32_x3): Likewise.

>          (vld1_f64_x3): Likewise.

>          (vld1_p8_x3): Likewise.

>          (vld1_p16_x3): Likewise.

>          (vld1_p64_x3): Likewise.

>          (vld1q_u8_x3): Likewise.

>          (vld1q_s8_x3): Likewise.

>          (vld1q_u16_x3): Likewise.

>          (vld1q_s16_x3): Likewise.

>          (vld1q_u32_x3): Likewise.

>          (vld1q_s32_x3): Likewise.

>          (vld1q_u64_x3): Likewise.

>          (vld1q_s64_x3): Likewise.

>          (vld1q_f16_x3): Likewise.

>          (vld1q_f32_x3): Likewise.

>          (vld1q_f64_x3): Likewise.

>          (vld1q_p8_x3): Likewise.

>          (vld1q_p16_x3): Likewise.

>          (vld1q_p64_x3): Likewise.

>          (vst1_s64_x2): Likewise.

>          (vst1_u64_x2): Likewise.

>          (vst1_f64_x2): Likewise.

>          (vst1_s8_x2): Likewise.

>          (vst1_p8_x2): Likewise.

>          (vst1_s16_x2): Likewise.

>          (vst1_p16_x2): Likewise.

>          (vst1_s32_x2): Likewise.

>          (vst1_u8_x2): Likewise.

>          (vst1_u16_x2): Likewise.

>          (vst1_u32_x2): Likewise.

>          (vst1_f16_x2): Likewise.

>          (vst1_f32_x2): Likewise.

>          (vst1_p64_x2): Likewise.

>          (vst1q_s8_x2): Likewise.

>          (vst1q_p8_x2): Likewise.

>          (vst1q_s16_x2): Likewise.

>          (vst1q_p16_x2): Likewise.

>          (vst1q_s32_x2): Likewise.

>          (vst1q_s64_x2): Likewise.

>          (vst1q_u8_x2): Likewise.

>          (vst1q_u16_x2): Likewise.

>          (vst1q_u32_x2): Likewise.

>          (vst1q_u64_x2): Likewise.

>          (vst1q_f16_x2): Likewise.

>          (vst1q_f32_x2): Likewise.

>          (vst1q_f64_x2): Likewise.

>          (vst1q_p64_x2): Likewise.

>          (vst1_s64_x3): Likewise.

>          (vst1_u64_x3): Likewise.

>          (vst1_f64_x3): Likewise.

>          (vst1_s8_x3): Likewise.

>          (vst1_p8_x3): Likewise.

>          (vst1_s16_x3): Likewise.

>          (vst1_p16_x3): Likewise.

>          (vst1_s32_x3): Likewise.

>          (vst1_u8_x3): Likewise.

>          (vst1_u16_x3): Likewise.

>          (vst1_u32_x3): Likewise.

>          (vst1_f16_x3): Likewise.

>          (vst1_f32_x3): Likewise.

>          (vst1_p64_x3): Likewise.

>          (vst1q_s8_x3): Likewise.

>          (vst1q_p8_x3): Likewise.

>          (vst1q_s16_x3): Likewise.

>          (vst1q_p16_x3): Likewise.

>          (vst1q_s32_x3): Likewise.

>          (vst1q_s64_x3): Likewise.

>          (vst1q_u8_x3): Likewise.

>          (vst1q_u16_x3): Likewise.

>          (vst1q_u32_x3): Likewise.

>          (vst1q_u64_x3): Likewise.

>          (vst1q_f16_x3): Likewise.

>          (vst1q_f32_x3): Likewise.

>          (vst1q_f64_x3): Likewise.

>          (vst1q_p64_x3): Likewise.

> 

> gcc/testsuite/Changelog:

> 

> 2018-04-10  Sameera Deshpande  <sameera.deshpande@linaro.org>

> 

>          * gcc.target/aarch64/advsimd-intrinsics/vld1x3.c: New test for

> vld1x3 intrinsics for aarch64_little_endian.

>          * gcc.target/aarch64/advsimd-intrinsics/vst1x2.c: New test for

> vst1x2 intrinsics for aarch64_little_endian.

>          * gcc.target/aarch64/advsimd-intrinsics/vst1x3.c: New test for

> vst1x3 intrinsics for aarch64_little_endian.

>
Sameera Deshpande April 11, 2018, 8:04 a.m. | #6
On 10 April 2018 at 20:07, Sudakshina Das <sudi.das@arm.com> wrote:
> Hi Sameera

>

>

> On 10/04/18 11:20, Sameera Deshpande wrote:

>>

>> On 7 April 2018 at 01:25, Christophe Lyon <christophe.lyon@linaro.org>

>> wrote:

>>>

>>> Hi,

>>>

>>> 2018-04-06 12:15 GMT+02:00 Sameera Deshpande

>>> <sameera.deshpande@linaro.org>:

>>>>

>>>> Hi Christophe,

>>>>

>>>> Please find attached the updated patch with testcases.

>>>>

>>>> Ok for trunk?

>>>

>>>

>>> Thanks for the update.

>>>

>>> Since the new intrinsics are only available on aarch64, you want to

>>> prevent the tests from running on arm.

>>> Indeed gcc.target/aarch64/advsimd-intrinsics/ is shared between the two

>>> targets.

>>> There are several examples on how to do that in that directory.

>>>

>>> I have also noticed that the tests fail at execution on aarch64_be.

>>>

>>> I didn't look at the patch in details.

>>>

>>> Christophe

>>>

>>>

>>>>

>>>> - Thanks and regards,

>>>>    Sameera D.

>>>>

>>>> 2017-12-14 22:17 GMT+05:30 Christophe Lyon <christophe.lyon@linaro.org>:

>>>>>

>>>>> 2017-12-14 9:29 GMT+01:00 Sameera Deshpande

>>>>> <sameera.deshpande@linaro.org>:

>>>>>>

>>>>>> Hi!

>>>>>>

>>>>>> Please find attached the patch implementing vld1_*_x3, vst1_*_x2 and

>>>>>> vst1_*_x3 intrinsics as defined by Neon document.

>>>>>>

>>>>>> Ok for trunk?

>>>>>>

>>>>>> - Thanks and regards,

>>>>>>    Sameera D.

>>>>>>

>>>>>> gcc/Changelog:

>>>>>>

>>>>>> 2017-11-14  Sameera Deshpande  <sameera.deshpande@linaro.org>

>>>>>>

>>>>>>

>>>>>>          * config/aarch64/aarch64-simd-builtins.def (ld1x3): New.

>>>>>>          (st1x2): Likewise.

>>>>>>          (st1x3): Likewise.

>>>>>>          * config/aarch64/aarch64-simd.md

>>>>>> (aarch64_ld1x3<VALLDIF:mode>): New pattern.

>>>>>>          (aarch64_ld1_x3_<mode>): Likewise

>>>>>>          (aarch64_st1x2<VALLDIF:mode>): Likewise

>>>>>>          (aarch64_st1_x2_<mode>): Likewise

>>>>>>          (aarch64_st1x3<VALLDIF:mode>): Likewise

>>>>>>          (aarch64_st1_x3_<mode>): Likewise

>>>>>>          * config/aarch64/arm_neon.h (vld1_u8_x3): New function.

>>>>>>          (vld1_s8_x3): Likewise.

>>>>>>          (vld1_u16_x3): Likewise.

>>>>>>          (vld1_s16_x3): Likewise.

>>>>>>          (vld1_u32_x3): Likewise.

>>>>>>          (vld1_s32_x3): Likewise.

>>>>>>          (vld1_u64_x3): Likewise.

>>>>>>          (vld1_s64_x3): Likewise.

>>>>>>          (vld1_fp16_x3): Likewise.

>>>>>>          (vld1_f32_x3): Likewise.

>>>>>>          (vld1_f64_x3): Likewise.

>>>>>>          (vld1_p8_x3): Likewise.

>>>>>>          (vld1_p16_x3): Likewise.

>>>>>>          (vld1_p64_x3): Likewise.

>>>>>>          (vld1q_u8_x3): Likewise.

>>>>>>          (vld1q_s8_x3): Likewise.

>>>>>>          (vld1q_u16_x3): Likewise.

>>>>>>          (vld1q_s16_x3): Likewise.

>>>>>>          (vld1q_u32_x3): Likewise.

>>>>>>          (vld1q_s32_x3): Likewise.

>>>>>>          (vld1q_u64_x3): Likewise.

>>>>>>          (vld1q_s64_x3): Likewise.

>>>>>>          (vld1q_f16_x3): Likewise.

>>>>>>          (vld1q_f32_x3): Likewise.

>>>>>>          (vld1q_f64_x3): Likewise.

>>>>>>          (vld1q_p8_x3): Likewise.

>>>>>>          (vld1q_p16_x3): Likewise.

>>>>>>          (vld1q_p64_x3): Likewise.

>>>>>>          (vst1_s64_x2): Likewise.

>>>>>>          (vst1_u64_x2): Likewise.

>>>>>>          (vst1_f64_x2):

>>>>>> Likewise.patchurl=http://people.linaro.org/~christophe.lyon/armv8_2-fp16-scalar-2.patch3

>>>

>>> patchname=armv8_2-fp16-scalar-2.patch3

>>> refrev=259064

>>> email_to=christophe.lyon@linaro.org

>>>

>>>>>>          (vst1_s8_x2): Likewise.

>>>>>>          (vst1_p8_x2): Likewise.

>>>>>>          (vst1_s16_x2): Likewise.

>>>>>>          (vst1_p16_x2): Likewise.

>>>>>>          (vst1_s32_x2): Likewise.

>>>>>>          (vst1_u8_x2): Likewise.

>>>>>>          (vst1_u16_x2): Likewise.

>>>>>>          (vst1_u32_x2): Likewise.

>>>>>>          (vst1_f16_x2): Likewise.

>>>>>>          (vst1_f32_x2): Likewise.

>>>>>>          (vst1_p64_x2): Likewise.

>>>>>>          (vst1q_s8_x2): Likewise.

>>>>>>          (vst1q_p8_x2): Likewise.

>>>>>>          (vst1q_s16_x2): Likewise.

>>>>>>          (vst1q_p16_x2): Likewise.

>>>>>>          (vst1q_s32_x2): Likewise.

>>>>>>          (vst1q_s64_x2): Likewise.

>>>>>>          (vst1q_u8_x2): Likewise.

>>>>>>          (vst1q_u16_x2): Likewise.

>>>>>>          (vst1q_u32_x2): Likewise.

>>>>>>          (vst1q_u64_x2): Likewise.

>>>>>>          (vst1q_f16_x2): Likewise.

>>>>>>          (vst1q_f32_x2): Likewise.

>>>>>>          (vst1q_f64_x2): Likewise.

>>>>>>          (vst1q_p64_x2): Likewise.

>>>>>>          (vst1_s64_x3): Likewise.

>>>>>>          (vst1_u64_x3): Likewise.

>>>>>>          (vst1_f64_x3): Likewise.

>>>>>>          (vst1_s8_x3): Likewise.

>>>>>>          (vst1_p8_x3): Likewise.

>>>>>>          (vst1_s16_x3): Likewise.

>>>>>>          (vst1_p16_x3): Likewise.

>>>>>>          (vst1_s32_x3): Likewise.

>>>>>>          (vst1_u8_x3): Likewise.

>>>>>>          (vst1_u16_x3): Likewise.

>>>>>>          (vst1_u32_x3): Likewise.

>>>>>>          (vst1_f16_x3): Likewise.

>>>>>>          (vst1_f32_x3): Likewise.

>>>>>>          (vst1_p64_x3): Likewise.

>>>>>>          (vst1q_s8_x3): Likewise.

>>>>>>          (vst1q_p8_x3): Likewise.

>>>>>>          (vst1q_s16_x3): Likewise.

>>>>>>          (vst1q_p16_x3): Likewise.

>>>>>>          (vst1q_s32_x3): Likewise.

>>>>>>          (vst1q_s64_x3): Likewise.

>>>>>>          (vst1q_u8_x3): Likewise.

>>>>>>          (vst1q_u16_x3): Likewise.

>>>>>>          (vst1q_u32_x3): Likewise.

>>>>>>          (vst1q_u64_x3): Likewise.

>>>>>>          (vst1q_f16_x3): Likewise.

>>>>>>          (vst1q_f32_x3): Likewise.

>>>>>>          (vst1q_f64_x3): Likewise.

>>>>>>          (vst1q_p64_x3): Likewise.

>>>>>

>>>>>

>>>>> Hi,

>>>>> I'm not a maintainer, but I suspect you should add some tests.

>>>>>

>>>>> Christophe

>>>>

>>>>

>>>>

>>>>

>>>> --

>>>> - Thanks and regards,

>>>>    Sameera D.

>>

>>

>> Hi Christophe,

>>

>> Please find attached the updated patch. Similar to the testcase

>> vld1x2.c, I have updated the testcases to mark them XFAIL for ARM, as

>> the intrinsics are not implemented yet. I have also added required

>> target to be little endian.

>

>

> I am not a maintainer either. Shouldn't these intrinsics be supported

> even for big endian?

>


Yes, they should be implemented, however it is out of scope of this patch.

> From your patch:

>

>> diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1x3.c

>> b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1x3.c

>

> new file mode 100644

> index 0000000..c37c72c

> --- /dev/null

> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1x3.c

> @@ -0,0 +1,82 @@

> +/* We haven't implemented these intrinsics for arm yet.  */

> +/* { dg-xfail-if "" { arm*-*-* } } */

> +/* { dg-require-effective-target aarch64_little_endian } */

>

> According to

> https://gcc.gnu.org/onlinedocs/gccint/Directives.html#Skip-a-test-for-some-targets

> this must follow the dg-do directive.

>

> Also I think the require-effective-target directive will only allow

> the test to run on aarch64-*-*-* so the xfail on arm-*-*-* is kind

> of not helpful. Maybe something like:

>

> /* { dg-require-effective-target aarch64_little_endian { target {

> aarch64*-*-* } } } */


Ok, then I will restrict this test only for aarch64_little_endian, as
that is the purpose of this test.

>

> So that the check is not performed on arm-*-*-* and the test runs

> and fails and then the xfail makes more sense.

>

> In case the big endian version is also expected to be implemented

> in the future, something like:

>

> /* { dg-xfail-if "" { arm*-*-* || aarch64_big_endian } } */

>

> or

>

> /* { dg-do run { xfail { arm*-*-* || aarch64_big_endian } } } */

>

> would be simpler. (PS: I haven't tested any of these directive myself).

>


Please find attached updated patch.

>

> Thanks

> Sudi

>

>

> +/* { dg-do run } */

> +/* { dg-options "-O3" } */

>

>

>

>> Ok for thrunk?

>>

>> - Thanks and regards,

>>    Sameera D.

>>

>> gcc/Changelog:

>>

>> 2018-04-10  Sameera Deshpande  <sameera.deshpande@linaro.org>

>>

>>

>>          * config/aarch64/aarch64-simd-builtins.def (ld1x3): New.

>>          (st1x2): Likewise.

>>          (st1x3): Likewise.

>>          * config/aarch64/aarch64-simd.md

>>          (aarch64_ld1x3<VALLDIF:mode>): New pattern.

>>          (aarch64_ld1_x3_<mode>): Likewise

>>          (aarch64_st1x2<VALLDIF:mode>): Likewise

>>          (aarch64_st1_x2_<mode>): Likewise

>>          (aarch64_st1x3<VALLDIF:mode>): Likewise

>>          (aarch64_st1_x3_<mode>): Likewise

>>          * config/aarch64/arm_neon.h (vld1_u8_x3): New function.

>>          (vld1_s8_x3): Likewise.

>>          (vld1_u16_x3): Likewise.

>>          (vld1_s16_x3): Likewise.

>>          (vld1_u32_x3): Likewise.

>>          (vld1_s32_x3): Likewise.

>>          (vld1_u64_x3): Likewise.

>>          (vld1_s64_x3): Likewise.

>>          (vld1_f16_x3): Likewise.

>>          (vld1_f32_x3): Likewise.

>>          (vld1_f64_x3): Likewise.

>>          (vld1_p8_x3): Likewise.

>>          (vld1_p16_x3): Likewise.

>>          (vld1_p64_x3): Likewise.

>>          (vld1q_u8_x3): Likewise.

>>          (vld1q_s8_x3): Likewise.

>>          (vld1q_u16_x3): Likewise.

>>          (vld1q_s16_x3): Likewise.

>>          (vld1q_u32_x3): Likewise.

>>          (vld1q_s32_x3): Likewise.

>>          (vld1q_u64_x3): Likewise.

>>          (vld1q_s64_x3): Likewise.

>>          (vld1q_f16_x3): Likewise.

>>          (vld1q_f32_x3): Likewise.

>>          (vld1q_f64_x3): Likewise.

>>          (vld1q_p8_x3): Likewise.

>>          (vld1q_p16_x3): Likewise.

>>          (vld1q_p64_x3): Likewise.

>>          (vst1_s64_x2): Likewise.

>>          (vst1_u64_x2): Likewise.

>>          (vst1_f64_x2): Likewise.

>>          (vst1_s8_x2): Likewise.

>>          (vst1_p8_x2): Likewise.

>>          (vst1_s16_x2): Likewise.

>>          (vst1_p16_x2): Likewise.

>>          (vst1_s32_x2): Likewise.

>>          (vst1_u8_x2): Likewise.

>>          (vst1_u16_x2): Likewise.

>>          (vst1_u32_x2): Likewise.

>>          (vst1_f16_x2): Likewise.

>>          (vst1_f32_x2): Likewise.

>>          (vst1_p64_x2): Likewise.

>>          (vst1q_s8_x2): Likewise.

>>          (vst1q_p8_x2): Likewise.

>>          (vst1q_s16_x2): Likewise.

>>          (vst1q_p16_x2): Likewise.

>>          (vst1q_s32_x2): Likewise.

>>          (vst1q_s64_x2): Likewise.

>>          (vst1q_u8_x2): Likewise.

>>          (vst1q_u16_x2): Likewise.

>>          (vst1q_u32_x2): Likewise.

>>          (vst1q_u64_x2): Likewise.

>>          (vst1q_f16_x2): Likewise.

>>          (vst1q_f32_x2): Likewise.

>>          (vst1q_f64_x2): Likewise.

>>          (vst1q_p64_x2): Likewise.

>>          (vst1_s64_x3): Likewise.

>>          (vst1_u64_x3): Likewise.

>>          (vst1_f64_x3): Likewise.

>>          (vst1_s8_x3): Likewise.

>>          (vst1_p8_x3): Likewise.

>>          (vst1_s16_x3): Likewise.

>>          (vst1_p16_x3): Likewise.

>>          (vst1_s32_x3): Likewise.

>>          (vst1_u8_x3): Likewise.

>>          (vst1_u16_x3): Likewise.

>>          (vst1_u32_x3): Likewise.

>>          (vst1_f16_x3): Likewise.

>>          (vst1_f32_x3): Likewise.

>>          (vst1_p64_x3): Likewise.

>>          (vst1q_s8_x3): Likewise.

>>          (vst1q_p8_x3): Likewise.

>>          (vst1q_s16_x3): Likewise.

>>          (vst1q_p16_x3): Likewise.

>>          (vst1q_s32_x3): Likewise.

>>          (vst1q_s64_x3): Likewise.

>>          (vst1q_u8_x3): Likewise.

>>          (vst1q_u16_x3): Likewise.

>>          (vst1q_u32_x3): Likewise.

>>          (vst1q_u64_x3): Likewise.

>>          (vst1q_f16_x3): Likewise.

>>          (vst1q_f32_x3): Likewise.

>>          (vst1q_f64_x3): Likewise.

>>          (vst1q_p64_x3): Likewise.

>>

>> gcc/testsuite/Changelog:

>>

>> 2018-04-10  Sameera Deshpande  <sameera.deshpande@linaro.org>

>>

>>          * gcc.target/aarch64/advsimd-intrinsics/vld1x3.c: New test for

>> vld1x3 intrinsics for aarch64_little_endian.

>>          * gcc.target/aarch64/advsimd-intrinsics/vst1x2.c: New test for

>> vst1x2 intrinsics for aarch64_little_endian.

>>          * gcc.target/aarch64/advsimd-intrinsics/vst1x3.c: New test for

>> vst1x3 intrinsics for aarch64_little_endian.

>>

>




-- 
- Thanks and regards,
  Sameera D.
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index b383f24..2fd072a 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -445,6 +445,15 @@
   BUILTIN_VALL_F16 (STORE1, st1, 0)
   VAR1(STORE1P, st1, 0, v2di)
 
+  /* Implemented by aarch64_ld1x3<VALLDIF:mode>.  */
+  BUILTIN_VALLDIF (LOADSTRUCT, ld1x3, 0)
+
+  /* Implemented by aarch64_st1x2<VALLDIF:mode>.  */
+  BUILTIN_VALLDIF (STORESTRUCT, st1x2, 0)
+
+  /* Implemented by aarch64_st1x3<VALLDIF:mode>.  */
+  BUILTIN_VALLDIF (STORESTRUCT, st1x3, 0)
+
   /* Implemented by fma<mode>4.  */
   BUILTIN_VHSDF (TERNOP, fma, 4)
   VAR1 (TERNOP, fma, 4, hf)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 3d1f6a0..e197a67 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -5047,6 +5047,70 @@
     }
 })
 
+
+(define_expand "aarch64_ld1x3<VALLDIF:mode>"
+  [(match_operand:CI 0 "register_operand" "=w")
+   (match_operand:DI 1 "register_operand" "r")
+   (unspec:VALLDIF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+  "TARGET_SIMD"
+{
+  rtx mem = gen_rtx_MEM (CImode, operands[1]);
+  emit_insn (gen_aarch64_ld1_x3_<VALLDIF:mode> (operands[0], mem));
+  DONE;
+})
+
+(define_insn "aarch64_ld1_x3_<mode>"
+  [(set (match_operand:CI 0 "register_operand" "=w")
+        (unspec:CI 
+	  [(match_operand:CI 1 "aarch64_simd_struct_operand" "Utv")
+	   (unspec:VALLDIF [(const_int 3)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_LD1))]
+  "TARGET_SIMD"
+  "ld1\\t{%S0.<Vtype> - %U0.<Vtype>}, %1"
+  [(set_attr "type" "neon_load1_3reg<q>")]
+)
+
+(define_expand "aarch64_st1x2<VALLDIF:mode>"
+  [(match_operand:DI 0 "register_operand" "")
+   (match_operand:OI 1 "register_operand" "")
+   (unspec:VALLDIF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+  "TARGET_SIMD"
+{
+  rtx mem = gen_rtx_MEM (OImode, operands[0]);
+  emit_insn (gen_aarch64_st1_x2_<VALLDIF:mode> (mem, operands[1]));
+  DONE;
+})
+
+(define_insn "aarch64_st1_x2_<mode>"
+   [(set (match_operand:OI 0 "aarch64_simd_struct_operand" "=Utv")
+	 (unspec:OI
+	  [(match_operand:OI 1 "register_operand" "w")
+          (unspec:VALLDIF [(const_int 2)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_ST1))]
+  "TARGET_SIMD"
+  "st1\\t{%S1.<Vtype> - %T1.<Vtype>}, %0"
+  [(set_attr "type" "neon_store1_2reg<q>")]
+)
+
+(define_expand "aarch64_st1x3<VALLDIF:mode>"
+  [(match_operand:DI 0 "register_operand" "")
+   (match_operand:CI 1 "register_operand" "")
+   (unspec:VALLDIF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+  "TARGET_SIMD"
+{
+  rtx mem = gen_rtx_MEM (CImode, operands[0]);
+  emit_insn (gen_aarch64_st1_x3_<VALLDIF:mode> (mem, operands[1]));
+  DONE;
+})
+
+(define_insn "aarch64_st1_x3_<mode>"
+   [(set (match_operand:CI 0 "aarch64_simd_struct_operand" "=Utv") 
+	(unspec:CI
+         [(match_operand:CI 1 "register_operand" "w")
+	  (unspec:VALLDIF [(const_int 3)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_ST1))]
+  "TARGET_SIMD"
+  "st1\\t{%S1.<Vtype> - %U1.<Vtype>}, %0"
+  [(set_attr "type" "neon_store1_3reg<q>")]
+)
+
 (define_insn "*aarch64_mov<mode>"
   [(set (match_operand:VSTRUCT 0 "aarch64_simd_nonimmediate_operand" "=w,Utv,w")
 	(match_operand:VSTRUCT 1 "aarch64_simd_general_operand" " w,w,Utv"))]
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index c45c29a..6ac7099 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -17145,6 +17145,374 @@ vld1_u64 (const uint64_t *a)
   return (uint64x1_t) {*a};
 }
 
+/* vld1x3  */
+
+__extension__ extern __inline uint8x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_u8_x3 (const uint8_t *__a)
+{
+  uint8x8x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = (__builtin_aarch64_simd_ci)__builtin_aarch64_ld1x3v8qi ((const __builtin_aarch64_simd_qi *) __a);
+  __i.val[0] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi  (__o, 0);
+  __i.val[1] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi  (__o, 1);
+  __i.val[2] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline int8x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_s8_x3 (const uint8_t *__a)
+{
+  int8x8x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v8qi ((const __builtin_aarch64_simd_qi *) __a);
+  __i.val[0] = (int8x8_t) __builtin_aarch64_get_dregciv8qi  (__o, 0);
+  __i.val[1] = (int8x8_t) __builtin_aarch64_get_dregciv8qi  (__o, 1);
+  __i.val[2] = (int8x8_t) __builtin_aarch64_get_dregciv8qi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline uint16x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_u16_x3 (const uint16_t *__a)
+{
+  uint16x4x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v4hi ((const __builtin_aarch64_simd_hi *) __a);
+  __i.val[0] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi  (__o, 0);
+  __i.val[1] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi  (__o, 1);
+  __i.val[2] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline int16x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_s16_x3 (const int16_t *__a)
+{
+  int16x4x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v4hi ((const __builtin_aarch64_simd_hi *) __a);
+  __i.val[0] = (int16x4_t) __builtin_aarch64_get_dregciv4hi  (__o, 0);
+  __i.val[1] = (int16x4_t) __builtin_aarch64_get_dregciv4hi  (__o, 1);
+  __i.val[2] = (int16x4_t) __builtin_aarch64_get_dregciv4hi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline uint32x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_u32_x3 (const uint32_t *__a)
+{
+  uint32x2x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v2si ((const __builtin_aarch64_simd_si *) __a);
+  __i.val[0] = (uint32x2_t) __builtin_aarch64_get_dregciv2si  (__o, 0);
+  __i.val[1] = (uint32x2_t) __builtin_aarch64_get_dregciv2si  (__o, 1);
+  __i.val[2] = (uint32x2_t) __builtin_aarch64_get_dregciv2si  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline int32x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_s32_x3 (const uint32_t *__a)
+{
+  int32x2x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v2si ((const __builtin_aarch64_simd_si *) __a);
+  __i.val[0] = (int32x2_t) __builtin_aarch64_get_dregciv2si  (__o, 0);
+  __i.val[1] = (int32x2_t) __builtin_aarch64_get_dregciv2si  (__o, 1);
+  __i.val[2] = (int32x2_t) __builtin_aarch64_get_dregciv2si  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline uint64x1x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_u64_x3 (const uint64_t *__a)
+{
+  uint64x1x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3di ((const __builtin_aarch64_simd_di *) __a);
+  __i.val[0] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 0);
+  __i.val[1] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 1);
+  __i.val[2] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline int64x1x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_s64_x3 (const int64_t *__a)
+{
+  int64x1x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3di ((const __builtin_aarch64_simd_di *) __a);
+  __i.val[0] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 0);
+  __i.val[1] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 1);
+  __i.val[2] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 2);
+
+  return __i;
+}
+
+__extension__ extern __inline float16x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_f16_x3 (const float16_t *__a)
+{
+  float16x4x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v4hf ((const __builtin_aarch64_simd_hf *) __a);
+  __i.val[0] = (float16x4_t) __builtin_aarch64_get_dregciv4hf  (__o, 0);
+  __i.val[1] = (float16x4_t) __builtin_aarch64_get_dregciv4hf  (__o, 1);
+  __i.val[2] = (float16x4_t) __builtin_aarch64_get_dregciv4hf  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline float32x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_f32_x3 (const float32_t *__a)
+{
+  float32x2x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v2sf ((const __builtin_aarch64_simd_sf *) __a);
+  __i.val[0] = (float32x2_t) __builtin_aarch64_get_dregciv2sf  (__o, 0);
+  __i.val[1] = (float32x2_t) __builtin_aarch64_get_dregciv2sf  (__o, 1);
+  __i.val[2] = (float32x2_t) __builtin_aarch64_get_dregciv2sf  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline float64x1x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_f64_x3 (const float64_t *__a)
+{
+  float64x1x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3df ((const __builtin_aarch64_simd_df *) __a);
+  __i.val[0] = (float64x1_t) __builtin_aarch64_get_dregcidi (__o, 0);
+  __i.val[1] = (float64x1_t) __builtin_aarch64_get_dregcidi (__o, 1);
+  __i.val[2] = (float64x1_t) __builtin_aarch64_get_dregcidi (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline poly8x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_p8_x3 (const poly8_t *__a)
+{
+  poly8x8x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v8qi ((const __builtin_aarch64_simd_qi *) __a);
+  __i.val[0] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi  (__o, 0);
+  __i.val[1] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi  (__o, 1);
+  __i.val[2] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline poly16x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_p16_x3 (const poly16_t *__a)
+{
+  poly16x4x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v4hi ((const __builtin_aarch64_simd_hi *) __a);
+  __i.val[0] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi  (__o, 0);
+  __i.val[1] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi  (__o, 1);
+  __i.val[2] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline poly64x1x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_p64_x3 (const poly64_t *__a)
+{
+  poly64x1x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3di ((const __builtin_aarch64_simd_di *) __a);
+  __i.val[0] = (poly64x1_t) __builtin_aarch64_get_dregcidi (__o, 0);
+  __i.val[1] = (poly64x1_t) __builtin_aarch64_get_dregcidi (__o, 1);
+  __i.val[2] = (poly64x1_t) __builtin_aarch64_get_dregcidi (__o, 2);
+
+return __i;
+}
+
+__extension__ extern __inline uint8x16x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_u8_x3 (const uint8_t *__a)
+{
+  uint8x16x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v16qi ((const __builtin_aarch64_simd_qi *) __a);
+  __i.val[0] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi  (__o, 0);
+  __i.val[1] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi  (__o, 1);
+  __i.val[2] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline int8x16x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_s8_x3 (const int8_t *__a)
+{
+  int8x16x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v16qi ((const __builtin_aarch64_simd_qi *) __a);
+  __i.val[0] = (int8x16_t) __builtin_aarch64_get_qregciv16qi  (__o, 0);
+  __i.val[1] = (int8x16_t) __builtin_aarch64_get_qregciv16qi  (__o, 1);
+  __i.val[2] = (int8x16_t) __builtin_aarch64_get_qregciv16qi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline uint16x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_u16_x3 (const uint16_t *__a)
+{
+  uint16x8x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v8hi ((const __builtin_aarch64_simd_hi *) __a);
+  __i.val[0] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi  (__o, 0);
+  __i.val[1] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi  (__o, 1);
+  __i.val[2] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline int16x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_s16_x3 (const int16_t *__a)
+{
+  int16x8x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v8hi ((const __builtin_aarch64_simd_hi *) __a);
+  __i.val[0] = (int16x8_t) __builtin_aarch64_get_qregciv8hi  (__o, 0);
+  __i.val[1] = (int16x8_t) __builtin_aarch64_get_qregciv8hi  (__o, 1);
+  __i.val[2] = (int16x8_t) __builtin_aarch64_get_qregciv8hi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline uint32x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_u32_x3 (const uint32_t *__a)
+{
+  uint32x4x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v4si ((const __builtin_aarch64_simd_si *) __a);
+  __i.val[0] = (uint32x4_t) __builtin_aarch64_get_qregciv4si  (__o, 0);
+  __i.val[1] = (uint32x4_t) __builtin_aarch64_get_qregciv4si  (__o, 1);
+  __i.val[2] = (uint32x4_t) __builtin_aarch64_get_qregciv4si  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline int32x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_s32_x3 (const int32_t *__a)
+{
+  int32x4x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v4si ((const __builtin_aarch64_simd_si *) __a);
+  __i.val[0] = (int32x4_t) __builtin_aarch64_get_qregciv4si  (__o, 0);
+  __i.val[1] = (int32x4_t) __builtin_aarch64_get_qregciv4si  (__o, 1);
+  __i.val[2] = (int32x4_t) __builtin_aarch64_get_qregciv4si  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline uint64x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_u64_x3 (const uint64_t *__a)
+{
+  uint64x2x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v2di ((const __builtin_aarch64_simd_di *) __a);
+  __i.val[0] = (uint64x2_t) __builtin_aarch64_get_qregciv2di  (__o, 0);
+  __i.val[1] = (uint64x2_t) __builtin_aarch64_get_qregciv2di  (__o, 1);
+  __i.val[2] = (uint64x2_t) __builtin_aarch64_get_qregciv2di  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline int64x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_s64_x3 (const int64_t *__a)
+{
+  int64x2x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v2di ((const __builtin_aarch64_simd_di *) __a);
+  __i.val[0] = (int64x2_t) __builtin_aarch64_get_qregciv2di  (__o, 0);
+  __i.val[1] = (int64x2_t) __builtin_aarch64_get_qregciv2di  (__o, 1);
+  __i.val[2] = (int64x2_t) __builtin_aarch64_get_qregciv2di  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline float16x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_f16_x3 (const float16_t *__a)
+{
+  float16x8x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v8hf ((const __builtin_aarch64_simd_hf *) __a);
+  __i.val[0] = (float16x8_t) __builtin_aarch64_get_qregciv8hf  (__o, 0);
+  __i.val[1] = (float16x8_t) __builtin_aarch64_get_qregciv8hf  (__o, 1);
+  __i.val[2] = (float16x8_t) __builtin_aarch64_get_qregciv8hf  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline float32x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_f32_x3 (const float32_t *__a)
+{
+  float32x4x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v4sf ((const __builtin_aarch64_simd_sf *) __a);
+  __i.val[0] = (float32x4_t) __builtin_aarch64_get_qregciv4sf  (__o, 0);
+  __i.val[1] = (float32x4_t) __builtin_aarch64_get_qregciv4sf  (__o, 1);
+  __i.val[2] = (float32x4_t) __builtin_aarch64_get_qregciv4sf  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline float64x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_f64_x3 (const float64_t *__a)
+{
+  float64x2x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v2df ((const __builtin_aarch64_simd_df *) __a);
+  __i.val[0] = (float64x2_t) __builtin_aarch64_get_qregciv2df  (__o, 0);
+  __i.val[1] = (float64x2_t) __builtin_aarch64_get_qregciv2df  (__o, 1);
+  __i.val[2] = (float64x2_t) __builtin_aarch64_get_qregciv2df  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline poly8x16x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_p8_x3 (const poly8_t *__a)
+{
+  poly8x16x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v8hi ((const __builtin_aarch64_simd_hi *) __a);
+  __i.val[0] = (poly8x16_t) __builtin_aarch64_get_qregciv8hi  (__o, 0);
+  __i.val[1] = (poly8x16_t) __builtin_aarch64_get_qregciv8hi  (__o, 1);
+  __i.val[2] = (poly8x16_t) __builtin_aarch64_get_qregciv8hi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline poly16x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_p16_x3 (const poly16_t *__a)
+{
+  poly16x8x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v8hi ((const __builtin_aarch64_simd_hi *) __a);
+  __i.val[0] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi  (__o, 0);
+  __i.val[1] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi  (__o, 1);
+  __i.val[2] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline poly64x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_p64_x3 (const poly64_t *__a)
+{
+  poly64x2x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v2di ((const __builtin_aarch64_simd_di *) __a);
+  __i.val[0] = (poly64x2_t) __builtin_aarch64_get_qregciv2di  (__o, 0);
+  __i.val[1] = (poly64x2_t) __builtin_aarch64_get_qregciv2di  (__o, 1);
+  __i.val[2] = (poly64x2_t) __builtin_aarch64_get_qregciv2di  (__o, 2);
+  return __i;
+}
+
 /* vld1q */
 
 __extension__ extern __inline float16x8_t
@@ -27497,6 +27865,706 @@ vst1q_lane_u64 (uint64_t *__a, uint64x2_t __b, const int __lane)
   *__a = __aarch64_vget_lane_any (__b, __lane);
 }
 
+/* vst1x2 */
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s64_x2 (int64_t * __a, int64x1x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  int64x2x2_t temp;
+  temp.val[0] = vcombine_s64 (val.val[0], vcreate_s64 (__AARCH64_INT64_C (0)));
+  temp.val[1] = vcombine_s64 (val.val[1], vcreate_s64 (__AARCH64_INT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u64_x2 (uint64_t * __a, uint64x1x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  uint64x2x2_t temp;
+  temp.val[0] = vcombine_u64 (val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_u64 (val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_f64_x2 (float64_t * __a, float64x1x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  float64x2x2_t temp;
+  temp.val[0] = vcombine_f64 (val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_f64 (val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2df ((__builtin_aarch64_simd_df *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s8_x2 (int8_t * __a, int8x8x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  int8x16x2_t temp;
+  temp.val[0] = vcombine_s8 (val.val[0], vcreate_s8 (__AARCH64_INT64_C (0)));
+  temp.val[1] = vcombine_s8 (val.val[1], vcreate_s8 (__AARCH64_INT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_p8_x2 (poly8_t * __a, poly8x8x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  poly8x16x2_t temp;
+  temp.val[0] = vcombine_p8 (val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_p8 (val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s16_x2 (int16_t * __a, int16x4x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  int16x8x2_t temp;
+  temp.val[0] = vcombine_s16 (val.val[0], vcreate_s16 (__AARCH64_INT64_C (0)));
+  temp.val[1] = vcombine_s16 (val.val[1], vcreate_s16 (__AARCH64_INT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_p16_x2 (poly16_t * __a, poly16x4x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  poly16x8x2_t temp;
+  temp.val[0] = vcombine_p16 (val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_p16 (val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s32_x2 (int32_t * __a, int32x2x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  int32x4x2_t temp;
+  temp.val[0] = vcombine_s32 (val.val[0], vcreate_s32 (__AARCH64_INT64_C (0)));
+  temp.val[1] = vcombine_s32 (val.val[1], vcreate_s32 (__AARCH64_INT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2v2si ((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u8_x2 (uint8_t * __a, uint8x8x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  uint8x16x2_t temp;
+  temp.val[0] = vcombine_u8 (val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_u8 (val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u16_x2 (uint16_t * __a, uint16x4x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  uint16x8x2_t temp;
+  temp.val[0] = vcombine_u16 (val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_u16 (val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u32_x2 (uint32_t * __a, uint32x2x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  uint32x4x2_t temp;
+  temp.val[0] = vcombine_u32 (val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_u32 (val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2v2si ((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_f16_x2 (float16_t * __a, float16x4x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  float16x8x2_t temp;
+  temp.val[0] = vcombine_f16 (val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_f16 (val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv8hf (__o, temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv8hf (__o, temp.val[1], 1);
+  __builtin_aarch64_st1x2v4hf (__a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_f32_x2 (float32_t * __a, float32x2x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  float32x4x2_t temp;
+  temp.val[0] = vcombine_f32 (val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_f32 (val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2v2sf ((__builtin_aarch64_simd_sf *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_p64_x2 (poly64_t * __a, poly64x1x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  poly64x2x2_t temp;
+  temp.val[0] = vcombine_p64 (val.val[0], vcreate_p64 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_p64 (val.val[1], vcreate_p64 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv2di_ssps (__o,
+					       (poly64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2di_ssps (__o,
+					       (poly64x2_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s8_x2 (int8_t * __a, int8x16x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_p8_x2 (poly8_t * __a, poly8x16x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s16_x2 (int16_t * __a, int16x8x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_p16_x2 (poly16_t * __a, poly16x8x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s32_x2 (int32_t * __a, int32x4x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v4si ((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s64_x2 (int64_t * __a, int64x2x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u8_x2 (uint8_t * __a, uint8x16x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u16_x2 (uint16_t * __a, uint16x8x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u32_x2 (uint32_t * __a, uint32x4x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v4si ((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u64_x2 (uint64_t * __a, uint64x2x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_f16_x2 (float16_t * __a, float16x8x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv8hf (__o, val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv8hf (__o, val.val[1], 1);
+  __builtin_aarch64_st1x2v8hf (__a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_f32_x2 (float32_t * __a, float32x4x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v4sf ((__builtin_aarch64_simd_sf *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_f64_x2 (float64_t * __a, float64x2x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v2df ((__builtin_aarch64_simd_df *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_p64_x2 (poly64_t * __a, poly64x2x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv2di_ssps (__o,
+					       (poly64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2di_ssps (__o,
+					       (poly64x2_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+/* vst1x3 */
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s64_x3 (int64_t * __a, int64x1x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  int64x2x3_t temp;
+  temp.val[0] = vcombine_s64 (val.val[0], vcreate_s64 (__AARCH64_INT64_C (0)));
+  temp.val[1] = vcombine_s64 (val.val[1], vcreate_s64 (__AARCH64_INT64_C (0)));
+  temp.val[2] = vcombine_s64 (val.val[2], vcreate_s64 (__AARCH64_INT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u64_x3 (uint64_t * __a, uint64x1x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  uint64x2x3_t temp;
+  temp.val[0] = vcombine_u64 (val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_u64 (val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_u64 (val.val[2], vcreate_u64 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_f64_x3 (float64_t * __a, float64x1x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  float64x2x3_t temp;
+  temp.val[0] = vcombine_f64 (val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_f64 (val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_f64 (val.val[2], vcreate_f64 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3df ((__builtin_aarch64_simd_df *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s8_x3 (int8_t * __a, int8x8x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  int8x16x3_t temp;
+  temp.val[0] = vcombine_s8 (val.val[0], vcreate_s8 (__AARCH64_INT64_C (0)));
+  temp.val[1] = vcombine_s8 (val.val[1], vcreate_s8 (__AARCH64_INT64_C (0)));
+  temp.val[2] = vcombine_s8 (val.val[2], vcreate_s8 (__AARCH64_INT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_p8_x3 (poly8_t * __a, poly8x8x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  poly8x16x3_t temp;
+  temp.val[0] = vcombine_p8 (val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_p8 (val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_p8 (val.val[2], vcreate_p8 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s16_x3 (int16_t * __a, int16x4x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  int16x8x3_t temp;
+  temp.val[0] = vcombine_s16 (val.val[0], vcreate_s16 (__AARCH64_INT64_C (0)));
+  temp.val[1] = vcombine_s16 (val.val[1], vcreate_s16 (__AARCH64_INT64_C (0)));
+  temp.val[2] = vcombine_s16 (val.val[2], vcreate_s16 (__AARCH64_INT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_p16_x3 (poly16_t * __a, poly16x4x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  poly16x8x3_t temp;
+  temp.val[0] = vcombine_p16 (val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_p16 (val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_p16 (val.val[2], vcreate_p16 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s32_x3 (int32_t * __a, int32x2x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  int32x4x3_t temp;
+  temp.val[0] = vcombine_s32 (val.val[0], vcreate_s32 (__AARCH64_INT64_C (0)));
+  temp.val[1] = vcombine_s32 (val.val[1], vcreate_s32 (__AARCH64_INT64_C (0)));
+  temp.val[2] = vcombine_s32 (val.val[2], vcreate_s32 (__AARCH64_INT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3v2si ((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u8_x3 (uint8_t * __a, uint8x8x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  uint8x16x3_t temp;
+  temp.val[0] = vcombine_u8 (val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_u8 (val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_u8 (val.val[2], vcreate_u8 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u16_x3 (uint16_t * __a, uint16x4x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  uint16x8x3_t temp;
+  temp.val[0] = vcombine_u16 (val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_u16 (val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_u16 (val.val[2], vcreate_u16 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u32_x3 (uint32_t * __a, uint32x2x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  uint32x4x3_t temp;
+  temp.val[0] = vcombine_u32 (val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_u32 (val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_u32 (val.val[2], vcreate_u32 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3v2si ((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_f16_x3 (float16_t * __a, float16x4x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  float16x8x3_t temp;
+  temp.val[0] = vcombine_f16 (val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_f16 (val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_f16 (val.val[2], vcreate_f16 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3v4hf ((__builtin_aarch64_simd_hf *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_f32_x3 (float32_t * __a, float32x2x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  float32x4x3_t temp;
+  temp.val[0] = vcombine_f32 (val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_f32 (val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_f32 (val.val[2], vcreate_f32 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3v2sf ((__builtin_aarch64_simd_sf *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_p64_x3 (poly64_t * __a, poly64x1x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  poly64x2x3_t temp;
+  temp.val[0] = vcombine_p64 (val.val[0], vcreate_p64 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_p64 (val.val[1], vcreate_p64 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_p64 (val.val[2], vcreate_p64 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
+					       (poly64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
+					       (poly64x2_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
+					       (poly64x2_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s8_x3 (int8_t * __a, int8x16x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_p8_x3 (poly8_t * __a, poly8x16x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s16_x3 (int16_t * __a, int16x8x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_p16_x3 (poly16_t * __a, poly16x8x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s32_x3 (int32_t * __a, int32x4x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v4si ((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s64_x3 (int64_t * __a, int64x2x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u8_x3 (uint8_t * __a, uint8x16x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u16_x3 (uint16_t * __a, uint16x8x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u32_x3 (uint32_t * __a, uint32x4x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v4si ((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u64_x3 (uint64_t * __a, uint64x2x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_f16_x3 (float16_t * __a, float16x8x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v8hf ((__builtin_aarch64_simd_hf *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_f32_x3 (float32_t * __a, float32x4x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v4sf ((__builtin_aarch64_simd_sf *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_f64_x3 (float64_t * __a, float64x2x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v2df ((__builtin_aarch64_simd_df *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_p64_x3 (poly64_t * __a, poly64x2x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
+					       (poly64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
+					       (poly64x2_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
+					       (poly64x2_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
 /* vstn */
 
 __extension__ extern __inline void
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1x3.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1x3.c
new file mode 100644
index 0000000..c37c72c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1x3.c
@@ -0,0 +1,82 @@
+/* We haven't implemented these intrinsics for arm yet.  */
+/* { dg-require-effective-target aarch64_little_endian } */
+/* { dg-do run } */
+/* { dg-options "-O3" } */
+
+#include <arm_neon.h>
+
+extern void abort (void);
+
+#define TESTMETH(BASE, ELTS, SUFFIX)	\
+int __attribute__ ((noinline))			\
+test_vld##SUFFIX##_x3 ()			\
+{						\
+  BASE##_t data[ELTS * 3];			\
+  BASE##_t temp[ELTS * 3];			\
+  BASE##x##ELTS##x##3##_t vectors;		\
+  int i,j;					\
+  for (i = 0; i < ELTS * 3; i++)		\
+    data [i] = (BASE##_t) 3*i;		\
+  asm volatile ("" : : : "memory");		\
+  vectors = vld1##SUFFIX##_x3 (data);		\
+  vst1##SUFFIX (temp, vectors.val[0]);		\
+  vst1##SUFFIX (&temp[ELTS], vectors.val[1]);	\
+  vst1##SUFFIX (&temp[ELTS * 2], vectors.val[2]);	\
+  asm volatile ("" : : : "memory");		\
+  for (j = 0; j < ELTS * 3; j++)		\
+    if (temp[j] != data[j]) 			\
+      return 1;					\
+  return 0;					\
+}
+
+#define VARIANTS_1(VARIANT)	\
+VARIANT (uint8, 8, _u8)		\
+VARIANT (uint16, 4, _u16)	\
+VARIANT (uint32, 2, _u32)	\
+VARIANT (uint64, 1, _u64)	\
+VARIANT (int8, 8, _s8)		\
+VARIANT (int16, 4, _s16)	\
+VARIANT (int32, 2, _s32)	\
+VARIANT (int64, 1, _s64)	\
+VARIANT (poly8, 8, _p8)		\
+VARIANT (poly16, 4, _p16)	\
+VARIANT (float16, 4, _f16)	\
+VARIANT (float32, 2, _f32)	\
+VARIANT (uint8, 16, q_u8)	\
+VARIANT (uint16, 8, q_u16)	\
+VARIANT (uint32, 4, q_u32)	\
+VARIANT (uint64, 2, q_u64)	\
+VARIANT (int8, 16, q_s8)	\
+VARIANT (int16, 8, q_s16)	\
+VARIANT (int32, 4, q_s32)	\
+VARIANT (int64, 2, q_s64)	\
+VARIANT (poly8, 16, q_p8)	\
+VARIANT (poly16, 8, q_p16)	\
+VARIANT (float16, 8, q_f16)	\
+VARIANT (float32, 4, q_f32)
+
+#ifdef __aarch64__
+#define VARIANTS(VARIANT) VARIANTS_1(VARIANT)	\
+VARIANT (float64, 1, _f64)			\
+VARIANT (float64, 2, q_f64)
+#else
+#define VARIANTS(VARIANT) VARIANTS_1(VARIANT)
+#endif
+
+
+/* Tests of vld1_x3 and vld1q_x3.  */
+VARIANTS (TESTMETH)
+
+#define CHECK(BASE, ELTS, SUFFIX)	\
+  if (test_vld##SUFFIX##_x3 () != 0)	\
+    abort ();
+
+int
+main (int argc, char **argv)
+{
+  VARIANTS (CHECK)
+
+  return 0;
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1x2.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1x2.c
new file mode 100644
index 0000000..3b6797c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1x2.c
@@ -0,0 +1,80 @@
+/* We haven't implemented these intrinsics for arm yet.  */
+/* { dg-require-effective-target aarch64_little_endian } */
+/* { dg-do run } */
+/* { dg-options "-O3" } */
+
+#include <arm_neon.h>
+
+extern void abort (void);
+
+#define TESTMETH(BASE, ELTS, SUFFIX)	\
+int __attribute__ ((noinline))			\
+test_vst1##SUFFIX##_x2 ()			\
+{						\
+  BASE##_t data[ELTS * 2];			\
+  BASE##_t temp[ELTS * 2];			\
+  BASE##x##ELTS##x##2##_t vectors;		\
+  int i,j;					\
+  for (i = 0; i < ELTS * 2; i++)		\
+    data [i] = (BASE##_t) 2*i;		\
+  asm volatile ("" : : : "memory");		\
+  vectors.val[0] = vld1##SUFFIX (data);		\
+  vectors.val[1] = vld1##SUFFIX (&data[ELTS]);	\
+  vst1##SUFFIX##_x2 (temp, vectors);		\
+  asm volatile ("" : : : "memory");		\
+  for (j = 0; j < ELTS * 2; j++)		\
+    if (temp[j] != data[j])			\
+      return 1;					\
+  return 0;					\
+}
+
+#define VARIANTS_1(VARIANT)	\
+VARIANT (uint8, 8, _u8)		\
+VARIANT (uint16, 4, _u16)	\
+VARIANT (uint32, 2, _u32)	\
+VARIANT (uint64, 1, _u64)	\
+VARIANT (int8, 8, _s8)		\
+VARIANT (int16, 4, _s16)	\
+VARIANT (int32, 2, _s32)	\
+VARIANT (int64, 1, _s64)	\
+VARIANT (poly8, 8, _p8)		\
+VARIANT (poly16, 4, _p16)	\
+VARIANT (float16, 4, _f16)	\
+VARIANT (float32, 2, _f32)	\
+VARIANT (uint8, 16, q_u8)	\
+VARIANT (uint16, 8, q_u16)	\
+VARIANT (uint32, 4, q_u32)	\
+VARIANT (uint64, 2, q_u64)	\
+VARIANT (int8, 16, q_s8)	\
+VARIANT (int16, 8, q_s16)	\
+VARIANT (int32, 4, q_s32)	\
+VARIANT (int64, 2, q_s64)	\
+VARIANT (poly8, 16, q_p8)	\
+VARIANT (poly16, 8, q_p16)	\
+VARIANT (float16, 8, q_f16)	\
+VARIANT (float32, 4, q_f32)
+
+#ifdef __aarch64__
+#define VARIANTS(VARIANT) VARIANTS_1(VARIANT)	\
+VARIANT (float64, 1, _f64)			\
+VARIANT (float64, 2, q_f64)
+#else
+#define VARIANTS(VARIANT) VARIANTS_1(VARIANT)
+#endif
+
+/* Tests of vst1_x2 and vst1q_x2.  */
+VARIANTS (TESTMETH)
+
+#define CHECK(BASE, ELTS, SUFFIX)	\
+  if (test_vst1##SUFFIX##_x2 () != 0)	\
+    abort ();
+
+int
+main (int argc, char **argv)
+{
+  VARIANTS (CHECK)
+
+  return 0;
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1x3.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1x3.c
new file mode 100644
index 0000000..1709115
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1x3.c
@@ -0,0 +1,81 @@
+/* We haven't implemented these intrinsics for arm yet.  */
+/* { dg-require-effective-target aarch64_little_endian } */
+/* { dg-do run } */
+/* { dg-options "-O3" } */
+
+#include <arm_neon.h>
+
+extern void abort (void);
+
+#define TESTMETH(BASE, ELTS, SUFFIX)	\
+int __attribute__ ((noinline))			\
+test_vst1##SUFFIX##_x3 ()			\
+{						\
+  BASE##_t data[ELTS * 3];			\
+  BASE##_t temp[ELTS * 3];			\
+  BASE##x##ELTS##x##3##_t vectors;		\
+  int i,j;					\
+  for (i = 0; i < ELTS * 3; i++)		\
+    data [i] = (BASE##_t) 3*i;		\
+  asm volatile ("" : : : "memory");		\
+  vectors.val[0] = vld1##SUFFIX (data);		\
+  vectors.val[1] = vld1##SUFFIX (&data[ELTS]);	\
+  vectors.val[2] = vld1##SUFFIX (&data[ELTS * 2]);	\
+  vst1##SUFFIX##_x3 (temp, vectors);		\
+  asm volatile ("" : : : "memory");		\
+  for (j = 0; j < ELTS * 3; j++)		\
+    if (temp[j] != data[j])			\
+      return 1;					\
+  return 0;					\
+}
+
+#define VARIANTS_1(VARIANT)	\
+VARIANT (uint8, 8, _u8)		\
+VARIANT (uint16, 4, _u16)	\
+VARIANT (uint32, 2, _u32)	\
+VARIANT (uint64, 1, _u64)	\
+VARIANT (int8, 8, _s8)		\
+VARIANT (int16, 4, _s16)	\
+VARIANT (int32, 2, _s32)	\
+VARIANT (int64, 1, _s64)	\
+VARIANT (poly8, 8, _p8)		\
+VARIANT (poly16, 4, _p16)	\
+VARIANT (float16, 4, _f16)	\
+VARIANT (float32, 2, _f32)	\
+VARIANT (uint8, 16, q_u8)	\
+VARIANT (uint16, 8, q_u16)	\
+VARIANT (uint32, 4, q_u32)	\
+VARIANT (uint64, 2, q_u64)	\
+VARIANT (int8, 16, q_s8)	\
+VARIANT (int16, 8, q_s16)	\
+VARIANT (int32, 4, q_s32)	\
+VARIANT (int64, 2, q_s64)	\
+VARIANT (poly8, 16, q_p8)	\
+VARIANT (poly16, 8, q_p16)	\
+VARIANT (float16, 8, q_f16)	\
+VARIANT (float32, 4, q_f32)
+
+#ifdef __aarch64__
+#define VARIANTS(VARIANT) VARIANTS_1(VARIANT)	\
+VARIANT (float64, 1, _f64)			\
+VARIANT (float64, 2, q_f64)
+#else
+#define VARIANTS(VARIANT) VARIANTS_1(VARIANT)
+#endif
+
+/* Tests of vst1_x3 and vst1q_x3.  */
+VARIANTS (TESTMETH)
+
+#define CHECK(BASE, ELTS, SUFFIX)	\
+  if (test_vst1##SUFFIX##_x3 () != 0)	\
+    abort ();
+
+int
+main (int argc, char **argv)
+{
+  VARIANTS (CHECK)
+
+  return 0;
+}
+
Sudakshina Das April 11, 2018, 10:23 a.m. | #7
Hi Sameera

On 11/04/18 09:04, Sameera Deshpande wrote:
> On 10 April 2018 at 20:07, Sudakshina Das <sudi.das@arm.com> wrote:

>> Hi Sameera

>>

>>

>> On 10/04/18 11:20, Sameera Deshpande wrote:

>>>

>>> On 7 April 2018 at 01:25, Christophe Lyon <christophe.lyon@linaro.org>

>>> wrote:

>>>>

>>>> Hi,

>>>>

>>>> 2018-04-06 12:15 GMT+02:00 Sameera Deshpande

>>>> <sameera.deshpande@linaro.org>:

>>>>>

>>>>> Hi Christophe,

>>>>>

>>>>> Please find attached the updated patch with testcases.

>>>>>

>>>>> Ok for trunk?

>>>>

>>>>

>>>> Thanks for the update.

>>>>

>>>> Since the new intrinsics are only available on aarch64, you want to

>>>> prevent the tests from running on arm.

>>>> Indeed gcc.target/aarch64/advsimd-intrinsics/ is shared between the two

>>>> targets.

>>>> There are several examples on how to do that in that directory.

>>>>

>>>> I have also noticed that the tests fail at execution on aarch64_be.

>>>>

>>>> I didn't look at the patch in details.

>>>>

>>>> Christophe

>>>>

>>>>

>>>>>

>>>>> - Thanks and regards,

>>>>>     Sameera D.

>>>>>

>>>>> 2017-12-14 22:17 GMT+05:30 Christophe Lyon <christophe.lyon@linaro.org>:

>>>>>>

>>>>>> 2017-12-14 9:29 GMT+01:00 Sameera Deshpande

>>>>>> <sameera.deshpande@linaro.org>:

>>>>>>>

>>>>>>> Hi!

>>>>>>>

>>>>>>> Please find attached the patch implementing vld1_*_x3, vst1_*_x2 and

>>>>>>> vst1_*_x3 intrinsics as defined by Neon document.

>>>>>>>

>>>>>>> Ok for trunk?

>>>>>>>

>>>>>>> - Thanks and regards,

>>>>>>>     Sameera D.

>>>>>>>

>>>>>>> gcc/Changelog:

>>>>>>>

>>>>>>> 2017-11-14  Sameera Deshpande  <sameera.deshpande@linaro.org>

>>>>>>>

>>>>>>>

>>>>>>>           * config/aarch64/aarch64-simd-builtins.def (ld1x3): New.

>>>>>>>           (st1x2): Likewise.

>>>>>>>           (st1x3): Likewise.

>>>>>>>           * config/aarch64/aarch64-simd.md

>>>>>>> (aarch64_ld1x3<VALLDIF:mode>): New pattern.

>>>>>>>           (aarch64_ld1_x3_<mode>): Likewise

>>>>>>>           (aarch64_st1x2<VALLDIF:mode>): Likewise

>>>>>>>           (aarch64_st1_x2_<mode>): Likewise

>>>>>>>           (aarch64_st1x3<VALLDIF:mode>): Likewise

>>>>>>>           (aarch64_st1_x3_<mode>): Likewise

>>>>>>>           * config/aarch64/arm_neon.h (vld1_u8_x3): New function.

>>>>>>>           (vld1_s8_x3): Likewise.

>>>>>>>           (vld1_u16_x3): Likewise.

>>>>>>>           (vld1_s16_x3): Likewise.

>>>>>>>           (vld1_u32_x3): Likewise.

>>>>>>>           (vld1_s32_x3): Likewise.

>>>>>>>           (vld1_u64_x3): Likewise.

>>>>>>>           (vld1_s64_x3): Likewise.

>>>>>>>           (vld1_fp16_x3): Likewise.

>>>>>>>           (vld1_f32_x3): Likewise.

>>>>>>>           (vld1_f64_x3): Likewise.

>>>>>>>           (vld1_p8_x3): Likewise.

>>>>>>>           (vld1_p16_x3): Likewise.

>>>>>>>           (vld1_p64_x3): Likewise.

>>>>>>>           (vld1q_u8_x3): Likewise.

>>>>>>>           (vld1q_s8_x3): Likewise.

>>>>>>>           (vld1q_u16_x3): Likewise.

>>>>>>>           (vld1q_s16_x3): Likewise.

>>>>>>>           (vld1q_u32_x3): Likewise.

>>>>>>>           (vld1q_s32_x3): Likewise.

>>>>>>>           (vld1q_u64_x3): Likewise.

>>>>>>>           (vld1q_s64_x3): Likewise.

>>>>>>>           (vld1q_f16_x3): Likewise.

>>>>>>>           (vld1q_f32_x3): Likewise.

>>>>>>>           (vld1q_f64_x3): Likewise.

>>>>>>>           (vld1q_p8_x3): Likewise.

>>>>>>>           (vld1q_p16_x3): Likewise.

>>>>>>>           (vld1q_p64_x3): Likewise.

>>>>>>>           (vst1_s64_x2): Likewise.

>>>>>>>           (vst1_u64_x2): Likewise.

>>>>>>>           (vst1_f64_x2):

>>>>>>> Likewise.patchurl=http://people.linaro.org/~christophe.lyon/armv8_2-fp16-scalar-2.patch3

>>>>

>>>> patchname=armv8_2-fp16-scalar-2.patch3

>>>> refrev=259064

>>>> email_to=christophe.lyon@linaro.org

>>>>

>>>>>>>           (vst1_s8_x2): Likewise.

>>>>>>>           (vst1_p8_x2): Likewise.

>>>>>>>           (vst1_s16_x2): Likewise.

>>>>>>>           (vst1_p16_x2): Likewise.

>>>>>>>           (vst1_s32_x2): Likewise.

>>>>>>>           (vst1_u8_x2): Likewise.

>>>>>>>           (vst1_u16_x2): Likewise.

>>>>>>>           (vst1_u32_x2): Likewise.

>>>>>>>           (vst1_f16_x2): Likewise.

>>>>>>>           (vst1_f32_x2): Likewise.

>>>>>>>           (vst1_p64_x2): Likewise.

>>>>>>>           (vst1q_s8_x2): Likewise.

>>>>>>>           (vst1q_p8_x2): Likewise.

>>>>>>>           (vst1q_s16_x2): Likewise.

>>>>>>>           (vst1q_p16_x2): Likewise.

>>>>>>>           (vst1q_s32_x2): Likewise.

>>>>>>>           (vst1q_s64_x2): Likewise.

>>>>>>>           (vst1q_u8_x2): Likewise.

>>>>>>>           (vst1q_u16_x2): Likewise.

>>>>>>>           (vst1q_u32_x2): Likewise.

>>>>>>>           (vst1q_u64_x2): Likewise.

>>>>>>>           (vst1q_f16_x2): Likewise.

>>>>>>>           (vst1q_f32_x2): Likewise.

>>>>>>>           (vst1q_f64_x2): Likewise.

>>>>>>>           (vst1q_p64_x2): Likewise.

>>>>>>>           (vst1_s64_x3): Likewise.

>>>>>>>           (vst1_u64_x3): Likewise.

>>>>>>>           (vst1_f64_x3): Likewise.

>>>>>>>           (vst1_s8_x3): Likewise.

>>>>>>>           (vst1_p8_x3): Likewise.

>>>>>>>           (vst1_s16_x3): Likewise.

>>>>>>>           (vst1_p16_x3): Likewise.

>>>>>>>           (vst1_s32_x3): Likewise.

>>>>>>>           (vst1_u8_x3): Likewise.

>>>>>>>           (vst1_u16_x3): Likewise.

>>>>>>>           (vst1_u32_x3): Likewise.

>>>>>>>           (vst1_f16_x3): Likewise.

>>>>>>>           (vst1_f32_x3): Likewise.

>>>>>>>           (vst1_p64_x3): Likewise.

>>>>>>>           (vst1q_s8_x3): Likewise.

>>>>>>>           (vst1q_p8_x3): Likewise.

>>>>>>>           (vst1q_s16_x3): Likewise.

>>>>>>>           (vst1q_p16_x3): Likewise.

>>>>>>>           (vst1q_s32_x3): Likewise.

>>>>>>>           (vst1q_s64_x3): Likewise.

>>>>>>>           (vst1q_u8_x3): Likewise.

>>>>>>>           (vst1q_u16_x3): Likewise.

>>>>>>>           (vst1q_u32_x3): Likewise.

>>>>>>>           (vst1q_u64_x3): Likewise.

>>>>>>>           (vst1q_f16_x3): Likewise.

>>>>>>>           (vst1q_f32_x3): Likewise.

>>>>>>>           (vst1q_f64_x3): Likewise.

>>>>>>>           (vst1q_p64_x3): Likewise.

>>>>>>

>>>>>>

>>>>>> Hi,

>>>>>> I'm not a maintainer, but I suspect you should add some tests.

>>>>>>

>>>>>> Christophe

>>>>>

>>>>>

>>>>>

>>>>>

>>>>> --

>>>>> - Thanks and regards,

>>>>>     Sameera D.

>>>

>>>

>>> Hi Christophe,

>>>

>>> Please find attached the updated patch. Similar to the testcase

>>> vld1x2.c, I have updated the testcases to mark them XFAIL for ARM, as

>>> the intrinsics are not implemented yet. I have also added required

>>> target to be little endian.

>>

>>

>> I am not a maintainer either. Shouldn't these intrinsics be supported

>> even for big endian?

>>

> 

> Yes, they should be implemented, however it is out of scope of this patch.

> 

>>  From your patch:

>>

>>> diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1x3.c

>>> b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1x3.c

>>

>> new file mode 100644

>> index 0000000..c37c72c

>> --- /dev/null

>> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1x3.c

>> @@ -0,0 +1,82 @@

>> +/* We haven't implemented these intrinsics for arm yet.  */

>> +/* { dg-xfail-if "" { arm*-*-* } } */

>> +/* { dg-require-effective-target aarch64_little_endian } */

>>

>> According to

>> https://gcc.gnu.org/onlinedocs/gccint/Directives.html#Skip-a-test-for-some-targets

>> this must follow the dg-do directive.

>>

>> Also I think the require-effective-target directive will only allow

>> the test to run on aarch64-*-*-* so the xfail on arm-*-*-* is kind

>> of not helpful. Maybe something like:

>>

>> /* { dg-require-effective-target aarch64_little_endian { target {

>> aarch64*-*-* } } } */

> 

> Ok, then I will restrict this test only for aarch64_little_endian, as

> that is the purpose of this test.

> 

>>

>> So that the check is not performed on arm-*-*-* and the test runs

>> and fails and then the xfail makes more sense.

>>

>> In case the big endian version is also expected to be implemented

>> in the future, something like:

>>

>> /* { dg-xfail-if "" { arm*-*-* || aarch64_big_endian } } */

>>

>> or

>>

>> /* { dg-do run { xfail { arm*-*-* || aarch64_big_endian } } } */

>>

>> would be simpler. (PS: I haven't tested any of these directive myself).

>>

> 

> Please find attached updated patch.


Thank you for making the edits. One last nit from my side, as I
mentioned earlier the dg-require-effective directive should go after
the dg-do directive as mentioned here:
https://gcc.gnu.org/onlinedocs/gccint/Directives.html#Skip-a-test-for-some-targets

Thanks
Sudi

> 

>>

>> Thanks

>> Sudi

>>

>>

>> +/* { dg-do run } */

>> +/* { dg-options "-O3" } */

>>

>>

>>

>>> Ok for thrunk?

>>>

>>> - Thanks and regards,

>>>     Sameera D.

>>>

>>> gcc/Changelog:

>>>

>>> 2018-04-10  Sameera Deshpande  <sameera.deshpande@linaro.org>

>>>

>>>

>>>           * config/aarch64/aarch64-simd-builtins.def (ld1x3): New.

>>>           (st1x2): Likewise.

>>>           (st1x3): Likewise.

>>>           * config/aarch64/aarch64-simd.md

>>>           (aarch64_ld1x3<VALLDIF:mode>): New pattern.

>>>           (aarch64_ld1_x3_<mode>): Likewise

>>>           (aarch64_st1x2<VALLDIF:mode>): Likewise

>>>           (aarch64_st1_x2_<mode>): Likewise

>>>           (aarch64_st1x3<VALLDIF:mode>): Likewise

>>>           (aarch64_st1_x3_<mode>): Likewise

>>>           * config/aarch64/arm_neon.h (vld1_u8_x3): New function.

>>>           (vld1_s8_x3): Likewise.

>>>           (vld1_u16_x3): Likewise.

>>>           (vld1_s16_x3): Likewise.

>>>           (vld1_u32_x3): Likewise.

>>>           (vld1_s32_x3): Likewise.

>>>           (vld1_u64_x3): Likewise.

>>>           (vld1_s64_x3): Likewise.

>>>           (vld1_f16_x3): Likewise.

>>>           (vld1_f32_x3): Likewise.

>>>           (vld1_f64_x3): Likewise.

>>>           (vld1_p8_x3): Likewise.

>>>           (vld1_p16_x3): Likewise.

>>>           (vld1_p64_x3): Likewise.

>>>           (vld1q_u8_x3): Likewise.

>>>           (vld1q_s8_x3): Likewise.

>>>           (vld1q_u16_x3): Likewise.

>>>           (vld1q_s16_x3): Likewise.

>>>           (vld1q_u32_x3): Likewise.

>>>           (vld1q_s32_x3): Likewise.

>>>           (vld1q_u64_x3): Likewise.

>>>           (vld1q_s64_x3): Likewise.

>>>           (vld1q_f16_x3): Likewise.

>>>           (vld1q_f32_x3): Likewise.

>>>           (vld1q_f64_x3): Likewise.

>>>           (vld1q_p8_x3): Likewise.

>>>           (vld1q_p16_x3): Likewise.

>>>           (vld1q_p64_x3): Likewise.

>>>           (vst1_s64_x2): Likewise.

>>>           (vst1_u64_x2): Likewise.

>>>           (vst1_f64_x2): Likewise.

>>>           (vst1_s8_x2): Likewise.

>>>           (vst1_p8_x2): Likewise.

>>>           (vst1_s16_x2): Likewise.

>>>           (vst1_p16_x2): Likewise.

>>>           (vst1_s32_x2): Likewise.

>>>           (vst1_u8_x2): Likewise.

>>>           (vst1_u16_x2): Likewise.

>>>           (vst1_u32_x2): Likewise.

>>>           (vst1_f16_x2): Likewise.

>>>           (vst1_f32_x2): Likewise.

>>>           (vst1_p64_x2): Likewise.

>>>           (vst1q_s8_x2): Likewise.

>>>           (vst1q_p8_x2): Likewise.

>>>           (vst1q_s16_x2): Likewise.

>>>           (vst1q_p16_x2): Likewise.

>>>           (vst1q_s32_x2): Likewise.

>>>           (vst1q_s64_x2): Likewise.

>>>           (vst1q_u8_x2): Likewise.

>>>           (vst1q_u16_x2): Likewise.

>>>           (vst1q_u32_x2): Likewise.

>>>           (vst1q_u64_x2): Likewise.

>>>           (vst1q_f16_x2): Likewise.

>>>           (vst1q_f32_x2): Likewise.

>>>           (vst1q_f64_x2): Likewise.

>>>           (vst1q_p64_x2): Likewise.

>>>           (vst1_s64_x3): Likewise.

>>>           (vst1_u64_x3): Likewise.

>>>           (vst1_f64_x3): Likewise.

>>>           (vst1_s8_x3): Likewise.

>>>           (vst1_p8_x3): Likewise.

>>>           (vst1_s16_x3): Likewise.

>>>           (vst1_p16_x3): Likewise.

>>>           (vst1_s32_x3): Likewise.

>>>           (vst1_u8_x3): Likewise.

>>>           (vst1_u16_x3): Likewise.

>>>           (vst1_u32_x3): Likewise.

>>>           (vst1_f16_x3): Likewise.

>>>           (vst1_f32_x3): Likewise.

>>>           (vst1_p64_x3): Likewise.

>>>           (vst1q_s8_x3): Likewise.

>>>           (vst1q_p8_x3): Likewise.

>>>           (vst1q_s16_x3): Likewise.

>>>           (vst1q_p16_x3): Likewise.

>>>           (vst1q_s32_x3): Likewise.

>>>           (vst1q_s64_x3): Likewise.

>>>           (vst1q_u8_x3): Likewise.

>>>           (vst1q_u16_x3): Likewise.

>>>           (vst1q_u32_x3): Likewise.

>>>           (vst1q_u64_x3): Likewise.

>>>           (vst1q_f16_x3): Likewise.

>>>           (vst1q_f32_x3): Likewise.

>>>           (vst1q_f64_x3): Likewise.

>>>           (vst1q_p64_x3): Likewise.

>>>

>>> gcc/testsuite/Changelog:

>>>

>>> 2018-04-10  Sameera Deshpande  <sameera.deshpande@linaro.org>

>>>

>>>           * gcc.target/aarch64/advsimd-intrinsics/vld1x3.c: New test for

>>> vld1x3 intrinsics for aarch64_little_endian.

>>>           * gcc.target/aarch64/advsimd-intrinsics/vst1x2.c: New test for

>>> vst1x2 intrinsics for aarch64_little_endian.

>>>           * gcc.target/aarch64/advsimd-intrinsics/vst1x3.c: New test for

>>> vst1x3 intrinsics for aarch64_little_endian.

>>>

>>

> 

> 

>
Sameera Deshpande April 11, 2018, 12:05 p.m. | #8
On 11 April 2018 at 15:53, Sudakshina Das <sudi.das@arm.com> wrote:
> Hi Sameera

>

>

> On 11/04/18 09:04, Sameera Deshpande wrote:

>>

>> On 10 April 2018 at 20:07, Sudakshina Das <sudi.das@arm.com> wrote:

>>>

>>> Hi Sameera

>>>

>>>

>>> On 10/04/18 11:20, Sameera Deshpande wrote:

>>>>

>>>>

>>>> On 7 April 2018 at 01:25, Christophe Lyon <christophe.lyon@linaro.org>

>>>> wrote:

>>>>>

>>>>>

>>>>> Hi,

>>>>>

>>>>> 2018-04-06 12:15 GMT+02:00 Sameera Deshpande

>>>>> <sameera.deshpande@linaro.org>:

>>>>>>

>>>>>>

>>>>>> Hi Christophe,

>>>>>>

>>>>>> Please find attached the updated patch with testcases.

>>>>>>

>>>>>> Ok for trunk?

>>>>>

>>>>>

>>>>>

>>>>> Thanks for the update.

>>>>>

>>>>> Since the new intrinsics are only available on aarch64, you want to

>>>>> prevent the tests from running on arm.

>>>>> Indeed gcc.target/aarch64/advsimd-intrinsics/ is shared between the two

>>>>> targets.

>>>>> There are several examples on how to do that in that directory.

>>>>>

>>>>> I have also noticed that the tests fail at execution on aarch64_be.

>>>>>

>>>>> I didn't look at the patch in details.

>>>>>

>>>>> Christophe

>>>>>

>>>>>

>>>>>>

>>>>>> - Thanks and regards,

>>>>>>     Sameera D.

>>>>>>

>>>>>> 2017-12-14 22:17 GMT+05:30 Christophe Lyon

>>>>>> <christophe.lyon@linaro.org>:

>>>>>>>

>>>>>>>

>>>>>>> 2017-12-14 9:29 GMT+01:00 Sameera Deshpande

>>>>>>> <sameera.deshpande@linaro.org>:

>>>>>>>>

>>>>>>>>

>>>>>>>> Hi!

>>>>>>>>

>>>>>>>> Please find attached the patch implementing vld1_*_x3, vst1_*_x2 and

>>>>>>>> vst1_*_x3 intrinsics as defined by Neon document.

>>>>>>>>

>>>>>>>> Ok for trunk?

>>>>>>>>

>>>>>>>> - Thanks and regards,

>>>>>>>>     Sameera D.

>>>>>>>>

>>>>>>>> gcc/Changelog:

>>>>>>>>

>>>>>>>> 2017-11-14  Sameera Deshpande  <sameera.deshpande@linaro.org>

>>>>>>>>

>>>>>>>>

>>>>>>>>           * config/aarch64/aarch64-simd-builtins.def (ld1x3): New.

>>>>>>>>           (st1x2): Likewise.

>>>>>>>>           (st1x3): Likewise.

>>>>>>>>           * config/aarch64/aarch64-simd.md

>>>>>>>> (aarch64_ld1x3<VALLDIF:mode>): New pattern.

>>>>>>>>           (aarch64_ld1_x3_<mode>): Likewise

>>>>>>>>           (aarch64_st1x2<VALLDIF:mode>): Likewise

>>>>>>>>           (aarch64_st1_x2_<mode>): Likewise

>>>>>>>>           (aarch64_st1x3<VALLDIF:mode>): Likewise

>>>>>>>>           (aarch64_st1_x3_<mode>): Likewise

>>>>>>>>           * config/aarch64/arm_neon.h (vld1_u8_x3): New function.

>>>>>>>>           (vld1_s8_x3): Likewise.

>>>>>>>>           (vld1_u16_x3): Likewise.

>>>>>>>>           (vld1_s16_x3): Likewise.

>>>>>>>>           (vld1_u32_x3): Likewise.

>>>>>>>>           (vld1_s32_x3): Likewise.

>>>>>>>>           (vld1_u64_x3): Likewise.

>>>>>>>>           (vld1_s64_x3): Likewise.

>>>>>>>>           (vld1_fp16_x3): Likewise.

>>>>>>>>           (vld1_f32_x3): Likewise.

>>>>>>>>           (vld1_f64_x3): Likewise.

>>>>>>>>           (vld1_p8_x3): Likewise.

>>>>>>>>           (vld1_p16_x3): Likewise.

>>>>>>>>           (vld1_p64_x3): Likewise.

>>>>>>>>           (vld1q_u8_x3): Likewise.

>>>>>>>>           (vld1q_s8_x3): Likewise.

>>>>>>>>           (vld1q_u16_x3): Likewise.

>>>>>>>>           (vld1q_s16_x3): Likewise.

>>>>>>>>           (vld1q_u32_x3): Likewise.

>>>>>>>>           (vld1q_s32_x3): Likewise.

>>>>>>>>           (vld1q_u64_x3): Likewise.

>>>>>>>>           (vld1q_s64_x3): Likewise.

>>>>>>>>           (vld1q_f16_x3): Likewise.

>>>>>>>>           (vld1q_f32_x3): Likewise.

>>>>>>>>           (vld1q_f64_x3): Likewise.

>>>>>>>>           (vld1q_p8_x3): Likewise.

>>>>>>>>           (vld1q_p16_x3): Likewise.

>>>>>>>>           (vld1q_p64_x3): Likewise.

>>>>>>>>           (vst1_s64_x2): Likewise.

>>>>>>>>           (vst1_u64_x2): Likewise.

>>>>>>>>           (vst1_f64_x2):

>>>>>>>>

>>>>>>>> Likewise.patchurl=http://people.linaro.org/~christophe.lyon/armv8_2-fp16-scalar-2.patch3

>>>>>

>>>>>

>>>>> patchname=armv8_2-fp16-scalar-2.patch3

>>>>> refrev=259064

>>>>> email_to=christophe.lyon@linaro.org

>>>>>

>>>>>>>>           (vst1_s8_x2): Likewise.

>>>>>>>>           (vst1_p8_x2): Likewise.

>>>>>>>>           (vst1_s16_x2): Likewise.

>>>>>>>>           (vst1_p16_x2): Likewise.

>>>>>>>>           (vst1_s32_x2): Likewise.

>>>>>>>>           (vst1_u8_x2): Likewise.

>>>>>>>>           (vst1_u16_x2): Likewise.

>>>>>>>>           (vst1_u32_x2): Likewise.

>>>>>>>>           (vst1_f16_x2): Likewise.

>>>>>>>>           (vst1_f32_x2): Likewise.

>>>>>>>>           (vst1_p64_x2): Likewise.

>>>>>>>>           (vst1q_s8_x2): Likewise.

>>>>>>>>           (vst1q_p8_x2): Likewise.

>>>>>>>>           (vst1q_s16_x2): Likewise.

>>>>>>>>           (vst1q_p16_x2): Likewise.

>>>>>>>>           (vst1q_s32_x2): Likewise.

>>>>>>>>           (vst1q_s64_x2): Likewise.

>>>>>>>>           (vst1q_u8_x2): Likewise.

>>>>>>>>           (vst1q_u16_x2): Likewise.

>>>>>>>>           (vst1q_u32_x2): Likewise.

>>>>>>>>           (vst1q_u64_x2): Likewise.

>>>>>>>>           (vst1q_f16_x2): Likewise.

>>>>>>>>           (vst1q_f32_x2): Likewise.

>>>>>>>>           (vst1q_f64_x2): Likewise.

>>>>>>>>           (vst1q_p64_x2): Likewise.

>>>>>>>>           (vst1_s64_x3): Likewise.

>>>>>>>>           (vst1_u64_x3): Likewise.

>>>>>>>>           (vst1_f64_x3): Likewise.

>>>>>>>>           (vst1_s8_x3): Likewise.

>>>>>>>>           (vst1_p8_x3): Likewise.

>>>>>>>>           (vst1_s16_x3): Likewise.

>>>>>>>>           (vst1_p16_x3): Likewise.

>>>>>>>>           (vst1_s32_x3): Likewise.

>>>>>>>>           (vst1_u8_x3): Likewise.

>>>>>>>>           (vst1_u16_x3): Likewise.

>>>>>>>>           (vst1_u32_x3): Likewise.

>>>>>>>>           (vst1_f16_x3): Likewise.

>>>>>>>>           (vst1_f32_x3): Likewise.

>>>>>>>>           (vst1_p64_x3): Likewise.

>>>>>>>>           (vst1q_s8_x3): Likewise.

>>>>>>>>           (vst1q_p8_x3): Likewise.

>>>>>>>>           (vst1q_s16_x3): Likewise.

>>>>>>>>           (vst1q_p16_x3): Likewise.

>>>>>>>>           (vst1q_s32_x3): Likewise.

>>>>>>>>           (vst1q_s64_x3): Likewise.

>>>>>>>>           (vst1q_u8_x3): Likewise.

>>>>>>>>           (vst1q_u16_x3): Likewise.

>>>>>>>>           (vst1q_u32_x3): Likewise.

>>>>>>>>           (vst1q_u64_x3): Likewise.

>>>>>>>>           (vst1q_f16_x3): Likewise.

>>>>>>>>           (vst1q_f32_x3): Likewise.

>>>>>>>>           (vst1q_f64_x3): Likewise.

>>>>>>>>           (vst1q_p64_x3): Likewise.

>>>>>>>

>>>>>>>

>>>>>>>

>>>>>>> Hi,

>>>>>>> I'm not a maintainer, but I suspect you should add some tests.

>>>>>>>

>>>>>>> Christophe

>>>>>>

>>>>>>

>>>>>>

>>>>>>

>>>>>>

>>>>>> --

>>>>>> - Thanks and regards,

>>>>>>     Sameera D.

>>>>

>>>>

>>>>

>>>> Hi Christophe,

>>>>

>>>> Please find attached the updated patch. Similar to the testcase

>>>> vld1x2.c, I have updated the testcases to mark them XFAIL for ARM, as

>>>> the intrinsics are not implemented yet. I have also added required

>>>> target to be little endian.

>>>

>>>

>>>

>>> I am not a maintainer either. Shouldn't these intrinsics be supported

>>> even for big endian?

>>>

>>

>> Yes, they should be implemented, however it is out of scope of this patch.

>>

>>>  From your patch:

>>>

>>>> diff --git

>>>> a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1x3.c

>>>> b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1x3.c

>>>

>>>

>>> new file mode 100644

>>> index 0000000..c37c72c

>>> --- /dev/null

>>> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1x3.c

>>> @@ -0,0 +1,82 @@

>>> +/* We haven't implemented these intrinsics for arm yet.  */

>>> +/* { dg-xfail-if "" { arm*-*-* } } */

>>> +/* { dg-require-effective-target aarch64_little_endian } */

>>>

>>> According to

>>>

>>> https://gcc.gnu.org/onlinedocs/gccint/Directives.html#Skip-a-test-for-some-targets

>>> this must follow the dg-do directive.

>>>

>>> Also I think the require-effective-target directive will only allow

>>> the test to run on aarch64-*-*-* so the xfail on arm-*-*-* is kind

>>> of not helpful. Maybe something like:

>>>

>>> /* { dg-require-effective-target aarch64_little_endian { target {

>>> aarch64*-*-* } } } */

>>

>>

>> Ok, then I will restrict this test only for aarch64_little_endian, as

>> that is the purpose of this test.

>>

>>>

>>> So that the check is not performed on arm-*-*-* and the test runs

>>> and fails and then the xfail makes more sense.

>>>

>>> In case the big endian version is also expected to be implemented

>>> in the future, something like:

>>>

>>> /* { dg-xfail-if "" { arm*-*-* || aarch64_big_endian } } */

>>>

>>> or

>>>

>>> /* { dg-do run { xfail { arm*-*-* || aarch64_big_endian } } } */

>>>

>>> would be simpler. (PS: I haven't tested any of these directive myself).

>>>

>>

>> Please find attached updated patch.

>

>

> Thank you for making the edits. One last nit from my side, as I

> mentioned earlier the dg-require-effective directive should go after

> the dg-do directive as mentioned here:

> https://gcc.gnu.org/onlinedocs/gccint/Directives.html#Skip-a-test-for-some-targets

>


Thanks for pointing that out.
Please find attached the updated patch.

> Thanks

> Sudi

>

>

>>

>>>

>>> Thanks

>>> Sudi

>>>

>>>

>>> +/* { dg-do run } */

>>> +/* { dg-options "-O3" } */

>>>

>>>

>>>

>>>> Ok for thrunk?

>>>>

>>>> - Thanks and regards,

>>>>     Sameera D.

>>>>

>>>> gcc/Changelog:

>>>>

>>>> 2018-04-10  Sameera Deshpande  <sameera.deshpande@linaro.org>

>>>>

>>>>

>>>>           * config/aarch64/aarch64-simd-builtins.def (ld1x3): New.

>>>>           (st1x2): Likewise.

>>>>           (st1x3): Likewise.

>>>>           * config/aarch64/aarch64-simd.md

>>>>           (aarch64_ld1x3<VALLDIF:mode>): New pattern.

>>>>           (aarch64_ld1_x3_<mode>): Likewise

>>>>           (aarch64_st1x2<VALLDIF:mode>): Likewise

>>>>           (aarch64_st1_x2_<mode>): Likewise

>>>>           (aarch64_st1x3<VALLDIF:mode>): Likewise

>>>>           (aarch64_st1_x3_<mode>): Likewise

>>>>           * config/aarch64/arm_neon.h (vld1_u8_x3): New function.

>>>>           (vld1_s8_x3): Likewise.

>>>>           (vld1_u16_x3): Likewise.

>>>>           (vld1_s16_x3): Likewise.

>>>>           (vld1_u32_x3): Likewise.

>>>>           (vld1_s32_x3): Likewise.

>>>>           (vld1_u64_x3): Likewise.

>>>>           (vld1_s64_x3): Likewise.

>>>>           (vld1_f16_x3): Likewise.

>>>>           (vld1_f32_x3): Likewise.

>>>>           (vld1_f64_x3): Likewise.

>>>>           (vld1_p8_x3): Likewise.

>>>>           (vld1_p16_x3): Likewise.

>>>>           (vld1_p64_x3): Likewise.

>>>>           (vld1q_u8_x3): Likewise.

>>>>           (vld1q_s8_x3): Likewise.

>>>>           (vld1q_u16_x3): Likewise.

>>>>           (vld1q_s16_x3): Likewise.

>>>>           (vld1q_u32_x3): Likewise.

>>>>           (vld1q_s32_x3): Likewise.

>>>>           (vld1q_u64_x3): Likewise.

>>>>           (vld1q_s64_x3): Likewise.

>>>>           (vld1q_f16_x3): Likewise.

>>>>           (vld1q_f32_x3): Likewise.

>>>>           (vld1q_f64_x3): Likewise.

>>>>           (vld1q_p8_x3): Likewise.

>>>>           (vld1q_p16_x3): Likewise.

>>>>           (vld1q_p64_x3): Likewise.

>>>>           (vst1_s64_x2): Likewise.

>>>>           (vst1_u64_x2): Likewise.

>>>>           (vst1_f64_x2): Likewise.

>>>>           (vst1_s8_x2): Likewise.

>>>>           (vst1_p8_x2): Likewise.

>>>>           (vst1_s16_x2): Likewise.

>>>>           (vst1_p16_x2): Likewise.

>>>>           (vst1_s32_x2): Likewise.

>>>>           (vst1_u8_x2): Likewise.

>>>>           (vst1_u16_x2): Likewise.

>>>>           (vst1_u32_x2): Likewise.

>>>>           (vst1_f16_x2): Likewise.

>>>>           (vst1_f32_x2): Likewise.

>>>>           (vst1_p64_x2): Likewise.

>>>>           (vst1q_s8_x2): Likewise.

>>>>           (vst1q_p8_x2): Likewise.

>>>>           (vst1q_s16_x2): Likewise.

>>>>           (vst1q_p16_x2): Likewise.

>>>>           (vst1q_s32_x2): Likewise.

>>>>           (vst1q_s64_x2): Likewise.

>>>>           (vst1q_u8_x2): Likewise.

>>>>           (vst1q_u16_x2): Likewise.

>>>>           (vst1q_u32_x2): Likewise.

>>>>           (vst1q_u64_x2): Likewise.

>>>>           (vst1q_f16_x2): Likewise.

>>>>           (vst1q_f32_x2): Likewise.

>>>>           (vst1q_f64_x2): Likewise.

>>>>           (vst1q_p64_x2): Likewise.

>>>>           (vst1_s64_x3): Likewise.

>>>>           (vst1_u64_x3): Likewise.

>>>>           (vst1_f64_x3): Likewise.

>>>>           (vst1_s8_x3): Likewise.

>>>>           (vst1_p8_x3): Likewise.

>>>>           (vst1_s16_x3): Likewise.

>>>>           (vst1_p16_x3): Likewise.

>>>>           (vst1_s32_x3): Likewise.

>>>>           (vst1_u8_x3): Likewise.

>>>>           (vst1_u16_x3): Likewise.

>>>>           (vst1_u32_x3): Likewise.

>>>>           (vst1_f16_x3): Likewise.

>>>>           (vst1_f32_x3): Likewise.

>>>>           (vst1_p64_x3): Likewise.

>>>>           (vst1q_s8_x3): Likewise.

>>>>           (vst1q_p8_x3): Likewise.

>>>>           (vst1q_s16_x3): Likewise.

>>>>           (vst1q_p16_x3): Likewise.

>>>>           (vst1q_s32_x3): Likewise.

>>>>           (vst1q_s64_x3): Likewise.

>>>>           (vst1q_u8_x3): Likewise.

>>>>           (vst1q_u16_x3): Likewise.

>>>>           (vst1q_u32_x3): Likewise.

>>>>           (vst1q_u64_x3): Likewise.

>>>>           (vst1q_f16_x3): Likewise.

>>>>           (vst1q_f32_x3): Likewise.

>>>>           (vst1q_f64_x3): Likewise.

>>>>           (vst1q_p64_x3): Likewise.

>>>>

>>>> gcc/testsuite/Changelog:

>>>>

>>>> 2018-04-10  Sameera Deshpande  <sameera.deshpande@linaro.org>

>>>>

>>>>           * gcc.target/aarch64/advsimd-intrinsics/vld1x3.c: New test for

>>>> vld1x3 intrinsics for aarch64_little_endian.

>>>>           * gcc.target/aarch64/advsimd-intrinsics/vst1x2.c: New test for

>>>> vst1x2 intrinsics for aarch64_little_endian.

>>>>           * gcc.target/aarch64/advsimd-intrinsics/vst1x3.c: New test for

>>>> vst1x3 intrinsics for aarch64_little_endian.

>>>>

>>>

>>

>>

>>

>




-- 
- Thanks and regards,
  Sameera D.
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index b383f24..2fd072a 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -445,6 +445,15 @@
   BUILTIN_VALL_F16 (STORE1, st1, 0)
   VAR1(STORE1P, st1, 0, v2di)
 
+  /* Implemented by aarch64_ld1x3<VALLDIF:mode>.  */
+  BUILTIN_VALLDIF (LOADSTRUCT, ld1x3, 0)
+
+  /* Implemented by aarch64_st1x2<VALLDIF:mode>.  */
+  BUILTIN_VALLDIF (STORESTRUCT, st1x2, 0)
+
+  /* Implemented by aarch64_st1x3<VALLDIF:mode>.  */
+  BUILTIN_VALLDIF (STORESTRUCT, st1x3, 0)
+
   /* Implemented by fma<mode>4.  */
   BUILTIN_VHSDF (TERNOP, fma, 4)
   VAR1 (TERNOP, fma, 4, hf)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 3d1f6a0..e197a67 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -5047,6 +5047,70 @@
     }
 })
 
+
+(define_expand "aarch64_ld1x3<VALLDIF:mode>"
+  [(match_operand:CI 0 "register_operand" "=w")
+   (match_operand:DI 1 "register_operand" "r")
+   (unspec:VALLDIF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+  "TARGET_SIMD"
+{
+  rtx mem = gen_rtx_MEM (CImode, operands[1]);
+  emit_insn (gen_aarch64_ld1_x3_<VALLDIF:mode> (operands[0], mem));
+  DONE;
+})
+
+(define_insn "aarch64_ld1_x3_<mode>"
+  [(set (match_operand:CI 0 "register_operand" "=w")
+        (unspec:CI 
+	  [(match_operand:CI 1 "aarch64_simd_struct_operand" "Utv")
+	   (unspec:VALLDIF [(const_int 3)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_LD1))]
+  "TARGET_SIMD"
+  "ld1\\t{%S0.<Vtype> - %U0.<Vtype>}, %1"
+  [(set_attr "type" "neon_load1_3reg<q>")]
+)
+
+(define_expand "aarch64_st1x2<VALLDIF:mode>"
+  [(match_operand:DI 0 "register_operand" "")
+   (match_operand:OI 1 "register_operand" "")
+   (unspec:VALLDIF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+  "TARGET_SIMD"
+{
+  rtx mem = gen_rtx_MEM (OImode, operands[0]);
+  emit_insn (gen_aarch64_st1_x2_<VALLDIF:mode> (mem, operands[1]));
+  DONE;
+})
+
+(define_insn "aarch64_st1_x2_<mode>"
+   [(set (match_operand:OI 0 "aarch64_simd_struct_operand" "=Utv")
+	 (unspec:OI
+	  [(match_operand:OI 1 "register_operand" "w")
+          (unspec:VALLDIF [(const_int 2)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_ST1))]
+  "TARGET_SIMD"
+  "st1\\t{%S1.<Vtype> - %T1.<Vtype>}, %0"
+  [(set_attr "type" "neon_store1_2reg<q>")]
+)
+
+(define_expand "aarch64_st1x3<VALLDIF:mode>"
+  [(match_operand:DI 0 "register_operand" "")
+   (match_operand:CI 1 "register_operand" "")
+   (unspec:VALLDIF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+  "TARGET_SIMD"
+{
+  rtx mem = gen_rtx_MEM (CImode, operands[0]);
+  emit_insn (gen_aarch64_st1_x3_<VALLDIF:mode> (mem, operands[1]));
+  DONE;
+})
+
+(define_insn "aarch64_st1_x3_<mode>"
+   [(set (match_operand:CI 0 "aarch64_simd_struct_operand" "=Utv") 
+	(unspec:CI
+         [(match_operand:CI 1 "register_operand" "w")
+	  (unspec:VALLDIF [(const_int 3)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_ST1))]
+  "TARGET_SIMD"
+  "st1\\t{%S1.<Vtype> - %U1.<Vtype>}, %0"
+  [(set_attr "type" "neon_store1_3reg<q>")]
+)
+
 (define_insn "*aarch64_mov<mode>"
   [(set (match_operand:VSTRUCT 0 "aarch64_simd_nonimmediate_operand" "=w,Utv,w")
 	(match_operand:VSTRUCT 1 "aarch64_simd_general_operand" " w,w,Utv"))]
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index c45c29a..6ac7099 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -17145,6 +17145,374 @@ vld1_u64 (const uint64_t *a)
   return (uint64x1_t) {*a};
 }
 
+/* vld1x3  */
+
+__extension__ extern __inline uint8x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_u8_x3 (const uint8_t *__a)
+{
+  uint8x8x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = (__builtin_aarch64_simd_ci)__builtin_aarch64_ld1x3v8qi ((const __builtin_aarch64_simd_qi *) __a);
+  __i.val[0] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi  (__o, 0);
+  __i.val[1] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi  (__o, 1);
+  __i.val[2] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline int8x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_s8_x3 (const uint8_t *__a)
+{
+  int8x8x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v8qi ((const __builtin_aarch64_simd_qi *) __a);
+  __i.val[0] = (int8x8_t) __builtin_aarch64_get_dregciv8qi  (__o, 0);
+  __i.val[1] = (int8x8_t) __builtin_aarch64_get_dregciv8qi  (__o, 1);
+  __i.val[2] = (int8x8_t) __builtin_aarch64_get_dregciv8qi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline uint16x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_u16_x3 (const uint16_t *__a)
+{
+  uint16x4x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v4hi ((const __builtin_aarch64_simd_hi *) __a);
+  __i.val[0] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi  (__o, 0);
+  __i.val[1] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi  (__o, 1);
+  __i.val[2] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline int16x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_s16_x3 (const int16_t *__a)
+{
+  int16x4x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v4hi ((const __builtin_aarch64_simd_hi *) __a);
+  __i.val[0] = (int16x4_t) __builtin_aarch64_get_dregciv4hi  (__o, 0);
+  __i.val[1] = (int16x4_t) __builtin_aarch64_get_dregciv4hi  (__o, 1);
+  __i.val[2] = (int16x4_t) __builtin_aarch64_get_dregciv4hi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline uint32x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_u32_x3 (const uint32_t *__a)
+{
+  uint32x2x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v2si ((const __builtin_aarch64_simd_si *) __a);
+  __i.val[0] = (uint32x2_t) __builtin_aarch64_get_dregciv2si  (__o, 0);
+  __i.val[1] = (uint32x2_t) __builtin_aarch64_get_dregciv2si  (__o, 1);
+  __i.val[2] = (uint32x2_t) __builtin_aarch64_get_dregciv2si  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline int32x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_s32_x3 (const uint32_t *__a)
+{
+  int32x2x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v2si ((const __builtin_aarch64_simd_si *) __a);
+  __i.val[0] = (int32x2_t) __builtin_aarch64_get_dregciv2si  (__o, 0);
+  __i.val[1] = (int32x2_t) __builtin_aarch64_get_dregciv2si  (__o, 1);
+  __i.val[2] = (int32x2_t) __builtin_aarch64_get_dregciv2si  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline uint64x1x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_u64_x3 (const uint64_t *__a)
+{
+  uint64x1x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3di ((const __builtin_aarch64_simd_di *) __a);
+  __i.val[0] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 0);
+  __i.val[1] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 1);
+  __i.val[2] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline int64x1x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_s64_x3 (const int64_t *__a)
+{
+  int64x1x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3di ((const __builtin_aarch64_simd_di *) __a);
+  __i.val[0] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 0);
+  __i.val[1] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 1);
+  __i.val[2] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 2);
+
+  return __i;
+}
+
+__extension__ extern __inline float16x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_f16_x3 (const float16_t *__a)
+{
+  float16x4x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v4hf ((const __builtin_aarch64_simd_hf *) __a);
+  __i.val[0] = (float16x4_t) __builtin_aarch64_get_dregciv4hf  (__o, 0);
+  __i.val[1] = (float16x4_t) __builtin_aarch64_get_dregciv4hf  (__o, 1);
+  __i.val[2] = (float16x4_t) __builtin_aarch64_get_dregciv4hf  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline float32x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_f32_x3 (const float32_t *__a)
+{
+  float32x2x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v2sf ((const __builtin_aarch64_simd_sf *) __a);
+  __i.val[0] = (float32x2_t) __builtin_aarch64_get_dregciv2sf  (__o, 0);
+  __i.val[1] = (float32x2_t) __builtin_aarch64_get_dregciv2sf  (__o, 1);
+  __i.val[2] = (float32x2_t) __builtin_aarch64_get_dregciv2sf  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline float64x1x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_f64_x3 (const float64_t *__a)
+{
+  float64x1x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3df ((const __builtin_aarch64_simd_df *) __a);
+  __i.val[0] = (float64x1_t) __builtin_aarch64_get_dregcidi (__o, 0);
+  __i.val[1] = (float64x1_t) __builtin_aarch64_get_dregcidi (__o, 1);
+  __i.val[2] = (float64x1_t) __builtin_aarch64_get_dregcidi (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline poly8x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_p8_x3 (const poly8_t *__a)
+{
+  poly8x8x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v8qi ((const __builtin_aarch64_simd_qi *) __a);
+  __i.val[0] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi  (__o, 0);
+  __i.val[1] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi  (__o, 1);
+  __i.val[2] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline poly16x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_p16_x3 (const poly16_t *__a)
+{
+  poly16x4x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v4hi ((const __builtin_aarch64_simd_hi *) __a);
+  __i.val[0] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi  (__o, 0);
+  __i.val[1] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi  (__o, 1);
+  __i.val[2] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline poly64x1x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_p64_x3 (const poly64_t *__a)
+{
+  poly64x1x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3di ((const __builtin_aarch64_simd_di *) __a);
+  __i.val[0] = (poly64x1_t) __builtin_aarch64_get_dregcidi (__o, 0);
+  __i.val[1] = (poly64x1_t) __builtin_aarch64_get_dregcidi (__o, 1);
+  __i.val[2] = (poly64x1_t) __builtin_aarch64_get_dregcidi (__o, 2);
+
+return __i;
+}
+
+__extension__ extern __inline uint8x16x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_u8_x3 (const uint8_t *__a)
+{
+  uint8x16x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v16qi ((const __builtin_aarch64_simd_qi *) __a);
+  __i.val[0] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi  (__o, 0);
+  __i.val[1] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi  (__o, 1);
+  __i.val[2] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline int8x16x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_s8_x3 (const int8_t *__a)
+{
+  int8x16x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v16qi ((const __builtin_aarch64_simd_qi *) __a);
+  __i.val[0] = (int8x16_t) __builtin_aarch64_get_qregciv16qi  (__o, 0);
+  __i.val[1] = (int8x16_t) __builtin_aarch64_get_qregciv16qi  (__o, 1);
+  __i.val[2] = (int8x16_t) __builtin_aarch64_get_qregciv16qi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline uint16x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_u16_x3 (const uint16_t *__a)
+{
+  uint16x8x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v8hi ((const __builtin_aarch64_simd_hi *) __a);
+  __i.val[0] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi  (__o, 0);
+  __i.val[1] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi  (__o, 1);
+  __i.val[2] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline int16x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_s16_x3 (const int16_t *__a)
+{
+  int16x8x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v8hi ((const __builtin_aarch64_simd_hi *) __a);
+  __i.val[0] = (int16x8_t) __builtin_aarch64_get_qregciv8hi  (__o, 0);
+  __i.val[1] = (int16x8_t) __builtin_aarch64_get_qregciv8hi  (__o, 1);
+  __i.val[2] = (int16x8_t) __builtin_aarch64_get_qregciv8hi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline uint32x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_u32_x3 (const uint32_t *__a)
+{
+  uint32x4x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v4si ((const __builtin_aarch64_simd_si *) __a);
+  __i.val[0] = (uint32x4_t) __builtin_aarch64_get_qregciv4si  (__o, 0);
+  __i.val[1] = (uint32x4_t) __builtin_aarch64_get_qregciv4si  (__o, 1);
+  __i.val[2] = (uint32x4_t) __builtin_aarch64_get_qregciv4si  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline int32x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_s32_x3 (const int32_t *__a)
+{
+  int32x4x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v4si ((const __builtin_aarch64_simd_si *) __a);
+  __i.val[0] = (int32x4_t) __builtin_aarch64_get_qregciv4si  (__o, 0);
+  __i.val[1] = (int32x4_t) __builtin_aarch64_get_qregciv4si  (__o, 1);
+  __i.val[2] = (int32x4_t) __builtin_aarch64_get_qregciv4si  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline uint64x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_u64_x3 (const uint64_t *__a)
+{
+  uint64x2x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v2di ((const __builtin_aarch64_simd_di *) __a);
+  __i.val[0] = (uint64x2_t) __builtin_aarch64_get_qregciv2di  (__o, 0);
+  __i.val[1] = (uint64x2_t) __builtin_aarch64_get_qregciv2di  (__o, 1);
+  __i.val[2] = (uint64x2_t) __builtin_aarch64_get_qregciv2di  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline int64x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_s64_x3 (const int64_t *__a)
+{
+  int64x2x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v2di ((const __builtin_aarch64_simd_di *) __a);
+  __i.val[0] = (int64x2_t) __builtin_aarch64_get_qregciv2di  (__o, 0);
+  __i.val[1] = (int64x2_t) __builtin_aarch64_get_qregciv2di  (__o, 1);
+  __i.val[2] = (int64x2_t) __builtin_aarch64_get_qregciv2di  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline float16x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_f16_x3 (const float16_t *__a)
+{
+  float16x8x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v8hf ((const __builtin_aarch64_simd_hf *) __a);
+  __i.val[0] = (float16x8_t) __builtin_aarch64_get_qregciv8hf  (__o, 0);
+  __i.val[1] = (float16x8_t) __builtin_aarch64_get_qregciv8hf  (__o, 1);
+  __i.val[2] = (float16x8_t) __builtin_aarch64_get_qregciv8hf  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline float32x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_f32_x3 (const float32_t *__a)
+{
+  float32x4x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v4sf ((const __builtin_aarch64_simd_sf *) __a);
+  __i.val[0] = (float32x4_t) __builtin_aarch64_get_qregciv4sf  (__o, 0);
+  __i.val[1] = (float32x4_t) __builtin_aarch64_get_qregciv4sf  (__o, 1);
+  __i.val[2] = (float32x4_t) __builtin_aarch64_get_qregciv4sf  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline float64x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_f64_x3 (const float64_t *__a)
+{
+  float64x2x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v2df ((const __builtin_aarch64_simd_df *) __a);
+  __i.val[0] = (float64x2_t) __builtin_aarch64_get_qregciv2df  (__o, 0);
+  __i.val[1] = (float64x2_t) __builtin_aarch64_get_qregciv2df  (__o, 1);
+  __i.val[2] = (float64x2_t) __builtin_aarch64_get_qregciv2df  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline poly8x16x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_p8_x3 (const poly8_t *__a)
+{
+  poly8x16x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v8hi ((const __builtin_aarch64_simd_hi *) __a);
+  __i.val[0] = (poly8x16_t) __builtin_aarch64_get_qregciv8hi  (__o, 0);
+  __i.val[1] = (poly8x16_t) __builtin_aarch64_get_qregciv8hi  (__o, 1);
+  __i.val[2] = (poly8x16_t) __builtin_aarch64_get_qregciv8hi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline poly16x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_p16_x3 (const poly16_t *__a)
+{
+  poly16x8x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v8hi ((const __builtin_aarch64_simd_hi *) __a);
+  __i.val[0] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi  (__o, 0);
+  __i.val[1] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi  (__o, 1);
+  __i.val[2] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline poly64x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_p64_x3 (const poly64_t *__a)
+{
+  poly64x2x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v2di ((const __builtin_aarch64_simd_di *) __a);
+  __i.val[0] = (poly64x2_t) __builtin_aarch64_get_qregciv2di  (__o, 0);
+  __i.val[1] = (poly64x2_t) __builtin_aarch64_get_qregciv2di  (__o, 1);
+  __i.val[2] = (poly64x2_t) __builtin_aarch64_get_qregciv2di  (__o, 2);
+  return __i;
+}
+
 /* vld1q */
 
 __extension__ extern __inline float16x8_t
@@ -27497,6 +27865,706 @@ vst1q_lane_u64 (uint64_t *__a, uint64x2_t __b, const int __lane)
   *__a = __aarch64_vget_lane_any (__b, __lane);
 }
 
+/* vst1x2 */
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s64_x2 (int64_t * __a, int64x1x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  int64x2x2_t temp;
+  temp.val[0] = vcombine_s64 (val.val[0], vcreate_s64 (__AARCH64_INT64_C (0)));
+  temp.val[1] = vcombine_s64 (val.val[1], vcreate_s64 (__AARCH64_INT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u64_x2 (uint64_t * __a, uint64x1x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  uint64x2x2_t temp;
+  temp.val[0] = vcombine_u64 (val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_u64 (val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_f64_x2 (float64_t * __a, float64x1x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  float64x2x2_t temp;
+  temp.val[0] = vcombine_f64 (val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_f64 (val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2df ((__builtin_aarch64_simd_df *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s8_x2 (int8_t * __a, int8x8x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  int8x16x2_t temp;
+  temp.val[0] = vcombine_s8 (val.val[0], vcreate_s8 (__AARCH64_INT64_C (0)));
+  temp.val[1] = vcombine_s8 (val.val[1], vcreate_s8 (__AARCH64_INT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_p8_x2 (poly8_t * __a, poly8x8x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  poly8x16x2_t temp;
+  temp.val[0] = vcombine_p8 (val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_p8 (val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s16_x2 (int16_t * __a, int16x4x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  int16x8x2_t temp;
+  temp.val[0] = vcombine_s16 (val.val[0], vcreate_s16 (__AARCH64_INT64_C (0)));
+  temp.val[1] = vcombine_s16 (val.val[1], vcreate_s16 (__AARCH64_INT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_p16_x2 (poly16_t * __a, poly16x4x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  poly16x8x2_t temp;
+  temp.val[0] = vcombine_p16 (val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_p16 (val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s32_x2 (int32_t * __a, int32x2x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  int32x4x2_t temp;
+  temp.val[0] = vcombine_s32 (val.val[0], vcreate_s32 (__AARCH64_INT64_C (0)));
+  temp.val[1] = vcombine_s32 (val.val[1], vcreate_s32 (__AARCH64_INT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2v2si ((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u8_x2 (uint8_t * __a, uint8x8x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  uint8x16x2_t temp;
+  temp.val[0] = vcombine_u8 (val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_u8 (val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u16_x2 (uint16_t * __a, uint16x4x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  uint16x8x2_t temp;
+  temp.val[0] = vcombine_u16 (val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_u16 (val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u32_x2 (uint32_t * __a, uint32x2x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  uint32x4x2_t temp;
+  temp.val[0] = vcombine_u32 (val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_u32 (val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2v2si ((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_f16_x2 (float16_t * __a, float16x4x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  float16x8x2_t temp;
+  temp.val[0] = vcombine_f16 (val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_f16 (val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv8hf (__o, temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv8hf (__o, temp.val[1], 1);
+  __builtin_aarch64_st1x2v4hf (__a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_f32_x2 (float32_t * __a, float32x2x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  float32x4x2_t temp;
+  temp.val[0] = vcombine_f32 (val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_f32 (val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2v2sf ((__builtin_aarch64_simd_sf *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_p64_x2 (poly64_t * __a, poly64x1x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  poly64x2x2_t temp;
+  temp.val[0] = vcombine_p64 (val.val[0], vcreate_p64 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_p64 (val.val[1], vcreate_p64 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv2di_ssps (__o,
+					       (poly64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2di_ssps (__o,
+					       (poly64x2_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s8_x2 (int8_t * __a, int8x16x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_p8_x2 (poly8_t * __a, poly8x16x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s16_x2 (int16_t * __a, int16x8x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_p16_x2 (poly16_t * __a, poly16x8x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s32_x2 (int32_t * __a, int32x4x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v4si ((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s64_x2 (int64_t * __a, int64x2x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u8_x2 (uint8_t * __a, uint8x16x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u16_x2 (uint16_t * __a, uint16x8x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u32_x2 (uint32_t * __a, uint32x4x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v4si ((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u64_x2 (uint64_t * __a, uint64x2x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_f16_x2 (float16_t * __a, float16x8x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv8hf (__o, val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv8hf (__o, val.val[1], 1);
+  __builtin_aarch64_st1x2v8hf (__a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_f32_x2 (float32_t * __a, float32x4x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v4sf ((__builtin_aarch64_simd_sf *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_f64_x2 (float64_t * __a, float64x2x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v2df ((__builtin_aarch64_simd_df *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_p64_x2 (poly64_t * __a, poly64x2x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv2di_ssps (__o,
+					       (poly64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2di_ssps (__o,
+					       (poly64x2_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+/* vst1x3 */
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s64_x3 (int64_t * __a, int64x1x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  int64x2x3_t temp;
+  temp.val[0] = vcombine_s64 (val.val[0], vcreate_s64 (__AARCH64_INT64_C (0)));
+  temp.val[1] = vcombine_s64 (val.val[1], vcreate_s64 (__AARCH64_INT64_C (0)));
+  temp.val[2] = vcombine_s64 (val.val[2], vcreate_s64 (__AARCH64_INT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u64_x3 (uint64_t * __a, uint64x1x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  uint64x2x3_t temp;
+  temp.val[0] = vcombine_u64 (val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_u64 (val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_u64 (val.val[2], vcreate_u64 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_f64_x3 (float64_t * __a, float64x1x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  float64x2x3_t temp;
+  temp.val[0] = vcombine_f64 (val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_f64 (val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_f64 (val.val[2], vcreate_f64 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3df ((__builtin_aarch64_simd_df *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s8_x3 (int8_t * __a, int8x8x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  int8x16x3_t temp;
+  temp.val[0] = vcombine_s8 (val.val[0], vcreate_s8 (__AARCH64_INT64_C (0)));
+  temp.val[1] = vcombine_s8 (val.val[1], vcreate_s8 (__AARCH64_INT64_C (0)));
+  temp.val[2] = vcombine_s8 (val.val[2], vcreate_s8 (__AARCH64_INT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_p8_x3 (poly8_t * __a, poly8x8x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  poly8x16x3_t temp;
+  temp.val[0] = vcombine_p8 (val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_p8 (val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_p8 (val.val[2], vcreate_p8 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s16_x3 (int16_t * __a, int16x4x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  int16x8x3_t temp;
+  temp.val[0] = vcombine_s16 (val.val[0], vcreate_s16 (__AARCH64_INT64_C (0)));
+  temp.val[1] = vcombine_s16 (val.val[1], vcreate_s16 (__AARCH64_INT64_C (0)));
+  temp.val[2] = vcombine_s16 (val.val[2], vcreate_s16 (__AARCH64_INT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_p16_x3 (poly16_t * __a, poly16x4x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  poly16x8x3_t temp;
+  temp.val[0] = vcombine_p16 (val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_p16 (val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_p16 (val.val[2], vcreate_p16 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s32_x3 (int32_t * __a, int32x2x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  int32x4x3_t temp;
+  temp.val[0] = vcombine_s32 (val.val[0], vcreate_s32 (__AARCH64_INT64_C (0)));
+  temp.val[1] = vcombine_s32 (val.val[1], vcreate_s32 (__AARCH64_INT64_C (0)));
+  temp.val[2] = vcombine_s32 (val.val[2], vcreate_s32 (__AARCH64_INT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3v2si ((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u8_x3 (uint8_t * __a, uint8x8x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  uint8x16x3_t temp;
+  temp.val[0] = vcombine_u8 (val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_u8 (val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_u8 (val.val[2], vcreate_u8 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u16_x3 (uint16_t * __a, uint16x4x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  uint16x8x3_t temp;
+  temp.val[0] = vcombine_u16 (val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_u16 (val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_u16 (val.val[2], vcreate_u16 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u32_x3 (uint32_t * __a, uint32x2x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  uint32x4x3_t temp;
+  temp.val[0] = vcombine_u32 (val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_u32 (val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_u32 (val.val[2], vcreate_u32 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3v2si ((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_f16_x3 (float16_t * __a, float16x4x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  float16x8x3_t temp;
+  temp.val[0] = vcombine_f16 (val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_f16 (val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_f16 (val.val[2], vcreate_f16 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3v4hf ((__builtin_aarch64_simd_hf *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_f32_x3 (float32_t * __a, float32x2x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  float32x4x3_t temp;
+  temp.val[0] = vcombine_f32 (val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_f32 (val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_f32 (val.val[2], vcreate_f32 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3v2sf ((__builtin_aarch64_simd_sf *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_p64_x3 (poly64_t * __a, poly64x1x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  poly64x2x3_t temp;
+  temp.val[0] = vcombine_p64 (val.val[0], vcreate_p64 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_p64 (val.val[1], vcreate_p64 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_p64 (val.val[2], vcreate_p64 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
+					       (poly64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
+					       (poly64x2_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
+					       (poly64x2_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s8_x3 (int8_t * __a, int8x16x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_p8_x3 (poly8_t * __a, poly8x16x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s16_x3 (int16_t * __a, int16x8x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_p16_x3 (poly16_t * __a, poly16x8x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s32_x3 (int32_t * __a, int32x4x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v4si ((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s64_x3 (int64_t * __a, int64x2x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u8_x3 (uint8_t * __a, uint8x16x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u16_x3 (uint16_t * __a, uint16x8x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u32_x3 (uint32_t * __a, uint32x4x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v4si ((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u64_x3 (uint64_t * __a, uint64x2x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_f16_x3 (float16_t * __a, float16x8x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v8hf ((__builtin_aarch64_simd_hf *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_f32_x3 (float32_t * __a, float32x4x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v4sf ((__builtin_aarch64_simd_sf *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_f64_x3 (float64_t * __a, float64x2x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v2df ((__builtin_aarch64_simd_df *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_p64_x3 (poly64_t * __a, poly64x2x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
+					       (poly64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
+					       (poly64x2_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
+					       (poly64x2_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
 /* vstn */
 
 __extension__ extern __inline void
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1x3.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1x3.c
new file mode 100644
index 0000000..c37c72c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1x3.c
@@ -0,0 +1,82 @@
+/* We haven't implemented these intrinsics for arm yet.  */
+/* { dg-do run } */
+/* { dg-require-effective-target aarch64_little_endian } */
+/* { dg-options "-O3" } */
+
+#include <arm_neon.h>
+
+extern void abort (void);
+
+#define TESTMETH(BASE, ELTS, SUFFIX)	\
+int __attribute__ ((noinline))			\
+test_vld##SUFFIX##_x3 ()			\
+{						\
+  BASE##_t data[ELTS * 3];			\
+  BASE##_t temp[ELTS * 3];			\
+  BASE##x##ELTS##x##3##_t vectors;		\
+  int i,j;					\
+  for (i = 0; i < ELTS * 3; i++)		\
+    data [i] = (BASE##_t) 3*i;		\
+  asm volatile ("" : : : "memory");		\
+  vectors = vld1##SUFFIX##_x3 (data);		\
+  vst1##SUFFIX (temp, vectors.val[0]);		\
+  vst1##SUFFIX (&temp[ELTS], vectors.val[1]);	\
+  vst1##SUFFIX (&temp[ELTS * 2], vectors.val[2]);	\
+  asm volatile ("" : : : "memory");		\
+  for (j = 0; j < ELTS * 3; j++)		\
+    if (temp[j] != data[j]) 			\
+      return 1;					\
+  return 0;					\
+}
+
+#define VARIANTS_1(VARIANT)	\
+VARIANT (uint8, 8, _u8)		\
+VARIANT (uint16, 4, _u16)	\
+VARIANT (uint32, 2, _u32)	\
+VARIANT (uint64, 1, _u64)	\
+VARIANT (int8, 8, _s8)		\
+VARIANT (int16, 4, _s16)	\
+VARIANT (int32, 2, _s32)	\
+VARIANT (int64, 1, _s64)	\
+VARIANT (poly8, 8, _p8)		\
+VARIANT (poly16, 4, _p16)	\
+VARIANT (float16, 4, _f16)	\
+VARIANT (float32, 2, _f32)	\
+VARIANT (uint8, 16, q_u8)	\
+VARIANT (uint16, 8, q_u16)	\
+VARIANT (uint32, 4, q_u32)	\
+VARIANT (uint64, 2, q_u64)	\
+VARIANT (int8, 16, q_s8)	\
+VARIANT (int16, 8, q_s16)	\
+VARIANT (int32, 4, q_s32)	\
+VARIANT (int64, 2, q_s64)	\
+VARIANT (poly8, 16, q_p8)	\
+VARIANT (poly16, 8, q_p16)	\
+VARIANT (float16, 8, q_f16)	\
+VARIANT (float32, 4, q_f32)
+
+#ifdef __aarch64__
+#define VARIANTS(VARIANT) VARIANTS_1(VARIANT)	\
+VARIANT (float64, 1, _f64)			\
+VARIANT (float64, 2, q_f64)
+#else
+#define VARIANTS(VARIANT) VARIANTS_1(VARIANT)
+#endif
+
+
+/* Tests of vld1_x3 and vld1q_x3.  */
+VARIANTS (TESTMETH)
+
+#define CHECK(BASE, ELTS, SUFFIX)	\
+  if (test_vld##SUFFIX##_x3 () != 0)	\
+    abort ();
+
+int
+main (int argc, char **argv)
+{
+  VARIANTS (CHECK)
+
+  return 0;
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1x2.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1x2.c
new file mode 100644
index 0000000..3b6797c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1x2.c
@@ -0,0 +1,80 @@
+/* We haven't implemented these intrinsics for arm yet.  */
+/* { dg-do run } */
+/* { dg-require-effective-target aarch64_little_endian } */
+/* { dg-options "-O3" } */
+
+#include <arm_neon.h>
+
+extern void abort (void);
+
+#define TESTMETH(BASE, ELTS, SUFFIX)	\
+int __attribute__ ((noinline))			\
+test_vst1##SUFFIX##_x2 ()			\
+{						\
+  BASE##_t data[ELTS * 2];			\
+  BASE##_t temp[ELTS * 2];			\
+  BASE##x##ELTS##x##2##_t vectors;		\
+  int i,j;					\
+  for (i = 0; i < ELTS * 2; i++)		\
+    data [i] = (BASE##_t) 2*i;		\
+  asm volatile ("" : : : "memory");		\
+  vectors.val[0] = vld1##SUFFIX (data);		\
+  vectors.val[1] = vld1##SUFFIX (&data[ELTS]);	\
+  vst1##SUFFIX##_x2 (temp, vectors);		\
+  asm volatile ("" : : : "memory");		\
+  for (j = 0; j < ELTS * 2; j++)		\
+    if (temp[j] != data[j])			\
+      return 1;					\
+  return 0;					\
+}
+
+#define VARIANTS_1(VARIANT)	\
+VARIANT (uint8, 8, _u8)		\
+VARIANT (uint16, 4, _u16)	\
+VARIANT (uint32, 2, _u32)	\
+VARIANT (uint64, 1, _u64)	\
+VARIANT (int8, 8, _s8)		\
+VARIANT (int16, 4, _s16)	\
+VARIANT (int32, 2, _s32)	\
+VARIANT (int64, 1, _s64)	\
+VARIANT (poly8, 8, _p8)		\
+VARIANT (poly16, 4, _p16)	\
+VARIANT (float16, 4, _f16)	\
+VARIANT (float32, 2, _f32)	\
+VARIANT (uint8, 16, q_u8)	\
+VARIANT (uint16, 8, q_u16)	\
+VARIANT (uint32, 4, q_u32)	\
+VARIANT (uint64, 2, q_u64)	\
+VARIANT (int8, 16, q_s8)	\
+VARIANT (int16, 8, q_s16)	\
+VARIANT (int32, 4, q_s32)	\
+VARIANT (int64, 2, q_s64)	\
+VARIANT (poly8, 16, q_p8)	\
+VARIANT (poly16, 8, q_p16)	\
+VARIANT (float16, 8, q_f16)	\
+VARIANT (float32, 4, q_f32)
+
+#ifdef __aarch64__
+#define VARIANTS(VARIANT) VARIANTS_1(VARIANT)	\
+VARIANT (float64, 1, _f64)			\
+VARIANT (float64, 2, q_f64)
+#else
+#define VARIANTS(VARIANT) VARIANTS_1(VARIANT)
+#endif
+
+/* Tests of vst1_x2 and vst1q_x2.  */
+VARIANTS (TESTMETH)
+
+#define CHECK(BASE, ELTS, SUFFIX)	\
+  if (test_vst1##SUFFIX##_x2 () != 0)	\
+    abort ();
+
+int
+main (int argc, char **argv)
+{
+  VARIANTS (CHECK)
+
+  return 0;
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1x3.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1x3.c
new file mode 100644
index 0000000..1709115
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1x3.c
@@ -0,0 +1,81 @@
+/* We haven't implemented these intrinsics for arm yet.  */
+/* { dg-do run } */
+/* { dg-require-effective-target aarch64_little_endian } */
+/* { dg-options "-O3" } */
+
+#include <arm_neon.h>
+
+extern void abort (void);
+
+#define TESTMETH(BASE, ELTS, SUFFIX)	\
+int __attribute__ ((noinline))			\
+test_vst1##SUFFIX##_x3 ()			\
+{						\
+  BASE##_t data[ELTS * 3];			\
+  BASE##_t temp[ELTS * 3];			\
+  BASE##x##ELTS##x##3##_t vectors;		\
+  int i,j;					\
+  for (i = 0; i < ELTS * 3; i++)		\
+    data [i] = (BASE##_t) 3*i;		\
+  asm volatile ("" : : : "memory");		\
+  vectors.val[0] = vld1##SUFFIX (data);		\
+  vectors.val[1] = vld1##SUFFIX (&data[ELTS]);	\
+  vectors.val[2] = vld1##SUFFIX (&data[ELTS * 2]);	\
+  vst1##SUFFIX##_x3 (temp, vectors);		\
+  asm volatile ("" : : : "memory");		\
+  for (j = 0; j < ELTS * 3; j++)		\
+    if (temp[j] != data[j])			\
+      return 1;					\
+  return 0;					\
+}
+
+#define VARIANTS_1(VARIANT)	\
+VARIANT (uint8, 8, _u8)		\
+VARIANT (uint16, 4, _u16)	\
+VARIANT (uint32, 2, _u32)	\
+VARIANT (uint64, 1, _u64)	\
+VARIANT (int8, 8, _s8)		\
+VARIANT (int16, 4, _s16)	\
+VARIANT (int32, 2, _s32)	\
+VARIANT (int64, 1, _s64)	\
+VARIANT (poly8, 8, _p8)		\
+VARIANT (poly16, 4, _p16)	\
+VARIANT (float16, 4, _f16)	\
+VARIANT (float32, 2, _f32)	\
+VARIANT (uint8, 16, q_u8)	\
+VARIANT (uint16, 8, q_u16)	\
+VARIANT (uint32, 4, q_u32)	\
+VARIANT (uint64, 2, q_u64)	\
+VARIANT (int8, 16, q_s8)	\
+VARIANT (int16, 8, q_s16)	\
+VARIANT (int32, 4, q_s32)	\
+VARIANT (int64, 2, q_s64)	\
+VARIANT (poly8, 16, q_p8)	\
+VARIANT (poly16, 8, q_p16)	\
+VARIANT (float16, 8, q_f16)	\
+VARIANT (float32, 4, q_f32)
+
+#ifdef __aarch64__
+#define VARIANTS(VARIANT) VARIANTS_1(VARIANT)	\
+VARIANT (float64, 1, _f64)			\
+VARIANT (float64, 2, q_f64)
+#else
+#define VARIANTS(VARIANT) VARIANTS_1(VARIANT)
+#endif
+
+/* Tests of vst1_x3 and vst1q_x3.  */
+VARIANTS (TESTMETH)
+
+#define CHECK(BASE, ELTS, SUFFIX)	\
+  if (test_vst1##SUFFIX##_x3 () != 0)	\
+    abort ();
+
+int
+main (int argc, char **argv)
+{
+  VARIANTS (CHECK)
+
+  return 0;
+}
+
Sudakshina Das April 11, 2018, 4:19 p.m. | #9
Hi Sameera

On 11/04/18 13:05, Sameera Deshpande wrote:
> On 11 April 2018 at 15:53, Sudakshina Das <sudi.das@arm.com> wrote:

>> Hi Sameera

>>

>>

>> On 11/04/18 09:04, Sameera Deshpande wrote:

>>>

>>> On 10 April 2018 at 20:07, Sudakshina Das <sudi.das@arm.com> wrote:

>>>>

>>>> Hi Sameera

>>>>

>>>>

>>>> On 10/04/18 11:20, Sameera Deshpande wrote:

>>>>>

>>>>>

>>>>> On 7 April 2018 at 01:25, Christophe Lyon <christophe.lyon@linaro.org>

>>>>> wrote:

>>>>>>

>>>>>>

>>>>>> Hi,

>>>>>>

>>>>>> 2018-04-06 12:15 GMT+02:00 Sameera Deshpande

>>>>>> <sameera.deshpande@linaro.org>:

>>>>>>>

>>>>>>>

>>>>>>> Hi Christophe,

>>>>>>>

>>>>>>> Please find attached the updated patch with testcases.

>>>>>>>

>>>>>>> Ok for trunk?

>>>>>>

>>>>>>

>>>>>>

>>>>>> Thanks for the update.

>>>>>>

>>>>>> Since the new intrinsics are only available on aarch64, you want to

>>>>>> prevent the tests from running on arm.

>>>>>> Indeed gcc.target/aarch64/advsimd-intrinsics/ is shared between the two

>>>>>> targets.

>>>>>> There are several examples on how to do that in that directory.

>>>>>>

>>>>>> I have also noticed that the tests fail at execution on aarch64_be.

>>>>>>

>>>>>> I didn't look at the patch in details.

>>>>>>

>>>>>> Christophe

>>>>>>

>>>>>>

>>>>>>>

>>>>>>> - Thanks and regards,

>>>>>>>      Sameera D.

>>>>>>>

>>>>>>> 2017-12-14 22:17 GMT+05:30 Christophe Lyon

>>>>>>> <christophe.lyon@linaro.org>:

>>>>>>>>

>>>>>>>>

>>>>>>>> 2017-12-14 9:29 GMT+01:00 Sameera Deshpande

>>>>>>>> <sameera.deshpande@linaro.org>:

>>>>>>>>>

>>>>>>>>>

>>>>>>>>> Hi!

>>>>>>>>>

>>>>>>>>> Please find attached the patch implementing vld1_*_x3, vst1_*_x2 and

>>>>>>>>> vst1_*_x3 intrinsics as defined by Neon document.

>>>>>>>>>

>>>>>>>>> Ok for trunk?

>>>>>>>>>

>>>>>>>>> - Thanks and regards,

>>>>>>>>>      Sameera D.

>>>>>>>>>

>>>>>>>>> gcc/Changelog:

>>>>>>>>>

>>>>>>>>> 2017-11-14  Sameera Deshpande  <sameera.deshpande@linaro.org>

>>>>>>>>>

>>>>>>>>>

>>>>>>>>>            * config/aarch64/aarch64-simd-builtins.def (ld1x3): New.

>>>>>>>>>            (st1x2): Likewise.

>>>>>>>>>            (st1x3): Likewise.

>>>>>>>>>            * config/aarch64/aarch64-simd.md

>>>>>>>>> (aarch64_ld1x3<VALLDIF:mode>): New pattern.

>>>>>>>>>            (aarch64_ld1_x3_<mode>): Likewise

>>>>>>>>>            (aarch64_st1x2<VALLDIF:mode>): Likewise

>>>>>>>>>            (aarch64_st1_x2_<mode>): Likewise

>>>>>>>>>            (aarch64_st1x3<VALLDIF:mode>): Likewise

>>>>>>>>>            (aarch64_st1_x3_<mode>): Likewise

>>>>>>>>>            * config/aarch64/arm_neon.h (vld1_u8_x3): New function.

>>>>>>>>>            (vld1_s8_x3): Likewise.

>>>>>>>>>            (vld1_u16_x3): Likewise.

>>>>>>>>>            (vld1_s16_x3): Likewise.

>>>>>>>>>            (vld1_u32_x3): Likewise.

>>>>>>>>>            (vld1_s32_x3): Likewise.

>>>>>>>>>            (vld1_u64_x3): Likewise.

>>>>>>>>>            (vld1_s64_x3): Likewise.

>>>>>>>>>            (vld1_fp16_x3): Likewise.

>>>>>>>>>            (vld1_f32_x3): Likewise.

>>>>>>>>>            (vld1_f64_x3): Likewise.

>>>>>>>>>            (vld1_p8_x3): Likewise.

>>>>>>>>>            (vld1_p16_x3): Likewise.

>>>>>>>>>            (vld1_p64_x3): Likewise.

>>>>>>>>>            (vld1q_u8_x3): Likewise.

>>>>>>>>>            (vld1q_s8_x3): Likewise.

>>>>>>>>>            (vld1q_u16_x3): Likewise.

>>>>>>>>>            (vld1q_s16_x3): Likewise.

>>>>>>>>>            (vld1q_u32_x3): Likewise.

>>>>>>>>>            (vld1q_s32_x3): Likewise.

>>>>>>>>>            (vld1q_u64_x3): Likewise.

>>>>>>>>>            (vld1q_s64_x3): Likewise.

>>>>>>>>>            (vld1q_f16_x3): Likewise.

>>>>>>>>>            (vld1q_f32_x3): Likewise.

>>>>>>>>>            (vld1q_f64_x3): Likewise.

>>>>>>>>>            (vld1q_p8_x3): Likewise.

>>>>>>>>>            (vld1q_p16_x3): Likewise.

>>>>>>>>>            (vld1q_p64_x3): Likewise.

>>>>>>>>>            (vst1_s64_x2): Likewise.

>>>>>>>>>            (vst1_u64_x2): Likewise.

>>>>>>>>>            (vst1_f64_x2):

>>>>>>>>>

>>>>>>>>> Likewise.patchurl=http://people.linaro.org/~christophe.lyon/armv8_2-fp16-scalar-2.patch3

>>>>>>

>>>>>>

>>>>>> patchname=armv8_2-fp16-scalar-2.patch3

>>>>>> refrev=259064

>>>>>> email_to=christophe.lyon@linaro.org

>>>>>>

>>>>>>>>>            (vst1_s8_x2): Likewise.

>>>>>>>>>            (vst1_p8_x2): Likewise.

>>>>>>>>>            (vst1_s16_x2): Likewise.

>>>>>>>>>            (vst1_p16_x2): Likewise.

>>>>>>>>>            (vst1_s32_x2): Likewise.

>>>>>>>>>            (vst1_u8_x2): Likewise.

>>>>>>>>>            (vst1_u16_x2): Likewise.

>>>>>>>>>            (vst1_u32_x2): Likewise.

>>>>>>>>>            (vst1_f16_x2): Likewise.

>>>>>>>>>            (vst1_f32_x2): Likewise.

>>>>>>>>>            (vst1_p64_x2): Likewise.

>>>>>>>>>            (vst1q_s8_x2): Likewise.

>>>>>>>>>            (vst1q_p8_x2): Likewise.

>>>>>>>>>            (vst1q_s16_x2): Likewise.

>>>>>>>>>            (vst1q_p16_x2): Likewise.

>>>>>>>>>            (vst1q_s32_x2): Likewise.

>>>>>>>>>            (vst1q_s64_x2): Likewise.

>>>>>>>>>            (vst1q_u8_x2): Likewise.

>>>>>>>>>            (vst1q_u16_x2): Likewise.

>>>>>>>>>            (vst1q_u32_x2): Likewise.

>>>>>>>>>            (vst1q_u64_x2): Likewise.

>>>>>>>>>            (vst1q_f16_x2): Likewise.

>>>>>>>>>            (vst1q_f32_x2): Likewise.

>>>>>>>>>            (vst1q_f64_x2): Likewise.

>>>>>>>>>            (vst1q_p64_x2): Likewise.

>>>>>>>>>            (vst1_s64_x3): Likewise.

>>>>>>>>>            (vst1_u64_x3): Likewise.

>>>>>>>>>            (vst1_f64_x3): Likewise.

>>>>>>>>>            (vst1_s8_x3): Likewise.

>>>>>>>>>            (vst1_p8_x3): Likewise.

>>>>>>>>>            (vst1_s16_x3): Likewise.

>>>>>>>>>            (vst1_p16_x3): Likewise.

>>>>>>>>>            (vst1_s32_x3): Likewise.

>>>>>>>>>            (vst1_u8_x3): Likewise.

>>>>>>>>>            (vst1_u16_x3): Likewise.

>>>>>>>>>            (vst1_u32_x3): Likewise.

>>>>>>>>>            (vst1_f16_x3): Likewise.

>>>>>>>>>            (vst1_f32_x3): Likewise.

>>>>>>>>>            (vst1_p64_x3): Likewise.

>>>>>>>>>            (vst1q_s8_x3): Likewise.

>>>>>>>>>            (vst1q_p8_x3): Likewise.

>>>>>>>>>            (vst1q_s16_x3): Likewise.

>>>>>>>>>            (vst1q_p16_x3): Likewise.

>>>>>>>>>            (vst1q_s32_x3): Likewise.

>>>>>>>>>            (vst1q_s64_x3): Likewise.

>>>>>>>>>            (vst1q_u8_x3): Likewise.

>>>>>>>>>            (vst1q_u16_x3): Likewise.

>>>>>>>>>            (vst1q_u32_x3): Likewise.

>>>>>>>>>            (vst1q_u64_x3): Likewise.

>>>>>>>>>            (vst1q_f16_x3): Likewise.

>>>>>>>>>            (vst1q_f32_x3): Likewise.

>>>>>>>>>            (vst1q_f64_x3): Likewise.

>>>>>>>>>            (vst1q_p64_x3): Likewise.

>>>>>>>>

>>>>>>>>

>>>>>>>>

>>>>>>>> Hi,

>>>>>>>> I'm not a maintainer, but I suspect you should add some tests.

>>>>>>>>

>>>>>>>> Christophe

>>>>>>>

>>>>>>>

>>>>>>>

>>>>>>>

>>>>>>>

>>>>>>> --

>>>>>>> - Thanks and regards,

>>>>>>>      Sameera D.

>>>>>

>>>>>

>>>>>

>>>>> Hi Christophe,

>>>>>

>>>>> Please find attached the updated patch. Similar to the testcase

>>>>> vld1x2.c, I have updated the testcases to mark them XFAIL for ARM, as

>>>>> the intrinsics are not implemented yet. I have also added required

>>>>> target to be little endian.

>>>>

>>>>

>>>>

>>>> I am not a maintainer either. Shouldn't these intrinsics be supported

>>>> even for big endian?

>>>>

>>>

>>> Yes, they should be implemented, however it is out of scope of this patch.

>>>

>>>>   From your patch:

>>>>

>>>>> diff --git

>>>>> a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1x3.c

>>>>> b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1x3.c

>>>>

>>>>

>>>> new file mode 100644

>>>> index 0000000..c37c72c

>>>> --- /dev/null

>>>> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1x3.c

>>>> @@ -0,0 +1,82 @@

>>>> +/* We haven't implemented these intrinsics for arm yet.  */

>>>> +/* { dg-xfail-if "" { arm*-*-* } } */

>>>> +/* { dg-require-effective-target aarch64_little_endian } */

>>>>

>>>> According to

>>>>

>>>> https://gcc.gnu.org/onlinedocs/gccint/Directives.html#Skip-a-test-for-some-targets

>>>> this must follow the dg-do directive.

>>>>

>>>> Also I think the require-effective-target directive will only allow

>>>> the test to run on aarch64-*-*-* so the xfail on arm-*-*-* is kind

>>>> of not helpful. Maybe something like:

>>>>

>>>> /* { dg-require-effective-target aarch64_little_endian { target {

>>>> aarch64*-*-* } } } */

>>>

>>>

>>> Ok, then I will restrict this test only for aarch64_little_endian, as

>>> that is the purpose of this test.

>>>

>>>>

>>>> So that the check is not performed on arm-*-*-* and the test runs

>>>> and fails and then the xfail makes more sense.

>>>>

>>>> In case the big endian version is also expected to be implemented

>>>> in the future, something like:

>>>>

>>>> /* { dg-xfail-if "" { arm*-*-* || aarch64_big_endian } } */

>>>>

>>>> or

>>>>

>>>> /* { dg-do run { xfail { arm*-*-* || aarch64_big_endian } } } */

>>>>

>>>> would be simpler. (PS: I haven't tested any of these directive myself).

>>>>

>>>

>>> Please find attached updated patch.

>>

>>

>> Thank you for making the edits. One last nit from my side, as I

>> mentioned earlier the dg-require-effective directive should go after

>> the dg-do directive as mentioned here:

>> https://gcc.gnu.org/onlinedocs/gccint/Directives.html#Skip-a-test-for-some-targets

>>

> 

> Thanks for pointing that out.

> Please find attached the updated patch.


Thank you for updating the patch. The maintainers would be able to 
review the rest of the patch more appropriately.

Thanks
Sudi

> 

>> Thanks

>> Sudi

>>

>>

>>>

>>>>

>>>> Thanks

>>>> Sudi

>>>>

>>>>

>>>> +/* { dg-do run } */

>>>> +/* { dg-options "-O3" } */

>>>>

>>>>

>>>>

>>>>> Ok for thrunk?

>>>>>

>>>>> - Thanks and regards,

>>>>>      Sameera D.

>>>>>

>>>>> gcc/Changelog:

>>>>>

>>>>> 2018-04-10  Sameera Deshpande  <sameera.deshpande@linaro.org>

>>>>>

>>>>>

>>>>>            * config/aarch64/aarch64-simd-builtins.def (ld1x3): New.

>>>>>            (st1x2): Likewise.

>>>>>            (st1x3): Likewise.

>>>>>            * config/aarch64/aarch64-simd.md

>>>>>            (aarch64_ld1x3<VALLDIF:mode>): New pattern.

>>>>>            (aarch64_ld1_x3_<mode>): Likewise

>>>>>            (aarch64_st1x2<VALLDIF:mode>): Likewise

>>>>>            (aarch64_st1_x2_<mode>): Likewise

>>>>>            (aarch64_st1x3<VALLDIF:mode>): Likewise

>>>>>            (aarch64_st1_x3_<mode>): Likewise

>>>>>            * config/aarch64/arm_neon.h (vld1_u8_x3): New function.

>>>>>            (vld1_s8_x3): Likewise.

>>>>>            (vld1_u16_x3): Likewise.

>>>>>            (vld1_s16_x3): Likewise.

>>>>>            (vld1_u32_x3): Likewise.

>>>>>            (vld1_s32_x3): Likewise.

>>>>>            (vld1_u64_x3): Likewise.

>>>>>            (vld1_s64_x3): Likewise.

>>>>>            (vld1_f16_x3): Likewise.

>>>>>            (vld1_f32_x3): Likewise.

>>>>>            (vld1_f64_x3): Likewise.

>>>>>            (vld1_p8_x3): Likewise.

>>>>>            (vld1_p16_x3): Likewise.

>>>>>            (vld1_p64_x3): Likewise.

>>>>>            (vld1q_u8_x3): Likewise.

>>>>>            (vld1q_s8_x3): Likewise.

>>>>>            (vld1q_u16_x3): Likewise.

>>>>>            (vld1q_s16_x3): Likewise.

>>>>>            (vld1q_u32_x3): Likewise.

>>>>>            (vld1q_s32_x3): Likewise.

>>>>>            (vld1q_u64_x3): Likewise.

>>>>>            (vld1q_s64_x3): Likewise.

>>>>>            (vld1q_f16_x3): Likewise.

>>>>>            (vld1q_f32_x3): Likewise.

>>>>>            (vld1q_f64_x3): Likewise.

>>>>>            (vld1q_p8_x3): Likewise.

>>>>>            (vld1q_p16_x3): Likewise.

>>>>>            (vld1q_p64_x3): Likewise.

>>>>>            (vst1_s64_x2): Likewise.

>>>>>            (vst1_u64_x2): Likewise.

>>>>>            (vst1_f64_x2): Likewise.

>>>>>            (vst1_s8_x2): Likewise.

>>>>>            (vst1_p8_x2): Likewise.

>>>>>            (vst1_s16_x2): Likewise.

>>>>>            (vst1_p16_x2): Likewise.

>>>>>            (vst1_s32_x2): Likewise.

>>>>>            (vst1_u8_x2): Likewise.

>>>>>            (vst1_u16_x2): Likewise.

>>>>>            (vst1_u32_x2): Likewise.

>>>>>            (vst1_f16_x2): Likewise.

>>>>>            (vst1_f32_x2): Likewise.

>>>>>            (vst1_p64_x2): Likewise.

>>>>>            (vst1q_s8_x2): Likewise.

>>>>>            (vst1q_p8_x2): Likewise.

>>>>>            (vst1q_s16_x2): Likewise.

>>>>>            (vst1q_p16_x2): Likewise.

>>>>>            (vst1q_s32_x2): Likewise.

>>>>>            (vst1q_s64_x2): Likewise.

>>>>>            (vst1q_u8_x2): Likewise.

>>>>>            (vst1q_u16_x2): Likewise.

>>>>>            (vst1q_u32_x2): Likewise.

>>>>>            (vst1q_u64_x2): Likewise.

>>>>>            (vst1q_f16_x2): Likewise.

>>>>>            (vst1q_f32_x2): Likewise.

>>>>>            (vst1q_f64_x2): Likewise.

>>>>>            (vst1q_p64_x2): Likewise.

>>>>>            (vst1_s64_x3): Likewise.

>>>>>            (vst1_u64_x3): Likewise.

>>>>>            (vst1_f64_x3): Likewise.

>>>>>            (vst1_s8_x3): Likewise.

>>>>>            (vst1_p8_x3): Likewise.

>>>>>            (vst1_s16_x3): Likewise.

>>>>>            (vst1_p16_x3): Likewise.

>>>>>            (vst1_s32_x3): Likewise.

>>>>>            (vst1_u8_x3): Likewise.

>>>>>            (vst1_u16_x3): Likewise.

>>>>>            (vst1_u32_x3): Likewise.

>>>>>            (vst1_f16_x3): Likewise.

>>>>>            (vst1_f32_x3): Likewise.

>>>>>            (vst1_p64_x3): Likewise.

>>>>>            (vst1q_s8_x3): Likewise.

>>>>>            (vst1q_p8_x3): Likewise.

>>>>>            (vst1q_s16_x3): Likewise.

>>>>>            (vst1q_p16_x3): Likewise.

>>>>>            (vst1q_s32_x3): Likewise.

>>>>>            (vst1q_s64_x3): Likewise.

>>>>>            (vst1q_u8_x3): Likewise.

>>>>>            (vst1q_u16_x3): Likewise.

>>>>>            (vst1q_u32_x3): Likewise.

>>>>>            (vst1q_u64_x3): Likewise.

>>>>>            (vst1q_f16_x3): Likewise.

>>>>>            (vst1q_f32_x3): Likewise.

>>>>>            (vst1q_f64_x3): Likewise.

>>>>>            (vst1q_p64_x3): Likewise.

>>>>>

>>>>> gcc/testsuite/Changelog:

>>>>>

>>>>> 2018-04-10  Sameera Deshpande  <sameera.deshpande@linaro.org>

>>>>>

>>>>>            * gcc.target/aarch64/advsimd-intrinsics/vld1x3.c: New test for

>>>>> vld1x3 intrinsics for aarch64_little_endian.

>>>>>            * gcc.target/aarch64/advsimd-intrinsics/vst1x2.c: New test for

>>>>> vst1x2 intrinsics for aarch64_little_endian.

>>>>>            * gcc.target/aarch64/advsimd-intrinsics/vst1x3.c: New test for

>>>>> vst1x3 intrinsics for aarch64_little_endian.

>>>>>

>>>>

>>>

>>>

>>>

>>

> 

> 

>
James Greenhalgh April 13, 2018, 2:34 p.m. | #10
On Fri, Apr 06, 2018 at 08:55:47PM +0100, Christophe Lyon wrote:
> Hi,

> 

> 2018-04-06 12:15 GMT+02:00 Sameera Deshpande <sameera.deshpande@linaro.org>:

> > Hi Christophe,

> >

> > Please find attached the updated patch with testcases.

> >

> > Ok for trunk?

> 

> Thanks for the update.

> 

> Since the new intrinsics are only available on aarch64, you want to

> prevent the tests from running on arm.

> Indeed gcc.target/aarch64/advsimd-intrinsics/ is shared between the two targets.

> There are several examples on how to do that in that directory.

> 

> I have also noticed that the tests fail at execution on aarch64_be.


I think this is important to fix. We don't want the big-endian target to have
failing implementations of the Neon intrinsics. What is the nature of the
failure?

From what I can see, nothing in the patch prevents using these intrinsics
on big-endian, so either the intrinsics behaviour is wrong (we have a wrong
code bug), or the testcase expected behaviour is wrong.

I don't think disabling the test for big-endian is the right fix. We should
either fix the intrinsics, or fix the testcase.

Thanks,
James
Sameera Deshpande April 13, 2018, 2:39 p.m. | #11
On Fri 13 Apr, 2018, 8:04 PM James Greenhalgh, <james.greenhalgh@arm.com>
wrote:

> On Fri, Apr 06, 2018 at 08:55:47PM +0100, Christophe Lyon wrote:

> > Hi,

> >

> > 2018-04-06 12:15 GMT+02:00 Sameera Deshpande <

> sameera.deshpande@linaro.org>:

> > > Hi Christophe,

> > >

> > > Please find attached the updated patch with testcases.

> > >

> > > Ok for trunk?

> >

> > Thanks for the update.

> >

> > Since the new intrinsics are only available on aarch64, you want to

> > prevent the tests from running on arm.

> > Indeed gcc.target/aarch64/advsimd-intrinsics/ is shared between the two

> targets.

> > There are several examples on how to do that in that directory.

> >

> > I have also noticed that the tests fail at execution on aarch64_be.

>

> I think this is important to fix. We don't want the big-endian target to

> have

> failing implementations of the Neon intrinsics. What is the nature of the

> failure?

>

> From what I can see, nothing in the patch prevents using these intrinsics

> on big-endian, so either the intrinsics behaviour is wrong (we have a wrong

> code bug), or the testcase expected behaviour is wrong.

>

> I don't think disabling the test for big-endian is the right fix. We should

> either fix the intrinsics, or fix the testcase.

>

> Thanks,

> James

>

> Hi James,



As the tests assume the little endian order of elements while checking the
results, the tests are failing for big endian targets. So, the failures are
not because of intrinsic implementations, but because of the testcase.

- Thanks and regards,
  Sameera D.
James Greenhalgh April 13, 2018, 2:51 p.m. | #12
On Fri, Apr 13, 2018 at 03:39:32PM +0100, Sameera Deshpande wrote:
> On Fri 13 Apr, 2018, 8:04 PM James Greenhalgh, <james.greenhalgh@arm.com<mailto:james.greenhalgh@arm.com>> wrote:

> On Fri, Apr 06, 2018 at 08:55:47PM +0100, Christophe Lyon wrote:

> > Hi,

> >

> > 2018-04-06 12:15 GMT+02:00 Sameera Deshpande <sameera.deshpande@linaro.org<mailto:sameera.deshpande@linaro.org>>:

> > > Hi Christophe,

> > >

> > > Please find attached the updated patch with testcases.

> > >

> > > Ok for trunk?

> >

> > Thanks for the update.

> >

> > Since the new intrinsics are only available on aarch64, you want to

> > prevent the tests from running on arm.

> > Indeed gcc.target/aarch64/advsimd-intrinsics/ is shared between the two targets.

> > There are several examples on how to do that in that directory.

> >

> > I have also noticed that the tests fail at execution on aarch64_be.

> 

> I think this is important to fix. We don't want the big-endian target to have

> failing implementations of the Neon intrinsics. What is the nature of the

> failure?

> 

> From what I can see, nothing in the patch prevents using these intrinsics

> on big-endian, so either the intrinsics behaviour is wrong (we have a wrong

> code bug), or the testcase expected behaviour is wrong.

> 

> I don't think disabling the test for big-endian is the right fix. We should

> either fix the intrinsics, or fix the testcase.

> 

> Thanks,

> James

> 

> Hi James,

> 

> As the tests assume the little endian order of elements while checking the

> results, the tests are failing for big endian targets. So, the failures are

> not because of intrinsic implementations, but because of the testcase.


The testcase is a little hard to follow through the macros, but why would
this be the case?

ld1 is deterministic on big and little endian for which elements will be
loaded from memory, as is st1.

My expectation would be that:

  int __attribute__ ((noinline))
  test_vld_u16_x3 ()
  {
    uint16_t data[3 * 3];
    uint16_t temp[3 * 3];
    uint16x4x3_t vectors;
    int i,j;
    for (i = 0; i < 3 * 3; i++)
      data [i] = (uint16_t) 3*i;
    asm volatile ("" : : : "memory");
    vectors = vld1_u16_x3 (data);
    vst1_u16 (temp, vectors.val[0]);
    vst1_u16 (&temp[3], vectors.val[1]);
    vst1_u16 (&temp[3 * 2], vectors.val[2]);
    asm volatile ("" : : : "memory");
    for (j = 0; j < 3 * 3; j++)
      if (temp[j] != data[j])
        return 1;
    return 0;
  }

would work equally well for big- or little-endian.

I think this is more likely to be an intrinsics implementation bug.

Thanks,
James
Sameera Deshpande April 30, 2018, 11:35 p.m. | #13
On 13 April 2018 at 20:21, James Greenhalgh <james.greenhalgh@arm.com> wrote:
> On Fri, Apr 13, 2018 at 03:39:32PM +0100, Sameera Deshpande wrote:

>> On Fri 13 Apr, 2018, 8:04 PM James Greenhalgh, <james.greenhalgh@arm.com<mailto:james.greenhalgh@arm.com>> wrote:

>> On Fri, Apr 06, 2018 at 08:55:47PM +0100, Christophe Lyon wrote:

>> > Hi,

>> >

>> > 2018-04-06 12:15 GMT+02:00 Sameera Deshpande <sameera.deshpande@linaro.org<mailto:sameera.deshpande@linaro.org>>:

>> > > Hi Christophe,

>> > >

>> > > Please find attached the updated patch with testcases.

>> > >

>> > > Ok for trunk?

>> >

>> > Thanks for the update.

>> >

>> > Since the new intrinsics are only available on aarch64, you want to

>> > prevent the tests from running on arm.

>> > Indeed gcc.target/aarch64/advsimd-intrinsics/ is shared between the two targets.

>> > There are several examples on how to do that in that directory.

>> >

>> > I have also noticed that the tests fail at execution on aarch64_be.

>>

>> I think this is important to fix. We don't want the big-endian target to have

>> failing implementations of the Neon intrinsics. What is the nature of the

>> failure?

>>

>> From what I can see, nothing in the patch prevents using these intrinsics

>> on big-endian, so either the intrinsics behaviour is wrong (we have a wrong

>> code bug), or the testcase expected behaviour is wrong.

>>

>> I don't think disabling the test for big-endian is the right fix. We should

>> either fix the intrinsics, or fix the testcase.

>>

>> Thanks,

>> James

>>

>> Hi James,

>>

>> As the tests assume the little endian order of elements while checking the

>> results, the tests are failing for big endian targets. So, the failures are

>> not because of intrinsic implementations, but because of the testcase.

>

> The testcase is a little hard to follow through the macros, but why would

> this be the case?

>

> ld1 is deterministic on big and little endian for which elements will be

> loaded from memory, as is st1.

>

> My expectation would be that:

>

>   int __attribute__ ((noinline))

>   test_vld_u16_x3 ()

>   {

>     uint16_t data[3 * 3];

>     uint16_t temp[3 * 3];

>     uint16x4x3_t vectors;

>     int i,j;

>     for (i = 0; i < 3 * 3; i++)

>       data [i] = (uint16_t) 3*i;

>     asm volatile ("" : : : "memory");

>     vectors = vld1_u16_x3 (data);

>     vst1_u16 (temp, vectors.val[0]);

>     vst1_u16 (&temp[3], vectors.val[1]);

>     vst1_u16 (&temp[3 * 2], vectors.val[2]);

>     asm volatile ("" : : : "memory");

>     for (j = 0; j < 3 * 3; j++)

>       if (temp[j] != data[j])

>         return 1;

>     return 0;

>   }

>

> would work equally well for big- or little-endian.

>

> I think this is more likely to be an intrinsics implementation bug.

>

> Thanks,

> James

>


Hi James,

Please find attached the updated patch, which now passes for little as
well as big endian.
Ok for trunk?

-- 
- Thanks and regards,
  Sameera D.

gcc/Changelog:

2018-05-01  Sameera Deshpande  <sameera.deshpande@linaro.org>


        * config/aarch64/aarch64-simd-builtins.def (ld1x3): New.
        (st1x2): Likewise.
        (st1x3): Likewise.
        * config/aarch64/aarch64-simd.md
        (aarch64_ld1x3<VALLDIF:mode>): New pattern.
        (aarch64_ld1_x3_<mode>): Likewise
        (aarch64_st1x2<VALLDIF:mode>): Likewise
        (aarch64_st1_x2_<mode>): Likewise
        (aarch64_st1x3<VALLDIF:mode>): Likewise
        (aarch64_st1_x3_<mode>): Likewise
        * config/aarch64/arm_neon.h (vld1_u8_x3): New function.
        (vld1_s8_x3): Likewise.
        (vld1_u16_x3): Likewise.
        (vld1_s16_x3): Likewise.
        (vld1_u32_x3): Likewise.
        (vld1_s32_x3): Likewise.
        (vld1_u64_x3): Likewise.
        (vld1_s64_x3): Likewise.
        (vld1_f16_x3): Likewise.
        (vld1_f32_x3): Likewise.
        (vld1_f64_x3): Likewise.
        (vld1_p8_x3): Likewise.
        (vld1_p16_x3): Likewise.
        (vld1_p64_x3): Likewise.
        (vld1q_u8_x3): Likewise.
        (vld1q_s8_x3): Likewise.
        (vld1q_u16_x3): Likewise.
        (vld1q_s16_x3): Likewise.
        (vld1q_u32_x3): Likewise.
        (vld1q_s32_x3): Likewise.
        (vld1q_u64_x3): Likewise.
        (vld1q_s64_x3): Likewise.
        (vld1q_f16_x3): Likewise.
        (vld1q_f32_x3): Likewise.
        (vld1q_f64_x3): Likewise.
        (vld1q_p8_x3): Likewise.
        (vld1q_p16_x3): Likewise.
        (vld1q_p64_x3): Likewise.
        (vst1_s64_x2): Likewise.
        (vst1_u64_x2): Likewise.
        (vst1_f64_x2): Likewise.
        (vst1_s8_x2): Likewise.
        (vst1_p8_x2): Likewise.
        (vst1_s16_x2): Likewise.
        (vst1_p16_x2): Likewise.
        (vst1_s32_x2): Likewise.
        (vst1_u8_x2): Likewise.
        (vst1_u16_x2): Likewise.
        (vst1_u32_x2): Likewise.
        (vst1_f16_x2): Likewise.
        (vst1_f32_x2): Likewise.
        (vst1_p64_x2): Likewise.
        (vst1q_s8_x2): Likewise.
        (vst1q_p8_x2): Likewise.
        (vst1q_s16_x2): Likewise.
        (vst1q_p16_x2): Likewise.
        (vst1q_s32_x2): Likewise.
        (vst1q_s64_x2): Likewise.
        (vst1q_u8_x2): Likewise.
        (vst1q_u16_x2): Likewise.
        (vst1q_u32_x2): Likewise.
        (vst1q_u64_x2): Likewise.
        (vst1q_f16_x2): Likewise.
        (vst1q_f32_x2): Likewise.
        (vst1q_f64_x2): Likewise.
        (vst1q_p64_x2): Likewise.
        (vst1_s64_x3): Likewise.
        (vst1_u64_x3): Likewise.
        (vst1_f64_x3): Likewise.
        (vst1_s8_x3): Likewise.
        (vst1_p8_x3): Likewise.
        (vst1_s16_x3): Likewise.
        (vst1_p16_x3): Likewise.
        (vst1_s32_x3): Likewise.
        (vst1_u8_x3): Likewise.
        (vst1_u16_x3): Likewise.
        (vst1_u32_x3): Likewise.
        (vst1_f16_x3): Likewise.
        (vst1_f32_x3): Likewise.
        (vst1_p64_x3): Likewise.
        (vst1q_s8_x3): Likewise.
        (vst1q_p8_x3): Likewise.
        (vst1q_s16_x3): Likewise.
        (vst1q_p16_x3): Likewise.
        (vst1q_s32_x3): Likewise.
        (vst1q_s64_x3): Likewise.
        (vst1q_u8_x3): Likewise.
        (vst1q_u16_x3): Likewise.
        (vst1q_u32_x3): Likewise.
        (vst1q_u64_x3): Likewise.
        (vst1q_f16_x3): Likewise.
        (vst1q_f32_x3): Likewise.
        (vst1q_f64_x3): Likewise.
        (vst1q_p64_x3): Likewise.

gcc/testsuite/Changelog:

2018-05-01  Sameera Deshpande  <sameera.deshpande@linaro.org>

        * gcc.target/aarch64/advsimd-intrinsics/vld1x3.c: New test for
vld1x3 intrinsics for aarch64.
        * gcc.target/aarch64/advsimd-intrinsics/vst1x2.c: New test for
vst1x2 intrinsics for aarch64.
        * gcc.target/aarch64/advsimd-intrinsics/vst1x3.c: New test for
vst1x3 intrinsics for aarch64.
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index b383f2485e5..2fd072a5896 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -445,6 +445,15 @@
   BUILTIN_VALL_F16 (STORE1, st1, 0)
   VAR1(STORE1P, st1, 0, v2di)
 
+  /* Implemented by aarch64_ld1x3<VALLDIF:mode>.  */
+  BUILTIN_VALLDIF (LOADSTRUCT, ld1x3, 0)
+
+  /* Implemented by aarch64_st1x2<VALLDIF:mode>.  */
+  BUILTIN_VALLDIF (STORESTRUCT, st1x2, 0)
+
+  /* Implemented by aarch64_st1x3<VALLDIF:mode>.  */
+  BUILTIN_VALLDIF (STORESTRUCT, st1x3, 0)
+
   /* Implemented by fma<mode>4.  */
   BUILTIN_VHSDF (TERNOP, fma, 4)
   VAR1 (TERNOP, fma, 4, hf)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 1154fc3d58d..c98d7f23354 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -5062,6 +5062,70 @@
     }
 })
 
+
+(define_expand "aarch64_ld1x3<VALLDIF:mode>"
+  [(match_operand:CI 0 "register_operand" "=w")
+   (match_operand:DI 1 "register_operand" "r")
+   (unspec:VALLDIF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+  "TARGET_SIMD"
+{
+  rtx mem = gen_rtx_MEM (CImode, operands[1]);
+  emit_insn (gen_aarch64_ld1_x3_<VALLDIF:mode> (operands[0], mem));
+  DONE;
+})
+
+(define_insn "aarch64_ld1_x3_<mode>"
+  [(set (match_operand:CI 0 "register_operand" "=w")
+        (unspec:CI
+	  [(match_operand:CI 1 "aarch64_simd_struct_operand" "Utv")
+	   (unspec:VALLDIF [(const_int 3)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_LD1))]
+  "TARGET_SIMD"
+  "ld1\\t{%S0.<Vtype> - %U0.<Vtype>}, %1"
+  [(set_attr "type" "neon_load1_3reg<q>")]
+)
+
+(define_expand "aarch64_st1x2<VALLDIF:mode>"
+  [(match_operand:DI 0 "register_operand" "")
+   (match_operand:OI 1 "register_operand" "")
+   (unspec:VALLDIF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+  "TARGET_SIMD"
+{
+  rtx mem = gen_rtx_MEM (OImode, operands[0]);
+  emit_insn (gen_aarch64_st1_x2_<VALLDIF:mode> (mem, operands[1]));
+  DONE;
+})
+
+(define_insn "aarch64_st1_x2_<mode>"
+   [(set (match_operand:OI 0 "aarch64_simd_struct_operand" "=Utv")
+	 (unspec:OI
+	  [(match_operand:OI 1 "register_operand" "w")
+          (unspec:VALLDIF [(const_int 2)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_ST1))]
+  "TARGET_SIMD"
+  "st1\\t{%S1.<Vtype> - %T1.<Vtype>}, %0"
+  [(set_attr "type" "neon_store1_2reg<q>")]
+)
+
+(define_expand "aarch64_st1x3<VALLDIF:mode>"
+  [(match_operand:DI 0 "register_operand" "")
+   (match_operand:CI 1 "register_operand" "")
+   (unspec:VALLDIF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+  "TARGET_SIMD"
+{
+  rtx mem = gen_rtx_MEM (CImode, operands[0]);
+  emit_insn (gen_aarch64_st1_x3_<VALLDIF:mode> (mem, operands[1]));
+  DONE;
+})
+
+(define_insn "aarch64_st1_x3_<mode>"
+   [(set (match_operand:CI 0 "aarch64_simd_struct_operand" "=Utv")
+	(unspec:CI
+         [(match_operand:CI 1 "register_operand" "w")
+	  (unspec:VALLDIF [(const_int 3)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_ST1))]
+  "TARGET_SIMD"
+  "st1\\t{%S1.<Vtype> - %U1.<Vtype>}, %0"
+  [(set_attr "type" "neon_store1_3reg<q>")]
+)
+
 (define_insn "*aarch64_mov<mode>"
   [(set (match_operand:VSTRUCT 0 "aarch64_simd_nonimmediate_operand" "=w,Utv,w")
 	(match_operand:VSTRUCT 1 "aarch64_simd_general_operand" " w,w,Utv"))]
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index c45c29ae815..8d318977855 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -17145,6 +17145,374 @@ vld1_u64 (const uint64_t *a)
   return (uint64x1_t) {*a};
 }
 
+/* vld1x3  */
+
+__extension__ extern __inline uint8x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_u8_x3 (const uint8_t *__a)
+{
+  uint8x8x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = (__builtin_aarch64_simd_ci)__builtin_aarch64_ld1x3v8qi ((const __builtin_aarch64_simd_qi *) __a);
+  __i.val[0] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi  (__o, 0);
+  __i.val[1] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi  (__o, 1);
+  __i.val[2] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline int8x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_s8_x3 (const uint8_t *__a)
+{
+  int8x8x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v8qi ((const __builtin_aarch64_simd_qi *) __a);
+  __i.val[0] = (int8x8_t) __builtin_aarch64_get_dregciv8qi  (__o, 0);
+  __i.val[1] = (int8x8_t) __builtin_aarch64_get_dregciv8qi  (__o, 1);
+  __i.val[2] = (int8x8_t) __builtin_aarch64_get_dregciv8qi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline uint16x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_u16_x3 (const uint16_t *__a)
+{
+  uint16x4x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v4hi ((const __builtin_aarch64_simd_hi *) __a);
+  __i.val[0] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi  (__o, 0);
+  __i.val[1] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi  (__o, 1);
+  __i.val[2] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline int16x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_s16_x3 (const int16_t *__a)
+{
+  int16x4x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v4hi ((const __builtin_aarch64_simd_hi *) __a);
+  __i.val[0] = (int16x4_t) __builtin_aarch64_get_dregciv4hi  (__o, 0);
+  __i.val[1] = (int16x4_t) __builtin_aarch64_get_dregciv4hi  (__o, 1);
+  __i.val[2] = (int16x4_t) __builtin_aarch64_get_dregciv4hi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline uint32x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_u32_x3 (const uint32_t *__a)
+{
+  uint32x2x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v2si ((const __builtin_aarch64_simd_si *) __a);
+  __i.val[0] = (uint32x2_t) __builtin_aarch64_get_dregciv2si  (__o, 0);
+  __i.val[1] = (uint32x2_t) __builtin_aarch64_get_dregciv2si  (__o, 1);
+  __i.val[2] = (uint32x2_t) __builtin_aarch64_get_dregciv2si  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline int32x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_s32_x3 (const uint32_t *__a)
+{
+  int32x2x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v2si ((const __builtin_aarch64_simd_si *) __a);
+  __i.val[0] = (int32x2_t) __builtin_aarch64_get_dregciv2si  (__o, 0);
+  __i.val[1] = (int32x2_t) __builtin_aarch64_get_dregciv2si  (__o, 1);
+  __i.val[2] = (int32x2_t) __builtin_aarch64_get_dregciv2si  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline uint64x1x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_u64_x3 (const uint64_t *__a)
+{
+  uint64x1x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3di ((const __builtin_aarch64_simd_di *) __a);
+  __i.val[0] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 0);
+  __i.val[1] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 1);
+  __i.val[2] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline int64x1x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_s64_x3 (const int64_t *__a)
+{
+  int64x1x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3di ((const __builtin_aarch64_simd_di *) __a);
+  __i.val[0] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 0);
+  __i.val[1] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 1);
+  __i.val[2] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 2);
+
+  return __i;
+}
+
+__extension__ extern __inline float16x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_f16_x3 (const float16_t *__a)
+{
+  float16x4x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v4hf ((const __builtin_aarch64_simd_hf *) __a);
+  __i.val[0] = (float16x4_t) __builtin_aarch64_get_dregciv4hf  (__o, 0);
+  __i.val[1] = (float16x4_t) __builtin_aarch64_get_dregciv4hf  (__o, 1);
+  __i.val[2] = (float16x4_t) __builtin_aarch64_get_dregciv4hf  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline float32x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_f32_x3 (const float32_t *__a)
+{
+  float32x2x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v2sf ((const __builtin_aarch64_simd_sf *) __a);
+  __i.val[0] = (float32x2_t) __builtin_aarch64_get_dregciv2sf  (__o, 0);
+  __i.val[1] = (float32x2_t) __builtin_aarch64_get_dregciv2sf  (__o, 1);
+  __i.val[2] = (float32x2_t) __builtin_aarch64_get_dregciv2sf  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline float64x1x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_f64_x3 (const float64_t *__a)
+{
+  float64x1x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3df ((const __builtin_aarch64_simd_df *) __a);
+  __i.val[0] = (float64x1_t) __builtin_aarch64_get_dregcidi (__o, 0);
+  __i.val[1] = (float64x1_t) __builtin_aarch64_get_dregcidi (__o, 1);
+  __i.val[2] = (float64x1_t) __builtin_aarch64_get_dregcidi (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline poly8x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_p8_x3 (const poly8_t *__a)
+{
+  poly8x8x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v8qi ((const __builtin_aarch64_simd_qi *) __a);
+  __i.val[0] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi  (__o, 0);
+  __i.val[1] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi  (__o, 1);
+  __i.val[2] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline poly16x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_p16_x3 (const poly16_t *__a)
+{
+  poly16x4x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v4hi ((const __builtin_aarch64_simd_hi *) __a);
+  __i.val[0] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi  (__o, 0);
+  __i.val[1] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi  (__o, 1);
+  __i.val[2] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline poly64x1x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_p64_x3 (const poly64_t *__a)
+{
+  poly64x1x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3di ((const __builtin_aarch64_simd_di *) __a);
+  __i.val[0] = (poly64x1_t) __builtin_aarch64_get_dregcidi (__o, 0);
+  __i.val[1] = (poly64x1_t) __builtin_aarch64_get_dregcidi (__o, 1);
+  __i.val[2] = (poly64x1_t) __builtin_aarch64_get_dregcidi (__o, 2);
+
+return __i;
+}
+
+__extension__ extern __inline uint8x16x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_u8_x3 (const uint8_t *__a)
+{
+  uint8x16x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v16qi ((const __builtin_aarch64_simd_qi *) __a);
+  __i.val[0] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi  (__o, 0);
+  __i.val[1] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi  (__o, 1);
+  __i.val[2] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline int8x16x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_s8_x3 (const int8_t *__a)
+{
+  int8x16x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v16qi ((const __builtin_aarch64_simd_qi *) __a);
+  __i.val[0] = (int8x16_t) __builtin_aarch64_get_qregciv16qi  (__o, 0);
+  __i.val[1] = (int8x16_t) __builtin_aarch64_get_qregciv16qi  (__o, 1);
+  __i.val[2] = (int8x16_t) __builtin_aarch64_get_qregciv16qi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline uint16x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_u16_x3 (const uint16_t *__a)
+{
+  uint16x8x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v8hi ((const __builtin_aarch64_simd_hi *) __a);
+  __i.val[0] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi  (__o, 0);
+  __i.val[1] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi  (__o, 1);
+  __i.val[2] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline int16x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_s16_x3 (const int16_t *__a)
+{
+  int16x8x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v8hi ((const __builtin_aarch64_simd_hi *) __a);
+  __i.val[0] = (int16x8_t) __builtin_aarch64_get_qregciv8hi  (__o, 0);
+  __i.val[1] = (int16x8_t) __builtin_aarch64_get_qregciv8hi  (__o, 1);
+  __i.val[2] = (int16x8_t) __builtin_aarch64_get_qregciv8hi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline uint32x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_u32_x3 (const uint32_t *__a)
+{
+  uint32x4x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v4si ((const __builtin_aarch64_simd_si *) __a);
+  __i.val[0] = (uint32x4_t) __builtin_aarch64_get_qregciv4si  (__o, 0);
+  __i.val[1] = (uint32x4_t) __builtin_aarch64_get_qregciv4si  (__o, 1);
+  __i.val[2] = (uint32x4_t) __builtin_aarch64_get_qregciv4si  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline int32x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_s32_x3 (const int32_t *__a)
+{
+  int32x4x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v4si ((const __builtin_aarch64_simd_si *) __a);
+  __i.val[0] = (int32x4_t) __builtin_aarch64_get_qregciv4si  (__o, 0);
+  __i.val[1] = (int32x4_t) __builtin_aarch64_get_qregciv4si  (__o, 1);
+  __i.val[2] = (int32x4_t) __builtin_aarch64_get_qregciv4si  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline uint64x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_u64_x3 (const uint64_t *__a)
+{
+  uint64x2x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v2di ((const __builtin_aarch64_simd_di *) __a);
+  __i.val[0] = (uint64x2_t) __builtin_aarch64_get_qregciv2di  (__o, 0);
+  __i.val[1] = (uint64x2_t) __builtin_aarch64_get_qregciv2di  (__o, 1);
+  __i.val[2] = (uint64x2_t) __builtin_aarch64_get_qregciv2di  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline int64x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_s64_x3 (const int64_t *__a)
+{
+  int64x2x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v2di ((const __builtin_aarch64_simd_di *) __a);
+  __i.val[0] = (int64x2_t) __builtin_aarch64_get_qregciv2di  (__o, 0);
+  __i.val[1] = (int64x2_t) __builtin_aarch64_get_qregciv2di  (__o, 1);
+  __i.val[2] = (int64x2_t) __builtin_aarch64_get_qregciv2di  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline float16x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_f16_x3 (const float16_t *__a)
+{
+  float16x8x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v8hf ((const __builtin_aarch64_simd_hf *) __a);
+  __i.val[0] = (float16x8_t) __builtin_aarch64_get_qregciv8hf  (__o, 0);
+  __i.val[1] = (float16x8_t) __builtin_aarch64_get_qregciv8hf  (__o, 1);
+  __i.val[2] = (float16x8_t) __builtin_aarch64_get_qregciv8hf  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline float32x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_f32_x3 (const float32_t *__a)
+{
+  float32x4x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v4sf ((const __builtin_aarch64_simd_sf *) __a);
+  __i.val[0] = (float32x4_t) __builtin_aarch64_get_qregciv4sf  (__o, 0);
+  __i.val[1] = (float32x4_t) __builtin_aarch64_get_qregciv4sf  (__o, 1);
+  __i.val[2] = (float32x4_t) __builtin_aarch64_get_qregciv4sf  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline float64x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_f64_x3 (const float64_t *__a)
+{
+  float64x2x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v2df ((const __builtin_aarch64_simd_df *) __a);
+  __i.val[0] = (float64x2_t) __builtin_aarch64_get_qregciv2df  (__o, 0);
+  __i.val[1] = (float64x2_t) __builtin_aarch64_get_qregciv2df  (__o, 1);
+  __i.val[2] = (float64x2_t) __builtin_aarch64_get_qregciv2df  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline poly8x16x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_p8_x3 (const poly8_t *__a)
+{
+  poly8x16x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v16qi ((const __builtin_aarch64_simd_qi *) __a);
+  __i.val[0] = (poly8x16_t) __builtin_aarch64_get_qregciv16qi  (__o, 0);
+  __i.val[1] = (poly8x16_t) __builtin_aarch64_get_qregciv16qi  (__o, 1);
+  __i.val[2] = (poly8x16_t) __builtin_aarch64_get_qregciv16qi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline poly16x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_p16_x3 (const poly16_t *__a)
+{
+  poly16x8x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v8hi ((const __builtin_aarch64_simd_hi *) __a);
+  __i.val[0] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi  (__o, 0);
+  __i.val[1] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi  (__o, 1);
+  __i.val[2] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline poly64x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_p64_x3 (const poly64_t *__a)
+{
+  poly64x2x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v2di ((const __builtin_aarch64_simd_di *) __a);
+  __i.val[0] = (poly64x2_t) __builtin_aarch64_get_qregciv2di  (__o, 0);
+  __i.val[1] = (poly64x2_t) __builtin_aarch64_get_qregciv2di  (__o, 1);
+  __i.val[2] = (poly64x2_t) __builtin_aarch64_get_qregciv2di  (__o, 2);
+  return __i;
+}
+
 /* vld1q */
 
 __extension__ extern __inline float16x8_t
@@ -27497,6 +27865,706 @@ vst1q_lane_u64 (uint64_t *__a, uint64x2_t __b, const int __lane)
   *__a = __aarch64_vget_lane_any (__b, __lane);
 }
 
+/* vst1x2 */
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s64_x2 (int64_t * __a, int64x1x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  int64x2x2_t temp;
+  temp.val[0] = vcombine_s64 (val.val[0], vcreate_s64 (__AARCH64_INT64_C (0)));
+  temp.val[1] = vcombine_s64 (val.val[1], vcreate_s64 (__AARCH64_INT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u64_x2 (uint64_t * __a, uint64x1x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  uint64x2x2_t temp;
+  temp.val[0] = vcombine_u64 (val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_u64 (val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_f64_x2 (float64_t * __a, float64x1x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  float64x2x2_t temp;
+  temp.val[0] = vcombine_f64 (val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_f64 (val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2df ((__builtin_aarch64_simd_df *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s8_x2 (int8_t * __a, int8x8x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  int8x16x2_t temp;
+  temp.val[0] = vcombine_s8 (val.val[0], vcreate_s8 (__AARCH64_INT64_C (0)));
+  temp.val[1] = vcombine_s8 (val.val[1], vcreate_s8 (__AARCH64_INT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_p8_x2 (poly8_t * __a, poly8x8x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  poly8x16x2_t temp;
+  temp.val[0] = vcombine_p8 (val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_p8 (val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s16_x2 (int16_t * __a, int16x4x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  int16x8x2_t temp;
+  temp.val[0] = vcombine_s16 (val.val[0], vcreate_s16 (__AARCH64_INT64_C (0)));
+  temp.val[1] = vcombine_s16 (val.val[1], vcreate_s16 (__AARCH64_INT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_p16_x2 (poly16_t * __a, poly16x4x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  poly16x8x2_t temp;
+  temp.val[0] = vcombine_p16 (val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_p16 (val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s32_x2 (int32_t * __a, int32x2x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  int32x4x2_t temp;
+  temp.val[0] = vcombine_s32 (val.val[0], vcreate_s32 (__AARCH64_INT64_C (0)));
+  temp.val[1] = vcombine_s32 (val.val[1], vcreate_s32 (__AARCH64_INT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2v2si ((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u8_x2 (uint8_t * __a, uint8x8x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  uint8x16x2_t temp;
+  temp.val[0] = vcombine_u8 (val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_u8 (val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u16_x2 (uint16_t * __a, uint16x4x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  uint16x8x2_t temp;
+  temp.val[0] = vcombine_u16 (val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_u16 (val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u32_x2 (uint32_t * __a, uint32x2x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  uint32x4x2_t temp;
+  temp.val[0] = vcombine_u32 (val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_u32 (val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2v2si ((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_f16_x2 (float16_t * __a, float16x4x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  float16x8x2_t temp;
+  temp.val[0] = vcombine_f16 (val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_f16 (val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv8hf (__o, temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv8hf (__o, temp.val[1], 1);
+  __builtin_aarch64_st1x2v4hf (__a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_f32_x2 (float32_t * __a, float32x2x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  float32x4x2_t temp;
+  temp.val[0] = vcombine_f32 (val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_f32 (val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2v2sf ((__builtin_aarch64_simd_sf *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_p64_x2 (poly64_t * __a, poly64x1x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  poly64x2x2_t temp;
+  temp.val[0] = vcombine_p64 (val.val[0], vcreate_p64 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_p64 (val.val[1], vcreate_p64 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv2di_ssps (__o,
+					       (poly64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2di_ssps (__o,
+					       (poly64x2_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s8_x2 (int8_t * __a, int8x16x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_p8_x2 (poly8_t * __a, poly8x16x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s16_x2 (int16_t * __a, int16x8x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_p16_x2 (poly16_t * __a, poly16x8x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s32_x2 (int32_t * __a, int32x4x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v4si ((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s64_x2 (int64_t * __a, int64x2x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u8_x2 (uint8_t * __a, uint8x16x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u16_x2 (uint16_t * __a, uint16x8x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u32_x2 (uint32_t * __a, uint32x4x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v4si ((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u64_x2 (uint64_t * __a, uint64x2x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_f16_x2 (float16_t * __a, float16x8x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv8hf (__o, val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv8hf (__o, val.val[1], 1);
+  __builtin_aarch64_st1x2v8hf (__a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_f32_x2 (float32_t * __a, float32x4x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v4sf ((__builtin_aarch64_simd_sf *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_f64_x2 (float64_t * __a, float64x2x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v2df ((__builtin_aarch64_simd_df *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_p64_x2 (poly64_t * __a, poly64x2x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv2di_ssps (__o,
+					       (poly64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2di_ssps (__o,
+					       (poly64x2_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+/* vst1x3 */
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s64_x3 (int64_t * __a, int64x1x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  int64x2x3_t temp;
+  temp.val[0] = vcombine_s64 (val.val[0], vcreate_s64 (__AARCH64_INT64_C (0)));
+  temp.val[1] = vcombine_s64 (val.val[1], vcreate_s64 (__AARCH64_INT64_C (0)));
+  temp.val[2] = vcombine_s64 (val.val[2], vcreate_s64 (__AARCH64_INT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u64_x3 (uint64_t * __a, uint64x1x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  uint64x2x3_t temp;
+  temp.val[0] = vcombine_u64 (val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_u64 (val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_u64 (val.val[2], vcreate_u64 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_f64_x3 (float64_t * __a, float64x1x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  float64x2x3_t temp;
+  temp.val[0] = vcombine_f64 (val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_f64 (val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_f64 (val.val[2], vcreate_f64 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3df ((__builtin_aarch64_simd_df *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s8_x3 (int8_t * __a, int8x8x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  int8x16x3_t temp;
+  temp.val[0] = vcombine_s8 (val.val[0], vcreate_s8 (__AARCH64_INT64_C (0)));
+  temp.val[1] = vcombine_s8 (val.val[1], vcreate_s8 (__AARCH64_INT64_C (0)));
+  temp.val[2] = vcombine_s8 (val.val[2], vcreate_s8 (__AARCH64_INT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_p8_x3 (poly8_t * __a, poly8x8x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  poly8x16x3_t temp;
+  temp.val[0] = vcombine_p8 (val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_p8 (val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_p8 (val.val[2], vcreate_p8 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s16_x3 (int16_t * __a, int16x4x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  int16x8x3_t temp;
+  temp.val[0] = vcombine_s16 (val.val[0], vcreate_s16 (__AARCH64_INT64_C (0)));
+  temp.val[1] = vcombine_s16 (val.val[1], vcreate_s16 (__AARCH64_INT64_C (0)));
+  temp.val[2] = vcombine_s16 (val.val[2], vcreate_s16 (__AARCH64_INT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_p16_x3 (poly16_t * __a, poly16x4x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  poly16x8x3_t temp;
+  temp.val[0] = vcombine_p16 (val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_p16 (val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_p16 (val.val[2], vcreate_p16 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s32_x3 (int32_t * __a, int32x2x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  int32x4x3_t temp;
+  temp.val[0] = vcombine_s32 (val.val[0], vcreate_s32 (__AARCH64_INT64_C (0)));
+  temp.val[1] = vcombine_s32 (val.val[1], vcreate_s32 (__AARCH64_INT64_C (0)));
+  temp.val[2] = vcombine_s32 (val.val[2], vcreate_s32 (__AARCH64_INT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3v2si ((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u8_x3 (uint8_t * __a, uint8x8x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  uint8x16x3_t temp;
+  temp.val[0] = vcombine_u8 (val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_u8 (val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_u8 (val.val[2], vcreate_u8 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u16_x3 (uint16_t * __a, uint16x4x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  uint16x8x3_t temp;
+  temp.val[0] = vcombine_u16 (val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_u16 (val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_u16 (val.val[2], vcreate_u16 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u32_x3 (uint32_t * __a, uint32x2x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  uint32x4x3_t temp;
+  temp.val[0] = vcombine_u32 (val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_u32 (val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_u32 (val.val[2], vcreate_u32 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3v2si ((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_f16_x3 (float16_t * __a, float16x4x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  float16x8x3_t temp;
+  temp.val[0] = vcombine_f16 (val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_f16 (val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_f16 (val.val[2], vcreate_f16 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3v4hf ((__builtin_aarch64_simd_hf *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_f32_x3 (float32_t * __a, float32x2x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  float32x4x3_t temp;
+  temp.val[0] = vcombine_f32 (val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_f32 (val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_f32 (val.val[2], vcreate_f32 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3v2sf ((__builtin_aarch64_simd_sf *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_p64_x3 (poly64_t * __a, poly64x1x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  poly64x2x3_t temp;
+  temp.val[0] = vcombine_p64 (val.val[0], vcreate_p64 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_p64 (val.val[1], vcreate_p64 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_p64 (val.val[2], vcreate_p64 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
+					       (poly64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
+					       (poly64x2_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
+					       (poly64x2_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s8_x3 (int8_t * __a, int8x16x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_p8_x3 (poly8_t * __a, poly8x16x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s16_x3 (int16_t * __a, int16x8x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_p16_x3 (poly16_t * __a, poly16x8x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s32_x3 (int32_t * __a, int32x4x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v4si ((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s64_x3 (int64_t * __a, int64x2x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u8_x3 (uint8_t * __a, uint8x16x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u16_x3 (uint16_t * __a, uint16x8x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u32_x3 (uint32_t * __a, uint32x4x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v4si ((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u64_x3 (uint64_t * __a, uint64x2x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_f16_x3 (float16_t * __a, float16x8x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v8hf ((__builtin_aarch64_simd_hf *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_f32_x3 (float32_t * __a, float32x4x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v4sf ((__builtin_aarch64_simd_sf *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_f64_x3 (float64_t * __a, float64x2x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v2df ((__builtin_aarch64_simd_df *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_p64_x3 (poly64_t * __a, poly64x2x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
+					       (poly64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
+					       (poly64x2_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
+					       (poly64x2_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
 /* vstn */
 
 __extension__ extern __inline void
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1x3.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1x3.c
new file mode 100644
index 00000000000..6ddd507d9cf
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1x3.c
@@ -0,0 +1,82 @@
+/* We haven't implemented these intrinsics for arm yet.  */
+/* { dg-xfail-if "" { arm*-*-* } } */
+/* { dg-do run } */
+/* { dg-options "-O3" } */
+
+#include <arm_neon.h>
+#include "arm-neon-ref.h"
+
+extern void abort (void);
+
+#define TESTMETH(BASE, ELTS, SUFFIX)	\
+int __attribute__ ((noinline))			\
+test_vld##SUFFIX##_x3 ()			\
+{						\
+  BASE##_t data[ELTS * 3];			\
+  BASE##_t temp[ELTS * 3];			\
+  BASE##x##ELTS##x##3##_t vectors;		\
+  int i,j;					\
+  for (i = 0; i < ELTS * 3; i++)		\
+    data [i] = (BASE##_t) 3*i;		\
+  asm volatile ("" : : : "memory");		\
+  vectors = vld1##SUFFIX##_x3 (data);		\
+  vst1##SUFFIX (temp, vectors.val[0]);		\
+  vst1##SUFFIX (&temp[ELTS], vectors.val[1]);	\
+  vst1##SUFFIX (&temp[ELTS * 2], vectors.val[2]);	\
+  asm volatile ("" : : : "memory");		\
+  for (j = 0; j < ELTS * 3; j++)		\
+    if (temp[j] != data[j]) 			\
+      return 1;					\
+  return 0;					\
+}
+
+#define VARIANTS_1(VARIANT)	\
+VARIANT (uint8, 8, _u8)		\
+VARIANT (uint16, 4, _u16)	\
+VARIANT (uint32, 2, _u32)	\
+VARIANT (uint64, 1, _u64)	\
+VARIANT (int8, 8, _s8)		\
+VARIANT (int16, 4, _s16)	\
+VARIANT (int32, 2, _s32)	\
+VARIANT (int64, 1, _s64)	\
+VARIANT (poly8, 8, _p8)		\
+VARIANT (poly16, 4, _p16)	\
+VARIANT (float16, 4, _f16)	\
+VARIANT (float32, 2, _f32)	\
+VARIANT (uint8, 16, q_u8)	\
+VARIANT (uint16, 8, q_u16)	\
+VARIANT (uint32, 4, q_u32)	\
+VARIANT (uint64, 2, q_u64)	\
+VARIANT (int8, 16, q_s8)	\
+VARIANT (int16, 8, q_s16)	\
+VARIANT (int32, 4, q_s32)	\
+VARIANT (int64, 2, q_s64)	\
+VARIANT (poly8, 16, q_p8)	\
+VARIANT (poly16, 8, q_p16)	\
+VARIANT (float16, 8, q_f16)	\
+VARIANT (float32, 4, q_f32)
+
+#ifdef __aarch64__
+#define VARIANTS(VARIANT) VARIANTS_1(VARIANT)	\
+VARIANT (float64, 1, _f64)			\
+VARIANT (float64, 2, q_f64)
+#else
+#define VARIANTS(VARIANT) VARIANTS_1(VARIANT)
+#endif
+
+
+/* Tests of vld1_x3 and vld1q_x3.  */
+VARIANTS (TESTMETH)
+
+#define CHECKS(BASE, ELTS, SUFFIX)	\
+  if (test_vld##SUFFIX##_x3 () != 0)	\
+    fprintf (stderr, "test_vld1##SUFFIX##_x3");
+
+int
+main (int argc, char **argv)
+{
+  VARIANTS (CHECKS)
+
+  return 0;
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1x2.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1x2.c
new file mode 100644
index 00000000000..cb13da0caed
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1x2.c
@@ -0,0 +1,80 @@
+/* We haven't implemented these intrinsics for arm yet.  */
+/* { dg-xfail-if "" { arm*-*-* } } */
+/* { dg-do run } */
+/* { dg-options "-O3" } */
+
+#include <arm_neon.h>
+#include "arm-neon-ref.h"
+
+extern void abort (void);
+
+#define TESTMETH(BASE, ELTS, SUFFIX)	\
+int __attribute__ ((noinline))			\
+test_vst1##SUFFIX##_x2 ()			\
+{						\
+  BASE##_t data[ELTS * 2];			\
+  BASE##_t temp[ELTS * 2];			\
+  BASE##x##ELTS##x##2##_t vectors;		\
+  int i,j;					\
+  for (i = 0; i < ELTS * 2; i++)		\
+    data [i] = (BASE##_t) 2*i;		\
+  asm volatile ("" : : : "memory");		\
+  vectors.val[0] = vld1##SUFFIX (data);		\
+  vectors.val[1] = vld1##SUFFIX (&data[ELTS]);	\
+  vst1##SUFFIX##_x2 (temp, vectors);		\
+  asm volatile ("" : : : "memory");		\
+  for (j = 0; j < ELTS * 2; j++)		\
+    if (temp[j] != data[j])			\
+      return 1;					\
+  return 0;					\
+}
+
+#define VARIANTS_1(VARIANT)	\
+VARIANT (uint8, 8, _u8)		\
+VARIANT (uint16, 4, _u16)	\
+VARIANT (uint32, 2, _u32)	\
+VARIANT (uint64, 1, _u64)	\
+VARIANT (int8, 8, _s8)		\
+VARIANT (int16, 4, _s16)	\
+VARIANT (int32, 2, _s32)	\
+VARIANT (int64, 1, _s64)	\
+VARIANT (poly8, 8, _p8)		\
+VARIANT (poly16, 4, _p16)	\
+VARIANT (float16, 4, _f16)	\
+VARIANT (float32, 2, _f32)	\
+VARIANT (uint8, 16, q_u8)	\
+VARIANT (uint16, 8, q_u16)	\
+VARIANT (uint32, 4, q_u32)	\
+VARIANT (uint64, 2, q_u64)	\
+VARIANT (int8, 16, q_s8)	\
+VARIANT (int16, 8, q_s16)	\
+VARIANT (int32, 4, q_s32)	\
+VARIANT (int64, 2, q_s64)	\
+VARIANT (poly8, 16, q_p8)	\
+VARIANT (poly16, 8, q_p16)	\
+VARIANT (float16, 8, q_f16)	\
+VARIANT (float32, 4, q_f32)
+
+#ifdef __aarch64__
+#define VARIANTS(VARIANT) VARIANTS_1(VARIANT)	\
+VARIANT (float64, 1, _f64)			\
+VARIANT (float64, 2, q_f64)
+#else
+#define VARIANTS(VARIANT) VARIANTS_1(VARIANT)
+#endif
+
+/* Tests of vst1_x2 and vst1q_x2.  */
+VARIANTS (TESTMETH)
+
+#define CHECKS(BASE, ELTS, SUFFIX)	\
+  if (test_vst1##SUFFIX##_x2 () != 0)	\
+    fprintf (stderr, "test_vst1##SUFFIX##_x2");
+
+int
+main (int argc, char **argv)
+{
+  VARIANTS (CHECKS)
+
+  return 0;
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1x3.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1x3.c
new file mode 100644
index 00000000000..3ce272a5007
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1x3.c
@@ -0,0 +1,81 @@
+/* We haven't implemented these intrinsics for arm yet.  */
+/* { dg-xfail-if "" { arm*-*-* } } */
+/* { dg-do run } */
+/* { dg-options "-O3" } */
+
+#include <arm_neon.h>
+#include "arm-neon-ref.h"
+
+extern void abort (void);
+
+#define TESTMETH(BASE, ELTS, SUFFIX)	\
+int __attribute__ ((noinline))			\
+test_vst1##SUFFIX##_x3 ()			\
+{						\
+  BASE##_t data[ELTS * 3];			\
+  BASE##_t temp[ELTS * 3];			\
+  BASE##x##ELTS##x##3##_t vectors;		\
+  int i,j;					\
+  for (i = 0; i < ELTS * 3; i++)		\
+    data [i] = (BASE##_t) 3*i;		\
+  asm volatile ("" : : : "memory");		\
+  vectors.val[0] = vld1##SUFFIX (data);		\
+  vectors.val[1] = vld1##SUFFIX (&data[ELTS]);	\
+  vectors.val[2] = vld1##SUFFIX (&data[ELTS * 2]);	\
+  vst1##SUFFIX##_x3 (temp, vectors);		\
+  asm volatile ("" : : : "memory");		\
+  for (j = 0; j < ELTS * 3; j++)		\
+    if (temp[j] != data[j])			\
+      return 1;					\
+  return 0;					\
+}
+
+#define VARIANTS_1(VARIANT)	\
+VARIANT (uint8, 8, _u8)		\
+VARIANT (uint16, 4, _u16)	\
+VARIANT (uint32, 2, _u32)	\
+VARIANT (uint64, 1, _u64)	\
+VARIANT (int8, 8, _s8)		\
+VARIANT (int16, 4, _s16)	\
+VARIANT (int32, 2, _s32)	\
+VARIANT (int64, 1, _s64)	\
+VARIANT (poly8, 8, _p8)		\
+VARIANT (poly16, 4, _p16)	\
+VARIANT (float16, 4, _f16)	\
+VARIANT (float32, 2, _f32)	\
+VARIANT (uint8, 16, q_u8)	\
+VARIANT (uint16, 8, q_u16)	\
+VARIANT (uint32, 4, q_u32)	\
+VARIANT (uint64, 2, q_u64)	\
+VARIANT (int8, 16, q_s8)	\
+VARIANT (int16, 8, q_s16)	\
+VARIANT (int32, 4, q_s32)	\
+VARIANT (int64, 2, q_s64)	\
+VARIANT (poly8, 16, q_p8)	\
+VARIANT (poly16, 8, q_p16)	\
+VARIANT (float16, 8, q_f16)	\
+VARIANT (float32, 4, q_f32)
+
+#ifdef __aarch64__
+#define VARIANTS(VARIANT) VARIANTS_1(VARIANT)	\
+VARIANT (float64, 1, _f64)			\
+VARIANT (float64, 2, q_f64)
+#else
+#define VARIANTS(VARIANT) VARIANTS_1(VARIANT)
+#endif
+
+/* Tests of vst1_x3 and vst1q_x3.  */
+VARIANTS (TESTMETH)
+
+#define CHECKS(BASE, ELTS, SUFFIX)	\
+  if (test_vst1##SUFFIX##_x3 () != 0)	\
+    fprintf (stderr, "test_vst1##SUFFIX##_x3");
+
+int
+main (int argc, char **argv)
+{
+  VARIANTS (CHECKS)
+
+  return 0;
+}
+
Sameera Deshpande May 8, 2018, 10:38 a.m. | #14
On 1 May 2018 at 05:05, Sameera Deshpande <sameera.deshpande@linaro.org> wrote:
> On 13 April 2018 at 20:21, James Greenhalgh <james.greenhalgh@arm.com> wrote:

>> On Fri, Apr 13, 2018 at 03:39:32PM +0100, Sameera Deshpande wrote:

>>> On Fri 13 Apr, 2018, 8:04 PM James Greenhalgh, <james.greenhalgh@arm.com<mailto:james.greenhalgh@arm.com>> wrote:

>>> On Fri, Apr 06, 2018 at 08:55:47PM +0100, Christophe Lyon wrote:

>>> > Hi,

>>> >

>>> > 2018-04-06 12:15 GMT+02:00 Sameera Deshpande <sameera.deshpande@linaro.org<mailto:sameera.deshpande@linaro.org>>:

>>> > > Hi Christophe,

>>> > >

>>> > > Please find attached the updated patch with testcases.

>>> > >

>>> > > Ok for trunk?

>>> >

>>> > Thanks for the update.

>>> >

>>> > Since the new intrinsics are only available on aarch64, you want to

>>> > prevent the tests from running on arm.

>>> > Indeed gcc.target/aarch64/advsimd-intrinsics/ is shared between the two targets.

>>> > There are several examples on how to do that in that directory.

>>> >

>>> > I have also noticed that the tests fail at execution on aarch64_be.

>>>

>>> I think this is important to fix. We don't want the big-endian target to have

>>> failing implementations of the Neon intrinsics. What is the nature of the

>>> failure?

>>>

>>> From what I can see, nothing in the patch prevents using these intrinsics

>>> on big-endian, so either the intrinsics behaviour is wrong (we have a wrong

>>> code bug), or the testcase expected behaviour is wrong.

>>>

>>> I don't think disabling the test for big-endian is the right fix. We should

>>> either fix the intrinsics, or fix the testcase.

>>>

>>> Thanks,

>>> James

>>>

>>> Hi James,

>>>

>>> As the tests assume the little endian order of elements while checking the

>>> results, the tests are failing for big endian targets. So, the failures are

>>> not because of intrinsic implementations, but because of the testcase.

>>

>> The testcase is a little hard to follow through the macros, but why would

>> this be the case?

>>

>> ld1 is deterministic on big and little endian for which elements will be

>> loaded from memory, as is st1.

>>

>> My expectation would be that:

>>

>>   int __attribute__ ((noinline))

>>   test_vld_u16_x3 ()

>>   {

>>     uint16_t data[3 * 3];

>>     uint16_t temp[3 * 3];

>>     uint16x4x3_t vectors;

>>     int i,j;

>>     for (i = 0; i < 3 * 3; i++)

>>       data [i] = (uint16_t) 3*i;

>>     asm volatile ("" : : : "memory");

>>     vectors = vld1_u16_x3 (data);

>>     vst1_u16 (temp, vectors.val[0]);

>>     vst1_u16 (&temp[3], vectors.val[1]);

>>     vst1_u16 (&temp[3 * 2], vectors.val[2]);

>>     asm volatile ("" : : : "memory");

>>     for (j = 0; j < 3 * 3; j++)

>>       if (temp[j] != data[j])

>>         return 1;

>>     return 0;

>>   }

>>

>> would work equally well for big- or little-endian.

>>

>> I think this is more likely to be an intrinsics implementation bug.

>>

>> Thanks,

>> James

>>

>

> Hi James,

>

> Please find attached the updated patch, which now passes for little as

> well as big endian.

> Ok for trunk?

>

> --

> - Thanks and regards,

>   Sameera D.

>

> gcc/Changelog:

>

> 2018-05-01  Sameera Deshpande  <sameera.deshpande@linaro.org>

>

>

>         * config/aarch64/aarch64-simd-builtins.def (ld1x3): New.

>         (st1x2): Likewise.

>         (st1x3): Likewise.

>         * config/aarch64/aarch64-simd.md

>         (aarch64_ld1x3<VALLDIF:mode>): New pattern.

>         (aarch64_ld1_x3_<mode>): Likewise

>         (aarch64_st1x2<VALLDIF:mode>): Likewise

>         (aarch64_st1_x2_<mode>): Likewise

>         (aarch64_st1x3<VALLDIF:mode>): Likewise

>         (aarch64_st1_x3_<mode>): Likewise

>         * config/aarch64/arm_neon.h (vld1_u8_x3): New function.

>         (vld1_s8_x3): Likewise.

>         (vld1_u16_x3): Likewise.

>         (vld1_s16_x3): Likewise.

>         (vld1_u32_x3): Likewise.

>         (vld1_s32_x3): Likewise.

>         (vld1_u64_x3): Likewise.

>         (vld1_s64_x3): Likewise.

>         (vld1_f16_x3): Likewise.

>         (vld1_f32_x3): Likewise.

>         (vld1_f64_x3): Likewise.

>         (vld1_p8_x3): Likewise.

>         (vld1_p16_x3): Likewise.

>         (vld1_p64_x3): Likewise.

>         (vld1q_u8_x3): Likewise.

>         (vld1q_s8_x3): Likewise.

>         (vld1q_u16_x3): Likewise.

>         (vld1q_s16_x3): Likewise.

>         (vld1q_u32_x3): Likewise.

>         (vld1q_s32_x3): Likewise.

>         (vld1q_u64_x3): Likewise.

>         (vld1q_s64_x3): Likewise.

>         (vld1q_f16_x3): Likewise.

>         (vld1q_f32_x3): Likewise.

>         (vld1q_f64_x3): Likewise.

>         (vld1q_p8_x3): Likewise.

>         (vld1q_p16_x3): Likewise.

>         (vld1q_p64_x3): Likewise.

>         (vst1_s64_x2): Likewise.

>         (vst1_u64_x2): Likewise.

>         (vst1_f64_x2): Likewise.

>         (vst1_s8_x2): Likewise.

>         (vst1_p8_x2): Likewise.

>         (vst1_s16_x2): Likewise.

>         (vst1_p16_x2): Likewise.

>         (vst1_s32_x2): Likewise.

>         (vst1_u8_x2): Likewise.

>         (vst1_u16_x2): Likewise.

>         (vst1_u32_x2): Likewise.

>         (vst1_f16_x2): Likewise.

>         (vst1_f32_x2): Likewise.

>         (vst1_p64_x2): Likewise.

>         (vst1q_s8_x2): Likewise.

>         (vst1q_p8_x2): Likewise.

>         (vst1q_s16_x2): Likewise.

>         (vst1q_p16_x2): Likewise.

>         (vst1q_s32_x2): Likewise.

>         (vst1q_s64_x2): Likewise.

>         (vst1q_u8_x2): Likewise.

>         (vst1q_u16_x2): Likewise.

>         (vst1q_u32_x2): Likewise.

>         (vst1q_u64_x2): Likewise.

>         (vst1q_f16_x2): Likewise.

>         (vst1q_f32_x2): Likewise.

>         (vst1q_f64_x2): Likewise.

>         (vst1q_p64_x2): Likewise.

>         (vst1_s64_x3): Likewise.

>         (vst1_u64_x3): Likewise.

>         (vst1_f64_x3): Likewise.

>         (vst1_s8_x3): Likewise.

>         (vst1_p8_x3): Likewise.

>         (vst1_s16_x3): Likewise.

>         (vst1_p16_x3): Likewise.

>         (vst1_s32_x3): Likewise.

>         (vst1_u8_x3): Likewise.

>         (vst1_u16_x3): Likewise.

>         (vst1_u32_x3): Likewise.

>         (vst1_f16_x3): Likewise.

>         (vst1_f32_x3): Likewise.

>         (vst1_p64_x3): Likewise.

>         (vst1q_s8_x3): Likewise.

>         (vst1q_p8_x3): Likewise.

>         (vst1q_s16_x3): Likewise.

>         (vst1q_p16_x3): Likewise.

>         (vst1q_s32_x3): Likewise.

>         (vst1q_s64_x3): Likewise.

>         (vst1q_u8_x3): Likewise.

>         (vst1q_u16_x3): Likewise.

>         (vst1q_u32_x3): Likewise.

>         (vst1q_u64_x3): Likewise.

>         (vst1q_f16_x3): Likewise.

>         (vst1q_f32_x3): Likewise.

>         (vst1q_f64_x3): Likewise.

>         (vst1q_p64_x3): Likewise.

>

> gcc/testsuite/Changelog:

>

> 2018-05-01  Sameera Deshpande  <sameera.deshpande@linaro.org>

>

>         * gcc.target/aarch64/advsimd-intrinsics/vld1x3.c: New test for

> vld1x3 intrinsics for aarch64.

>         * gcc.target/aarch64/advsimd-intrinsics/vst1x2.c: New test for

> vst1x2 intrinsics for aarch64.

>         * gcc.target/aarch64/advsimd-intrinsics/vst1x3.c: New test for

> vst1x3 intrinsics for aarch64.


Gentle reminder!

-- 
- Thanks and regards,
  Sameera D.
James Greenhalgh May 22, 2018, 3:56 p.m. | #15
On Mon, Apr 30, 2018 at 06:35:11PM -0500, Sameera Deshpande wrote:
> On 13 April 2018 at 20:21, James Greenhalgh <james.greenhalgh@arm.com> wrote:

> > On Fri, Apr 13, 2018 at 03:39:32PM +0100, Sameera Deshpande wrote:

> >> On Fri 13 Apr, 2018, 8:04 PM James Greenhalgh, <james.greenhalgh@arm.com<mailto:james.greenhalgh@arm.com>> wrote:

> >> On Fri, Apr 06, 2018 at 08:55:47PM +0100, Christophe Lyon wrote:

> >> > Hi,

> >> >

> >> > 2018-04-06 12:15 GMT+02:00 Sameera Deshpande <sameera.deshpande@linaro.org<mailto:sameera.deshpande@linaro.org>>:

> >> > > Hi Christophe,

> >> > >

> >> > > Please find attached the updated patch with testcases.

> >> > >

> >> > > Ok for trunk?

> >> >

> >> > Thanks for the update.

> >> >

> >> > Since the new intrinsics are only available on aarch64, you want to

> >> > prevent the tests from running on arm.

> >> > Indeed gcc.target/aarch64/advsimd-intrinsics/ is shared between the two targets.

> >> > There are several examples on how to do that in that directory.

> >> >

> >> > I have also noticed that the tests fail at execution on aarch64_be.

> >>

> >> I think this is important to fix. We don't want the big-endian target to have

> >> failing implementations of the Neon intrinsics. What is the nature of the

> >> failure?

> >>

> >> From what I can see, nothing in the patch prevents using these intrinsics

> >> on big-endian, so either the intrinsics behaviour is wrong (we have a wrong

> >> code bug), or the testcase expected behaviour is wrong.

> >>

> >> I don't think disabling the test for big-endian is the right fix. We should

> >> either fix the intrinsics, or fix the testcase.

> >>

> >> Thanks,

> >> James

> >>

> >> Hi James,

> >>

> >> As the tests assume the little endian order of elements while checking the

> >> results, the tests are failing for big endian targets. So, the failures are

> >> not because of intrinsic implementations, but because of the testcase.

> >

> > The testcase is a little hard to follow through the macros, but why would

> > this be the case?

> >

> > ld1 is deterministic on big and little endian for which elements will be

> > loaded from memory, as is st1.

> >

> > My expectation would be that:

> >

> >   int __attribute__ ((noinline))

> >   test_vld_u16_x3 ()

> >   {

> >     uint16_t data[3 * 3];

> >     uint16_t temp[3 * 3];

> >     uint16x4x3_t vectors;

> >     int i,j;

> >     for (i = 0; i < 3 * 3; i++)

> >       data [i] = (uint16_t) 3*i;

> >     asm volatile ("" : : : "memory");

> >     vectors = vld1_u16_x3 (data);

> >     vst1_u16 (temp, vectors.val[0]);

> >     vst1_u16 (&temp[3], vectors.val[1]);

> >     vst1_u16 (&temp[3 * 2], vectors.val[2]);

> >     asm volatile ("" : : : "memory");

> >     for (j = 0; j < 3 * 3; j++)

> >       if (temp[j] != data[j])

> >         return 1;

> >     return 0;

> >   }

> >

> > would work equally well for big- or little-endian.

> >

> > I think this is more likely to be an intrinsics implementation bug.

> >

> > Thanks,

> > James

> >

> 

> Hi James,

> 

> Please find attached the updated patch, which now passes for little as

> well as big endian.

> Ok for trunk?



OK.

Thanks,
James

> 

> -- 

> - Thanks and regards,

>   Sameera D.

> 

> gcc/Changelog:

> 

> 2018-05-01  Sameera Deshpande  <sameera.deshpande@linaro.org>

> 

> 

>         * config/aarch64/aarch64-simd-builtins.def (ld1x3): New.

>         (st1x2): Likewise.

>         (st1x3): Likewise.

>         * config/aarch64/aarch64-simd.md

>         (aarch64_ld1x3<VALLDIF:mode>): New pattern.

>         (aarch64_ld1_x3_<mode>): Likewise

>         (aarch64_st1x2<VALLDIF:mode>): Likewise

>         (aarch64_st1_x2_<mode>): Likewise

>         (aarch64_st1x3<VALLDIF:mode>): Likewise

>         (aarch64_st1_x3_<mode>): Likewise

>         * config/aarch64/arm_neon.h (vld1_u8_x3): New function.

>         (vld1_s8_x3): Likewise.

>         (vld1_u16_x3): Likewise.

>         (vld1_s16_x3): Likewise.

>         (vld1_u32_x3): Likewise.

>         (vld1_s32_x3): Likewise.

>         (vld1_u64_x3): Likewise.

>         (vld1_s64_x3): Likewise.

>         (vld1_f16_x3): Likewise.

>         (vld1_f32_x3): Likewise.

>         (vld1_f64_x3): Likewise.

>         (vld1_p8_x3): Likewise.

>         (vld1_p16_x3): Likewise.

>         (vld1_p64_x3): Likewise.

>         (vld1q_u8_x3): Likewise.

>         (vld1q_s8_x3): Likewise.

>         (vld1q_u16_x3): Likewise.

>         (vld1q_s16_x3): Likewise.

>         (vld1q_u32_x3): Likewise.

>         (vld1q_s32_x3): Likewise.

>         (vld1q_u64_x3): Likewise.

>         (vld1q_s64_x3): Likewise.

>         (vld1q_f16_x3): Likewise.

>         (vld1q_f32_x3): Likewise.

>         (vld1q_f64_x3): Likewise.

>         (vld1q_p8_x3): Likewise.

>         (vld1q_p16_x3): Likewise.

>         (vld1q_p64_x3): Likewise.

>         (vst1_s64_x2): Likewise.

>         (vst1_u64_x2): Likewise.

>         (vst1_f64_x2): Likewise.

>         (vst1_s8_x2): Likewise.

>         (vst1_p8_x2): Likewise.

>         (vst1_s16_x2): Likewise.

>         (vst1_p16_x2): Likewise.

>         (vst1_s32_x2): Likewise.

>         (vst1_u8_x2): Likewise.

>         (vst1_u16_x2): Likewise.

>         (vst1_u32_x2): Likewise.

>         (vst1_f16_x2): Likewise.

>         (vst1_f32_x2): Likewise.

>         (vst1_p64_x2): Likewise.

>         (vst1q_s8_x2): Likewise.

>         (vst1q_p8_x2): Likewise.

>         (vst1q_s16_x2): Likewise.

>         (vst1q_p16_x2): Likewise.

>         (vst1q_s32_x2): Likewise.

>         (vst1q_s64_x2): Likewise.

>         (vst1q_u8_x2): Likewise.

>         (vst1q_u16_x2): Likewise.

>         (vst1q_u32_x2): Likewise.

>         (vst1q_u64_x2): Likewise.

>         (vst1q_f16_x2): Likewise.

>         (vst1q_f32_x2): Likewise.

>         (vst1q_f64_x2): Likewise.

>         (vst1q_p64_x2): Likewise.

>         (vst1_s64_x3): Likewise.

>         (vst1_u64_x3): Likewise.

>         (vst1_f64_x3): Likewise.

>         (vst1_s8_x3): Likewise.

>         (vst1_p8_x3): Likewise.

>         (vst1_s16_x3): Likewise.

>         (vst1_p16_x3): Likewise.

>         (vst1_s32_x3): Likewise.

>         (vst1_u8_x3): Likewise.

>         (vst1_u16_x3): Likewise.

>         (vst1_u32_x3): Likewise.

>         (vst1_f16_x3): Likewise.

>         (vst1_f32_x3): Likewise.

>         (vst1_p64_x3): Likewise.

>         (vst1q_s8_x3): Likewise.

>         (vst1q_p8_x3): Likewise.

>         (vst1q_s16_x3): Likewise.

>         (vst1q_p16_x3): Likewise.

>         (vst1q_s32_x3): Likewise.

>         (vst1q_s64_x3): Likewise.

>         (vst1q_u8_x3): Likewise.

>         (vst1q_u16_x3): Likewise.

>         (vst1q_u32_x3): Likewise.

>         (vst1q_u64_x3): Likewise.

>         (vst1q_f16_x3): Likewise.

>         (vst1q_f32_x3): Likewise.

>         (vst1q_f64_x3): Likewise.

>         (vst1q_p64_x3): Likewise.

> 

> gcc/testsuite/Changelog:

> 

> 2018-05-01  Sameera Deshpande  <sameera.deshpande@linaro.org>

> 

>         * gcc.target/aarch64/advsimd-intrinsics/vld1x3.c: New test for

> vld1x3 intrinsics for aarch64.

>         * gcc.target/aarch64/advsimd-intrinsics/vst1x2.c: New test for

> vst1x2 intrinsics for aarch64.

>         * gcc.target/aarch64/advsimd-intrinsics/vst1x3.c: New test for

> vst1x3 intrinsics for aarch64.
Sameera Deshpande May 22, 2018, 4:20 p.m. | #16
On Tue 22 May, 2018, 9:26 PM James Greenhalgh, <james.greenhalgh@arm.com>
wrote:

> On Mon, Apr 30, 2018 at 06:35:11PM -0500, Sameera Deshpande wrote:

> > On 13 April 2018 at 20:21, James Greenhalgh <james.greenhalgh@arm.com>

> wrote:

> > > On Fri, Apr 13, 2018 at 03:39:32PM +0100, Sameera Deshpande wrote:

> > >> On Fri 13 Apr, 2018, 8:04 PM James Greenhalgh, <

> james.greenhalgh@arm.com<mailto:james.greenhalgh@arm.com>> wrote:

> > >> On Fri, Apr 06, 2018 at 08:55:47PM +0100, Christophe Lyon wrote:

> > >> > Hi,

> > >> >

> > >> > 2018-04-06 12:15 GMT+02:00 Sameera Deshpande <

> sameera.deshpande@linaro.org<mailto:sameera.deshpande@linaro.org>>:

> > >> > > Hi Christophe,

> > >> > >

> > >> > > Please find attached the updated patch with testcases.

> > >> > >

> > >> > > Ok for trunk?

> > >> >

> > >> > Thanks for the update.

> > >> >

> > >> > Since the new intrinsics are only available on aarch64, you want to

> > >> > prevent the tests from running on arm.

> > >> > Indeed gcc.target/aarch64/advsimd-intrinsics/ is shared between the

> two targets.

> > >> > There are several examples on how to do that in that directory.

> > >> >

> > >> > I have also noticed that the tests fail at execution on aarch64_be.

> > >>

> > >> I think this is important to fix. We don't want the big-endian target

> to have

> > >> failing implementations of the Neon intrinsics. What is the nature of

> the

> > >> failure?

> > >>

> > >> From what I can see, nothing in the patch prevents using these

> intrinsics

> > >> on big-endian, so either the intrinsics behaviour is wrong (we have a

> wrong

> > >> code bug), or the testcase expected behaviour is wrong.

> > >>

> > >> I don't think disabling the test for big-endian is the right fix. We

> should

> > >> either fix the intrinsics, or fix the testcase.

> > >>

> > >> Thanks,

> > >> James

> > >>

> > >> Hi James,

> > >>

> > >> As the tests assume the little endian order of elements while

> checking the

> > >> results, the tests are failing for big endian targets. So, the

> failures are

> > >> not because of intrinsic implementations, but because of the testcase.

> > >

> > > The testcase is a little hard to follow through the macros, but why

> would

> > > this be the case?

> > >

> > > ld1 is deterministic on big and little endian for which elements will

> be

> > > loaded from memory, as is st1.

> > >

> > > My expectation would be that:

> > >

> > >   int __attribute__ ((noinline))

> > >   test_vld_u16_x3 ()

> > >   {

> > >     uint16_t data[3 * 3];

> > >     uint16_t temp[3 * 3];

> > >     uint16x4x3_t vectors;

> > >     int i,j;

> > >     for (i = 0; i < 3 * 3; i++)

> > >       data [i] = (uint16_t) 3*i;

> > >     asm volatile ("" : : : "memory");

> > >     vectors = vld1_u16_x3 (data);

> > >     vst1_u16 (temp, vectors.val[0]);

> > >     vst1_u16 (&temp[3], vectors.val[1]);

> > >     vst1_u16 (&temp[3 * 2], vectors.val[2]);

> > >     asm volatile ("" : : : "memory");

> > >     for (j = 0; j < 3 * 3; j++)

> > >       if (temp[j] != data[j])

> > >         return 1;

> > >     return 0;

> > >   }

> > >

> > > would work equally well for big- or little-endian.

> > >

> > > I think this is more likely to be an intrinsics implementation bug.

> > >

> > > Thanks,

> > > James

> > >

> >

> > Hi James,

> >

> > Please find attached the updated patch, which now passes for little as

> > well as big endian.

> > Ok for trunk?

>

>

> OK.

>

> Thanks,

> James

>

> >

> > --

> > - Thanks and regards,

> >   Sameera D.

> >

> > gcc/Changelog:

> >

> > 2018-05-01  Sameera Deshpande  <sameera.deshpande@linaro.org>

> >

> >

> >         * config/aarch64/aarch64-simd-builtins.def (ld1x3): New.

> >         (st1x2): Likewise.

> >         (st1x3): Likewise.

> >         * config/aarch64/aarch64-simd.md

> >         (aarch64_ld1x3<VALLDIF:mode>): New pattern.

> >         (aarch64_ld1_x3_<mode>): Likewise

> >         (aarch64_st1x2<VALLDIF:mode>): Likewise

> >         (aarch64_st1_x2_<mode>): Likewise

> >         (aarch64_st1x3<VALLDIF:mode>): Likewise

> >         (aarch64_st1_x3_<mode>): Likewise

> >         * config/aarch64/arm_neon.h (vld1_u8_x3): New function.

> >         (vld1_s8_x3): Likewise.

> >         (vld1_u16_x3): Likewise.

> >         (vld1_s16_x3): Likewise.

> >         (vld1_u32_x3): Likewise.

> >         (vld1_s32_x3): Likewise.

> >         (vld1_u64_x3): Likewise.

> >         (vld1_s64_x3): Likewise.

> >         (vld1_f16_x3): Likewise.

> >         (vld1_f32_x3): Likewise.

> >         (vld1_f64_x3): Likewise.

> >         (vld1_p8_x3): Likewise.

> >         (vld1_p16_x3): Likewise.

> >         (vld1_p64_x3): Likewise.

> >         (vld1q_u8_x3): Likewise.

> >         (vld1q_s8_x3): Likewise.

> >         (vld1q_u16_x3): Likewise.

> >         (vld1q_s16_x3): Likewise.

> >         (vld1q_u32_x3): Likewise.

> >         (vld1q_s32_x3): Likewise.

> >         (vld1q_u64_x3): Likewise.

> >         (vld1q_s64_x3): Likewise.

> >         (vld1q_f16_x3): Likewise.

> >         (vld1q_f32_x3): Likewise.

> >         (vld1q_f64_x3): Likewise.

> >         (vld1q_p8_x3): Likewise.

> >         (vld1q_p16_x3): Likewise.

> >         (vld1q_p64_x3): Likewise.

> >         (vst1_s64_x2): Likewise.

> >         (vst1_u64_x2): Likewise.

> >         (vst1_f64_x2): Likewise.

> >         (vst1_s8_x2): Likewise.

> >         (vst1_p8_x2): Likewise.

> >         (vst1_s16_x2): Likewise.

> >         (vst1_p16_x2): Likewise.

> >         (vst1_s32_x2): Likewise.

> >         (vst1_u8_x2): Likewise.

> >         (vst1_u16_x2): Likewise.

> >         (vst1_u32_x2): Likewise.

> >         (vst1_f16_x2): Likewise.

> >         (vst1_f32_x2): Likewise.

> >         (vst1_p64_x2): Likewise.

> >         (vst1q_s8_x2): Likewise.

> >         (vst1q_p8_x2): Likewise.

> >         (vst1q_s16_x2): Likewise.

> >         (vst1q_p16_x2): Likewise.

> >         (vst1q_s32_x2): Likewise.

> >         (vst1q_s64_x2): Likewise.

> >         (vst1q_u8_x2): Likewise.

> >         (vst1q_u16_x2): Likewise.

> >         (vst1q_u32_x2): Likewise.

> >         (vst1q_u64_x2): Likewise.

> >         (vst1q_f16_x2): Likewise.

> >         (vst1q_f32_x2): Likewise.

> >         (vst1q_f64_x2): Likewise.

> >         (vst1q_p64_x2): Likewise.

> >         (vst1_s64_x3): Likewise.

> >         (vst1_u64_x3): Likewise.

> >         (vst1_f64_x3): Likewise.

> >         (vst1_s8_x3): Likewise.

> >         (vst1_p8_x3): Likewise.

> >         (vst1_s16_x3): Likewise.

> >         (vst1_p16_x3): Likewise.

> >         (vst1_s32_x3): Likewise.

> >         (vst1_u8_x3): Likewise.

> >         (vst1_u16_x3): Likewise.

> >         (vst1_u32_x3): Likewise.

> >         (vst1_f16_x3): Likewise.

> >         (vst1_f32_x3): Likewise.

> >         (vst1_p64_x3): Likewise.

> >         (vst1q_s8_x3): Likewise.

> >         (vst1q_p8_x3): Likewise.

> >         (vst1q_s16_x3): Likewise.

> >         (vst1q_p16_x3): Likewise.

> >         (vst1q_s32_x3): Likewise.

> >         (vst1q_s64_x3): Likewise.

> >         (vst1q_u8_x3): Likewise.

> >         (vst1q_u16_x3): Likewise.

> >         (vst1q_u32_x3): Likewise.

> >         (vst1q_u64_x3): Likewise.

> >         (vst1q_f16_x3): Likewise.

> >         (vst1q_f32_x3): Likewise.

> >         (vst1q_f64_x3): Likewise.

> >         (vst1q_p64_x3): Likewise.

> >

> > gcc/testsuite/Changelog:

> >

> > 2018-05-01  Sameera Deshpande  <sameera.deshpande@linaro.org>

> >

> >         * gcc.target/aarch64/advsimd-intrinsics/vld1x3.c: New test for

> > vld1x3 intrinsics for aarch64.

> >         * gcc.target/aarch64/advsimd-intrinsics/vst1x2.c: New test for

> > vst1x2 intrinsics for aarch64.

> >         * gcc.target/aarch64/advsimd-intrinsics/vst1x3.c: New test for

> > vst1x3 intrinsics for aarch64.

>

> Thanks James!

Patch

diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 52d01342372..fa623e90017 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -441,6 +441,15 @@ 
   BUILTIN_VALL_F16 (STORE1, st1, 0)
   VAR1(STORE1P, st1, 0, v2di)
 
+  /* Implemented by aarch64_ld1x3<VALLDIF:mode>.  */
+  BUILTIN_VALLDIF (LOADSTRUCT, ld1x3, 0)
+
+  /* Implemented by aarch64_st1x2<VALLDIF:mode>.  */
+  BUILTIN_VALLDIF (STORESTRUCT, st1x2, 0)
+
+  /* Implemented by aarch64_st1x3<VALLDIF:mode>.  */
+  BUILTIN_VALLDIF (STORESTRUCT, st1x3, 0)
+
   /* Implemented by fma<mode>4.  */
   BUILTIN_VHSDF (TERNOP, fma, 4)
   VAR1 (TERNOP, fma, 4, hf)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 4fd34c18f95..852bcf0c16a 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -5038,6 +5038,70 @@ 
     }
 })
 
+
+(define_expand "aarch64_ld1x3<VALLDIF:mode>"
+  [(match_operand:CI 0 "register_operand" "=w")
+   (match_operand:DI 1 "register_operand" "r")
+   (unspec:VALLDIF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+  "TARGET_SIMD"
+{
+  rtx mem = gen_rtx_MEM (CImode, operands[1]);
+  emit_insn (gen_aarch64_ld1_x3_<VALLDIF:mode> (operands[0], mem));
+  DONE;
+})
+
+(define_insn "aarch64_ld1_x3_<mode>"
+  [(set (match_operand:CI 0 "register_operand" "=w")
+        (unspec:CI
+	  [(match_operand:CI 1 "aarch64_simd_struct_operand" "Utv")
+	   (unspec:VALLDIF [(const_int 3)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_LD1))]
+  "TARGET_SIMD"
+  "ld1\\t{%S0.<Vtype> - %U0.<Vtype>}, %1"
+  [(set_attr "type" "neon_load1_3reg<q>")]
+)
+
+(define_expand "aarch64_st1x2<VALLDIF:mode>"
+  [(match_operand:DI 0 "register_operand" "")
+   (match_operand:OI 1 "register_operand" "")
+   (unspec:VALLDIF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+  "TARGET_SIMD"
+{
+  rtx mem = gen_rtx_MEM (OImode, operands[0]);
+  emit_insn (gen_aarch64_st1_x2_<VALLDIF:mode> (mem, operands[1]));
+  DONE;
+})
+
+(define_insn "aarch64_st1_x2_<mode>"
+   [(set (unspec:OI
+         [(match_operand:OI 0 "aarch64_simd_struct_operand" "=Utv")
+          (unspec:VALLDIF [(const_int 2)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_ST1)
+         (match_operand:OI 1 "register_operand" "w"))]
+  "TARGET_SIMD"
+  "st1\\t{%S1.<Vtype> - %T1.<Vtype>}, %0"
+  [(set_attr "type" "neon_store1_2reg<q>")]
+)
+
+(define_expand "aarch64_st1x3<VALLDIF:mode>"
+  [(match_operand:DI 0 "register_operand" "")
+   (match_operand:CI 1 "register_operand" "")
+   (unspec:VALLDIF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+  "TARGET_SIMD"
+{
+  rtx mem = gen_rtx_MEM (CImode, operands[0]);
+  emit_insn (gen_aarch64_st1_x3_<VALLDIF:mode> (mem, operands[1]));
+  DONE;
+})
+
+(define_insn "aarch64_st1_x3_<mode>"
+   [(set (unspec:CI
+         [(match_operand:CI 0 "aarch64_simd_struct_operand" "=Utv")
+          (unspec:VALLDIF [(const_int 3)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_ST1)
+         (match_operand:CI 1 "register_operand" "w"))]
+  "TARGET_SIMD"
+  "st1\\t{%S1.<Vtype> - %U1.<Vtype>}, %0"
+  [(set_attr "type" "neon_store1_3reg<q>")]
+)
+
 (define_insn "*aarch64_mov<mode>"
   [(set (match_operand:VSTRUCT 0 "aarch64_simd_nonimmediate_operand" "=w,Utv,w")
 	(match_operand:VSTRUCT 1 "aarch64_simd_general_operand" " w,w,Utv"))]
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 96e740f91a7..81fed6c852f 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -17145,6 +17145,374 @@  vld1_u64 (const uint64_t *a)
   return (uint64x1_t) {*a};
 }
 
+/* vld1x3  */
+
+__extension__ extern __inline uint8x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_u8_x3 (const uint8_t *__a)
+{
+  uint8x8x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = (__builtin_aarch64_simd_ci)__builtin_aarch64_ld1x3v8qi ((const __builtin_aarch64_simd_qi *) __a);
+  __i.val[0] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi  (__o, 0);
+  __i.val[1] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi  (__o, 1);
+  __i.val[2] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline int8x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_s8_x3 (const uint8_t *__a)
+{
+  int8x8x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v8qi ((const __builtin_aarch64_simd_qi *) __a);
+  __i.val[0] = (int8x8_t) __builtin_aarch64_get_dregciv8qi  (__o, 0);
+  __i.val[1] = (int8x8_t) __builtin_aarch64_get_dregciv8qi  (__o, 1);
+  __i.val[2] = (int8x8_t) __builtin_aarch64_get_dregciv8qi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline uint16x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_u16_x3 (const uint16_t *__a)
+{
+  uint16x4x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v4hi ((const __builtin_aarch64_simd_hi *) __a);
+  __i.val[0] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi  (__o, 0);
+  __i.val[1] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi  (__o, 1);
+  __i.val[2] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline int16x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_s16_x3 (const int16_t *__a)
+{
+  int16x4x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v4hi ((const __builtin_aarch64_simd_hi *) __a);
+  __i.val[0] = (int16x4_t) __builtin_aarch64_get_dregciv4hi  (__o, 0);
+  __i.val[1] = (int16x4_t) __builtin_aarch64_get_dregciv4hi  (__o, 1);
+  __i.val[2] = (int16x4_t) __builtin_aarch64_get_dregciv4hi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline uint32x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_u32_x3 (const uint32_t *__a)
+{
+  uint32x2x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v2si ((const __builtin_aarch64_simd_si *) __a);
+  __i.val[0] = (uint32x2_t) __builtin_aarch64_get_dregciv2si  (__o, 0);
+  __i.val[1] = (uint32x2_t) __builtin_aarch64_get_dregciv2si  (__o, 1);
+  __i.val[2] = (uint32x2_t) __builtin_aarch64_get_dregciv2si  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline int32x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_s32_x3 (const uint32_t *__a)
+{
+  int32x2x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v2si ((const __builtin_aarch64_simd_si *) __a);
+  __i.val[0] = (int32x2_t) __builtin_aarch64_get_dregciv2si  (__o, 0);
+  __i.val[1] = (int32x2_t) __builtin_aarch64_get_dregciv2si  (__o, 1);
+  __i.val[2] = (int32x2_t) __builtin_aarch64_get_dregciv2si  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline uint64x1x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_u64_x3 (const uint64_t *__a)
+{
+  uint64x1x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3di ((const __builtin_aarch64_simd_di *) __a);
+  __i.val[0] = ((uint64x1_t *)__o)[0];
+  __i.val[1] = ((uint64x1_t *)__o)[1];
+  __i.val[2] = ((uint64x1_t *)__o)[2];
+  return __i;
+}
+
+__extension__ extern __inline int64x1x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_s64_x3 (const int64_t *__a)
+{
+  int64x1x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3di ((const __builtin_aarch64_simd_di *) __a);
+  __i.val[0] = ((int64x1_t *)__o)[0];
+  __i.val[1] = ((int64x1_t *)__o)[1];
+  __i.val[2] = ((int64x1_t *)__o)[2];
+
+  return __i;
+}
+
+__extension__ extern __inline float16x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_fp16_x3 (const float16_t *__a)
+{
+  float16x4x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v4hf ((const __builtin_aarch64_simd_hf *) __a);
+  __i.val[0] = (float16x4_t) __builtin_aarch64_get_dregciv4hf  (__o, 0);
+  __i.val[1] = (float16x4_t) __builtin_aarch64_get_dregciv4hf  (__o, 1);
+  __i.val[2] = (float16x4_t) __builtin_aarch64_get_dregciv4hf  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline float32x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_f32_x3 (const float32_t *__a)
+{
+  float32x2x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v2sf ((const __builtin_aarch64_simd_sf *) __a);
+  __i.val[0] = (float32x2_t) __builtin_aarch64_get_dregciv2sf  (__o, 0);
+  __i.val[1] = (float32x2_t) __builtin_aarch64_get_dregciv2sf  (__o, 1);
+  __i.val[2] = (float32x2_t) __builtin_aarch64_get_dregciv2sf  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline float64x1x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_f64_x3 (const float64_t *__a)
+{
+  float64x1x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3df ((const __builtin_aarch64_simd_df *) __a);
+  __i.val[0] = ((float64x1_t *)__o)[0];
+  __i.val[1] = ((float64x1_t *)__o)[1];
+  __i.val[2] = ((float64x1_t *)__o)[2];
+  return __i;
+}
+
+__extension__ extern __inline poly8x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_p8_x3 (const poly8_t *__a)
+{
+  poly8x8x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v8qi ((const __builtin_aarch64_simd_qi *) __a);
+  __i.val[0] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi  (__o, 0);
+  __i.val[1] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi  (__o, 1);
+  __i.val[2] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline poly16x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_p16_x3 (const poly16_t *__a)
+{
+  poly16x4x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v4hi ((const __builtin_aarch64_simd_hi *) __a);
+  __i.val[0] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi  (__o, 0);
+  __i.val[1] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi  (__o, 1);
+  __i.val[2] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline poly64x1x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_p64_x3 (const poly64_t *__a)
+{
+  poly64x1x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3di ((const __builtin_aarch64_simd_di *) __a);
+  __i.val[0] = ((poly64x1_t *)__o)[0];
+  __i.val[1] = ((poly64x1_t *)__o)[1];
+  __i.val[2] = ((poly64x1_t *)__o)[2];
+
+return __i;
+}
+
+__extension__ extern __inline uint8x16x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_u8_x3 (const uint8_t *__a)
+{
+  uint8x16x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v16qi ((const __builtin_aarch64_simd_qi *) __a);
+  __i.val[0] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi  (__o, 0);
+  __i.val[1] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi  (__o, 1);
+  __i.val[2] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline int8x16x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_s8_x3 (const int8_t *__a)
+{
+  int8x16x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v16qi ((const __builtin_aarch64_simd_qi *) __a);
+  __i.val[0] = (int8x16_t) __builtin_aarch64_get_qregciv16qi  (__o, 0);
+  __i.val[1] = (int8x16_t) __builtin_aarch64_get_qregciv16qi  (__o, 1);
+  __i.val[2] = (int8x16_t) __builtin_aarch64_get_qregciv16qi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline uint16x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_u16_x3 (const uint16_t *__a)
+{
+  uint16x8x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v8hi ((const __builtin_aarch64_simd_hi *) __a);
+  __i.val[0] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi  (__o, 0);
+  __i.val[1] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi  (__o, 1);
+  __i.val[2] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline int16x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_s16_x3 (const int16_t *__a)
+{
+  int16x8x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v8hi ((const __builtin_aarch64_simd_hi *) __a);
+  __i.val[0] = (int16x8_t) __builtin_aarch64_get_qregciv8hi  (__o, 0);
+  __i.val[1] = (int16x8_t) __builtin_aarch64_get_qregciv8hi  (__o, 1);
+  __i.val[2] = (int16x8_t) __builtin_aarch64_get_qregciv8hi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline uint32x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_u32_x3 (const uint32_t *__a)
+{
+  uint32x4x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v4si ((const __builtin_aarch64_simd_si *) __a);
+  __i.val[0] = (uint32x4_t) __builtin_aarch64_get_qregciv4si  (__o, 0);
+  __i.val[1] = (uint32x4_t) __builtin_aarch64_get_qregciv4si  (__o, 1);
+  __i.val[2] = (uint32x4_t) __builtin_aarch64_get_qregciv4si  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline int32x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_s32_x3 (const int32_t *__a)
+{
+  int32x4x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v4si ((const __builtin_aarch64_simd_si *) __a);
+  __i.val[0] = (int32x4_t) __builtin_aarch64_get_qregciv4si  (__o, 0);
+  __i.val[1] = (int32x4_t) __builtin_aarch64_get_qregciv4si  (__o, 1);
+  __i.val[2] = (int32x4_t) __builtin_aarch64_get_qregciv4si  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline uint64x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_u64_x3 (const uint64_t *__a)
+{
+  uint64x2x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v2di ((const __builtin_aarch64_simd_di *) __a);
+  __i.val[0] = (uint64x2_t) __builtin_aarch64_get_qregciv2di  (__o, 0);
+  __i.val[1] = (uint64x2_t) __builtin_aarch64_get_qregciv2di  (__o, 1);
+  __i.val[2] = (uint64x2_t) __builtin_aarch64_get_qregciv2di  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline int64x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_s64_x3 (const int64_t *__a)
+{
+  int64x2x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v2di ((const __builtin_aarch64_simd_di *) __a);
+  __i.val[0] = (int64x2_t) __builtin_aarch64_get_qregciv2di  (__o, 0);
+  __i.val[1] = (int64x2_t) __builtin_aarch64_get_qregciv2di  (__o, 1);
+  __i.val[2] = (int64x2_t) __builtin_aarch64_get_qregciv2di  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline float16x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_f16_x3 (const float16_t *__a)
+{
+  float16x8x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v8hf ((const __builtin_aarch64_simd_hf *) __a);
+  __i.val[0] = (float16x8_t) __builtin_aarch64_get_qregciv8hf  (__o, 0);
+  __i.val[1] = (float16x8_t) __builtin_aarch64_get_qregciv8hf  (__o, 1);
+  __i.val[2] = (float16x8_t) __builtin_aarch64_get_qregciv8hf  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline float32x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_f32_x3 (const float32_t *__a)
+{
+  float32x4x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v4sf ((const __builtin_aarch64_simd_sf *) __a);
+  __i.val[0] = (float32x4_t) __builtin_aarch64_get_qregciv4sf  (__o, 0);
+  __i.val[1] = (float32x4_t) __builtin_aarch64_get_qregciv4sf  (__o, 1);
+  __i.val[2] = (float32x4_t) __builtin_aarch64_get_qregciv4sf  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline float64x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_f64_x3 (const float64_t *__a)
+{
+  float64x2x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v2df ((const __builtin_aarch64_simd_df *) __a);
+  __i.val[0] = (float64x2_t) __builtin_aarch64_get_qregciv2df  (__o, 0);
+  __i.val[1] = (float64x2_t) __builtin_aarch64_get_qregciv2df  (__o, 1);
+  __i.val[2] = (float64x2_t) __builtin_aarch64_get_qregciv2df  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline poly8x16x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_p8_x3 (const poly8_t *__a)
+{
+  poly8x16x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v8hi ((const __builtin_aarch64_simd_hi *) __a);
+  __i.val[0] = (poly8x16_t) __builtin_aarch64_get_qregciv8hi  (__o, 0);
+  __i.val[1] = (poly8x16_t) __builtin_aarch64_get_qregciv8hi  (__o, 1);
+  __i.val[2] = (poly8x16_t) __builtin_aarch64_get_qregciv8hi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline poly16x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_p16_x3 (const poly16_t *__a)
+{
+  poly16x8x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v8hi ((const __builtin_aarch64_simd_hi *) __a);
+  __i.val[0] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi  (__o, 0);
+  __i.val[1] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi  (__o, 1);
+  __i.val[2] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi  (__o, 2);
+  return __i;
+}
+
+__extension__ extern __inline poly64x2x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_p64_x3 (const poly64_t *__a)
+{
+  poly64x2x3_t __i;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld1x3v2di ((const __builtin_aarch64_simd_di *) __a);
+  __i.val[0] = (poly64x2_t) __builtin_aarch64_get_qregciv2di  (__o, 0);
+  __i.val[1] = (poly64x2_t) __builtin_aarch64_get_qregciv2di  (__o, 1);
+  __i.val[2] = (poly64x2_t) __builtin_aarch64_get_qregciv2di  (__o, 2);
+  return __i;
+}
+
 /* vld1q */
 
 __extension__ extern __inline float16x8_t
@@ -27161,6 +27529,706 @@  vst1q_lane_u64 (uint64_t *__a, uint64x2_t __b, const int __lane)
   *__a = __aarch64_vget_lane_any (__b, __lane);
 }
 
+/* vst1x2 */
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s64_x2 (int64_t * __a, int64x1x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  int64x2x2_t temp;
+  temp.val[0] = vcombine_s64 (val.val[0], vcreate_s64 (__AARCH64_INT64_C (0)));
+  temp.val[1] = vcombine_s64 (val.val[1], vcreate_s64 (__AARCH64_INT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u64_x2 (uint64_t * __a, uint64x1x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  uint64x2x2_t temp;
+  temp.val[0] = vcombine_u64 (val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_u64 (val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_f64_x2 (float64_t * __a, float64x1x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  float64x2x2_t temp;
+  temp.val[0] = vcombine_f64 (val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_f64 (val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2df ((__builtin_aarch64_simd_df *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s8_x2 (int8_t * __a, int8x8x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  int8x16x2_t temp;
+  temp.val[0] = vcombine_s8 (val.val[0], vcreate_s8 (__AARCH64_INT64_C (0)));
+  temp.val[1] = vcombine_s8 (val.val[1], vcreate_s8 (__AARCH64_INT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_p8_x2 (poly8_t * __a, poly8x8x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  poly8x16x2_t temp;
+  temp.val[0] = vcombine_p8 (val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_p8 (val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s16_x2 (int16_t * __a, int16x4x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  int16x8x2_t temp;
+  temp.val[0] = vcombine_s16 (val.val[0], vcreate_s16 (__AARCH64_INT64_C (0)));
+  temp.val[1] = vcombine_s16 (val.val[1], vcreate_s16 (__AARCH64_INT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_p16_x2 (poly16_t * __a, poly16x4x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  poly16x8x2_t temp;
+  temp.val[0] = vcombine_p16 (val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_p16 (val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s32_x2 (int32_t * __a, int32x2x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  int32x4x2_t temp;
+  temp.val[0] = vcombine_s32 (val.val[0], vcreate_s32 (__AARCH64_INT64_C (0)));
+  temp.val[1] = vcombine_s32 (val.val[1], vcreate_s32 (__AARCH64_INT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2v2si ((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u8_x2 (uint8_t * __a, uint8x8x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  uint8x16x2_t temp;
+  temp.val[0] = vcombine_u8 (val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_u8 (val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u16_x2 (uint16_t * __a, uint16x4x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  uint16x8x2_t temp;
+  temp.val[0] = vcombine_u16 (val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_u16 (val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u32_x2 (uint32_t * __a, uint32x2x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  uint32x4x2_t temp;
+  temp.val[0] = vcombine_u32 (val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_u32 (val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2v2si ((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_f16_x2 (float16_t * __a, float16x4x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  float16x8x2_t temp;
+  temp.val[0] = vcombine_f16 (val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_f16 (val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv8hf (__o, temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv8hf (__o, temp.val[1], 1);
+  __builtin_aarch64_st1x2v4hf (__a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_f32_x2 (float32_t * __a, float32x2x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  float32x4x2_t temp;
+  temp.val[0] = vcombine_f32 (val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_f32 (val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2v2sf ((__builtin_aarch64_simd_sf *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_p64_x2 (poly64_t * __a, poly64x1x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  poly64x2x2_t temp;
+  temp.val[0] = vcombine_p64 (val.val[0], vcreate_p64 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_p64 (val.val[1], vcreate_p64 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv2di_ssps (__o,
+					       (poly64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2di_ssps (__o,
+					       (poly64x2_t) temp.val[1], 1);
+  __builtin_aarch64_st1x2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s8_x2 (int8_t * __a, int8x16x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_p8_x2 (poly8_t * __a, poly8x16x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s16_x2 (int16_t * __a, int16x8x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_p16_x2 (poly16_t * __a, poly16x8x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s32_x2 (int32_t * __a, int32x4x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v4si ((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s64_x2 (int64_t * __a, int64x2x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u8_x2 (uint8_t * __a, uint8x16x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u16_x2 (uint16_t * __a, uint16x8x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u32_x2 (uint32_t * __a, uint32x4x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v4si ((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u64_x2 (uint64_t * __a, uint64x2x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_f16_x2 (float16_t * __a, float16x8x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv8hf (__o, val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv8hf (__o, val.val[1], 1);
+  __builtin_aarch64_st1x2v8hf (__a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_f32_x2 (float32_t * __a, float32x4x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v4sf ((__builtin_aarch64_simd_sf *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_f64_x2 (float64_t * __a, float64x2x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v2df ((__builtin_aarch64_simd_df *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_p64_x2 (poly64_t * __a, poly64x2x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv2di_ssps (__o,
+					       (poly64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2di_ssps (__o,
+					       (poly64x2_t) val.val[1], 1);
+  __builtin_aarch64_st1x2v2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+/* vst1x3 */
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s64_x3 (int64_t * __a, int64x1x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  int64x2x3_t temp;
+  temp.val[0] = vcombine_s64 (val.val[0], vcreate_s64 (__AARCH64_INT64_C (0)));
+  temp.val[1] = vcombine_s64 (val.val[1], vcreate_s64 (__AARCH64_INT64_C (0)));
+  temp.val[2] = vcombine_s64 (val.val[2], vcreate_s64 (__AARCH64_INT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u64_x3 (uint64_t * __a, uint64x1x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  uint64x2x3_t temp;
+  temp.val[0] = vcombine_u64 (val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_u64 (val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_u64 (val.val[2], vcreate_u64 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_f64_x3 (float64_t * __a, float64x1x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  float64x2x3_t temp;
+  temp.val[0] = vcombine_f64 (val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_f64 (val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_f64 (val.val[2], vcreate_f64 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3df ((__builtin_aarch64_simd_df *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s8_x3 (int8_t * __a, int8x8x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  int8x16x3_t temp;
+  temp.val[0] = vcombine_s8 (val.val[0], vcreate_s8 (__AARCH64_INT64_C (0)));
+  temp.val[1] = vcombine_s8 (val.val[1], vcreate_s8 (__AARCH64_INT64_C (0)));
+  temp.val[2] = vcombine_s8 (val.val[2], vcreate_s8 (__AARCH64_INT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_p8_x3 (poly8_t * __a, poly8x8x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  poly8x16x3_t temp;
+  temp.val[0] = vcombine_p8 (val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_p8 (val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_p8 (val.val[2], vcreate_p8 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s16_x3 (int16_t * __a, int16x4x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  int16x8x3_t temp;
+  temp.val[0] = vcombine_s16 (val.val[0], vcreate_s16 (__AARCH64_INT64_C (0)));
+  temp.val[1] = vcombine_s16 (val.val[1], vcreate_s16 (__AARCH64_INT64_C (0)));
+  temp.val[2] = vcombine_s16 (val.val[2], vcreate_s16 (__AARCH64_INT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_p16_x3 (poly16_t * __a, poly16x4x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  poly16x8x3_t temp;
+  temp.val[0] = vcombine_p16 (val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_p16 (val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_p16 (val.val[2], vcreate_p16 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s32_x3 (int32_t * __a, int32x2x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  int32x4x3_t temp;
+  temp.val[0] = vcombine_s32 (val.val[0], vcreate_s32 (__AARCH64_INT64_C (0)));
+  temp.val[1] = vcombine_s32 (val.val[1], vcreate_s32 (__AARCH64_INT64_C (0)));
+  temp.val[2] = vcombine_s32 (val.val[2], vcreate_s32 (__AARCH64_INT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3v2si ((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u8_x3 (uint8_t * __a, uint8x8x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  uint8x16x3_t temp;
+  temp.val[0] = vcombine_u8 (val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_u8 (val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_u8 (val.val[2], vcreate_u8 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u16_x3 (uint16_t * __a, uint16x4x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  uint16x8x3_t temp;
+  temp.val[0] = vcombine_u16 (val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_u16 (val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_u16 (val.val[2], vcreate_u16 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u32_x3 (uint32_t * __a, uint32x2x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  uint32x4x3_t temp;
+  temp.val[0] = vcombine_u32 (val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_u32 (val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_u32 (val.val[2], vcreate_u32 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3v2si ((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_f16_x3 (float16_t * __a, float16x4x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  float16x8x3_t temp;
+  temp.val[0] = vcombine_f16 (val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_f16 (val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_f16 (val.val[2], vcreate_f16 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3v4hf ((__builtin_aarch64_simd_hf *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_f32_x3 (float32_t * __a, float32x2x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  float32x4x3_t temp;
+  temp.val[0] = vcombine_f32 (val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_f32 (val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_f32 (val.val[2], vcreate_f32 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3v2sf ((__builtin_aarch64_simd_sf *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_p64_x3 (poly64_t * __a, poly64x1x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  poly64x2x3_t temp;
+  temp.val[0] = vcombine_p64 (val.val[0], vcreate_p64 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_p64 (val.val[1], vcreate_p64 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_p64 (val.val[2], vcreate_p64 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
+					       (poly64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
+					       (poly64x2_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
+					       (poly64x2_t) temp.val[2], 2);
+  __builtin_aarch64_st1x3di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s8_x3 (int8_t * __a, int8x16x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_p8_x3 (poly8_t * __a, poly8x16x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s16_x3 (int16_t * __a, int16x8x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_p16_x3 (poly16_t * __a, poly16x8x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s32_x3 (int32_t * __a, int32x4x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v4si ((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s64_x3 (int64_t * __a, int64x2x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u8_x3 (uint8_t * __a, uint8x16x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u16_x3 (uint16_t * __a, uint16x8x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u32_x3 (uint32_t * __a, uint32x4x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v4si ((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u64_x3 (uint64_t * __a, uint64x2x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_f16_x3 (float16_t * __a, float16x8x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v8hf ((__builtin_aarch64_simd_hf *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_f32_x3 (float32_t * __a, float32x4x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v4sf ((__builtin_aarch64_simd_sf *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_f64_x3 (float64_t * __a, float64x2x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v2df ((__builtin_aarch64_simd_df *) __a, __o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_p64_x3 (poly64_t * __a, poly64x2x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
+					       (poly64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
+					       (poly64x2_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
+					       (poly64x2_t) val.val[2], 2);
+  __builtin_aarch64_st1x3v2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
 /* vstn */
 
 __extension__ extern __inline void