rs6000: Use REAL_TYPE to copy when block move array in structure[PR65421]

Message ID 20200602094150.51336-1-luoxhu@linux.ibm.com
State New
Headers show
Series
  • rs6000: Use REAL_TYPE to copy when block move array in structure[PR65421]
Related show

Commit Message

Kewen.Lin via Gcc-patches June 2, 2020, 9:41 a.m.
Double arrays in structures used as function arguments or return values are
accessed in BLKmode; they are stored to the stack and loaded from the stack
with a redundant DF->DI->DF conversion.  This patch checks for a homogeneous
type and uses the actual element type for the block move to bypass the conversions.

gcc/ChangeLog:

	2020-06-02  Xionghu Luo  <luoxhu@linux.ibm.com>

	PR target/65421
        * config/rs6000/rs6000-string.c (expand_block_move): Use
	elt_mode to copy when homogeneous REAL_TYPE.

gcc/testsuite/ChangeLog:

	2020-06-02  Xionghu Luo  <luoxhu@linux.ibm.com>

	PR target/65421
	* gcc.target/powerpc/pr65421.c: New test.
---
 gcc/config/rs6000/rs6000-string.c          | 15 ++++++++++++++-
 gcc/testsuite/gcc.target/powerpc/pr65421.c | 17 +++++++++++++++++
 2 files changed, 31 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr65421.c

-- 
2.21.0.777.g83232e3864

Comments

Kewen.Lin via Gcc-patches June 2, 2020, 10:52 a.m. | #1
On Tue, Jun 2, 2020 at 11:43 AM Xionghu Luo via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>

> Double array in structure as function arguments or return value is accessed

> by BLKmode, they are stored to stack and load from stack with redundant

> conversion from DF->DI->DF.  This patch checks the homogeneous type and

> use the actual element type to do block move to by pass the conversions.


Is it correct to do this when the actual data in that location is DImode?
We generally avoid using any floating-point modes here because the DImode
data could, for example, correspond to a signalling NaN or a non-canonical NaN.

What makes a case with, say, struct { double a; long b; } different?

Richard.

> gcc/ChangeLog:

>

>         2020-06-02  Xionghu Luo  <luoxhu@linux.ibm.com>

>

>         PR target/65421

>         * config/rs6000/rs6000-string.c (expand_block_move): Use

>         elt_mode to copy when homogeneous REAL_TYPE.

>

> gcc/testsuite/ChangeLog:

>

>         2020-06-02  Xionghu Luo  <luoxhu@linux.ibm.com>

>

>         PR target/65421

>         * gcc.target/powerpc/pr65421.c: New test.

> ---

>  gcc/config/rs6000/rs6000-string.c          | 15 ++++++++++++++-

>  gcc/testsuite/gcc.target/powerpc/pr65421.c | 17 +++++++++++++++++

>  2 files changed, 31 insertions(+), 1 deletion(-)

>  create mode 100644 gcc/testsuite/gcc.target/powerpc/pr65421.c

>

> diff --git a/gcc/config/rs6000/rs6000-string.c b/gcc/config/rs6000/rs6000-string.c

> index fe7177f10fd..ea217840d88 100644

> --- a/gcc/config/rs6000/rs6000-string.c

> +++ b/gcc/config/rs6000/rs6000-string.c

> @@ -37,6 +37,7 @@

>  #include "target.h"

>  #include "profile-count.h"

>  #include "predict.h"

> +#include "rs6000-internal.h"

>

>  /* Expand a block clear operation, and return 1 if successful.  Return 0

>     if we should let the compiler generate normal code.

> @@ -2733,6 +2734,7 @@ expand_block_move (rtx operands[], bool might_overlap)

>    rtx loads[MAX_MOVE_REG];

>    rtx stores[MAX_MOVE_REG];

>    int num_reg = 0;

> +  machine_mode elt_mode = DImode;

>

>    /* If this is not a fixed size move, just call memcpy */

>    if (! constp)

> @@ -2750,6 +2752,17 @@ expand_block_move (rtx operands[], bool might_overlap)

>    if (bytes > rs6000_block_move_inline_limit)

>      return 0;

>

> +  tree type = TREE_TYPE (MEM_EXPR (orig_dest));

> +  if (TREE_CODE (type) == RECORD_TYPE

> +      && rs6000_discover_homogeneous_aggregate (TYPE_MODE (type), type, NULL,

> +                                               NULL))

> +    {

> +      tree field_type = TREE_TYPE (first_field (type));

> +      if (field_type && TREE_CODE (field_type) == ARRAY_TYPE

> +         && TREE_CODE (TREE_TYPE (field_type)) == REAL_TYPE)

> +       elt_mode = TYPE_MODE (TREE_TYPE (field_type));

> +    }

> +

>    for (offset = 0; bytes > 0; offset += move_bytes, bytes -= move_bytes)

>      {

>        union {

> @@ -2771,7 +2784,7 @@ expand_block_move (rtx operands[], bool might_overlap)

>                && (align >= 64 || !STRICT_ALIGNMENT))

>         {

>           move_bytes = 8;

> -         mode = DImode;

> +         mode = elt_mode;

>           gen_func.mov = gen_movdi;

>           if (offset == 0 && align < 64)

>             {

> diff --git a/gcc/testsuite/gcc.target/powerpc/pr65421.c b/gcc/testsuite/gcc.target/powerpc/pr65421.c

> new file mode 100644

> index 00000000000..ec8f4824de5

> --- /dev/null

> +++ b/gcc/testsuite/gcc.target/powerpc/pr65421.c

> @@ -0,0 +1,17 @@

> +/* { dg-do compile } */

> +/* { dg-options "-O3" } */

> +

> +typedef struct

> +{

> +  double a[4];

> +} A;

> +

> +A

> +foo (const A *a)

> +{

> +  return *a;

> +}

> +

> +/* { dg-final { scan-assembler-not       {\mld\M}    } } */

> +/* { dg-final { scan-assembler-not       {\mstd\M}   } } */

> +/* { dg-final { scan-assembler-times     {\mlfd\M}  4 } } */

> --

> 2.21.0.777.g83232e3864

>
Segher Boessenkool June 2, 2020, 4:49 p.m. | #2
On Tue, Jun 02, 2020 at 12:52:31PM +0200, Richard Biener wrote:
> On Tue, Jun 2, 2020 at 11:43 AM Xionghu Luo via Gcc-patches

> <gcc-patches@gcc.gnu.org> wrote:

> > Double array in structure as function arguments or return value is accessed

> > by BLKmode, they are stored to stack and load from stack with redundant

> > conversion from DF->DI->DF.  This patch checks the homogeneous type and

> > use the actual element type to do block move to by pass the conversions.

> 

> Is it correct to do this when the actual data in the place is DImode?

> We generally

> avoid using any floating point modes here because the DImode data could

> for example correspond to a signalling NaN or a non-canonical NaN.


On PowerPC, load and store insns do not ever trap for FP reasons.  Also,
all possible IEEE binary FP bit patterns have a defined meaning, and on
PowerPC a NaN is never converted to a default NaN.  You can load a DP
(or SP or QP) float and store it again, and get the same bit pattern out
always, and no traps.

It isn't necessarily very fast on all cores though, subnormals can be
very slow on some cores, as the prime (maybe only?) example -- but even
those are not a problem on any existing core AFAIR, not for DP at least.

Things are different if more is done than just moving the data; there
are issues with using SP data in DP insns, for example (and OTOH, on
older cores using DP data in SP insns isn't actually supported).

Maybe I am missing something...  I'll look at the patch in detail ;-)


Segher
Segher Boessenkool June 2, 2020, 8:32 p.m. | #3
Hi Xiong Hu,

On Tue, Jun 02, 2020 at 04:41:50AM -0500, Xionghu Luo wrote:
> Double array in structure as function arguments or return value is accessed

> by BLKmode, they are stored to stack and load from stack with redundant

> conversion from DF->DI->DF.  This patch checks the homogeneous type and

> use the actual element type to do block move to by pass the conversions.


> @@ -2733,6 +2734,7 @@ expand_block_move (rtx operands[], bool might_overlap)

>    rtx loads[MAX_MOVE_REG];

>    rtx stores[MAX_MOVE_REG];

>    int num_reg = 0;

> +  machine_mode elt_mode = DImode;

>  

>    /* If this is not a fixed size move, just call memcpy */

>    if (! constp)

> @@ -2750,6 +2752,17 @@ expand_block_move (rtx operands[], bool might_overlap)

>    if (bytes > rs6000_block_move_inline_limit)

>      return 0;

>  

> +  tree type = TREE_TYPE (MEM_EXPR (orig_dest));


Declare elt_mode here as well?

> +  if (TREE_CODE (type) == RECORD_TYPE

> +      && rs6000_discover_homogeneous_aggregate (TYPE_MODE (type), type, NULL,

> +						NULL))

> +    {

> +      tree field_type = TREE_TYPE (first_field (type));

> +      if (field_type && TREE_CODE (field_type) == ARRAY_TYPE

> +	  && TREE_CODE (TREE_TYPE (field_type)) == REAL_TYPE)

> +	elt_mode = TYPE_MODE (TREE_TYPE (field_type));

> +    }


Homogeneous aggregates only exist in the ELFv2 ABI, while the problem
here is the SP float things.  You also noticed (elsewhere) that if the
struct contains (say) SI, SF, SI, SF, then this does not help.

Is there some better condition this could use, and maybe an expansion
that works in more cases as well?

And, it would be lovely if generic code could expand to something better
already (not expand to a block move at all, certainly not for something
as tiny as this).


Segher
Kewen.Lin via Gcc-patches June 2, 2020, 8:56 p.m. | #4
On Tue, Jun 2, 2020 at 4:32 PM Segher Boessenkool
<segher@kernel.crashing.org> wrote:
>

> Hi Xiong Hu,

>

> On Tue, Jun 02, 2020 at 04:41:50AM -0500, Xionghu Luo wrote:

> > Double array in structure as function arguments or return value is accessed

> > by BLKmode, they are stored to stack and load from stack with redundant

> > conversion from DF->DI->DF.  This patch checks the homogeneous type and

> > use the actual element type to do block move to by pass the conversions.

>

> > @@ -2733,6 +2734,7 @@ expand_block_move (rtx operands[], bool might_overlap)

> >    rtx loads[MAX_MOVE_REG];

> >    rtx stores[MAX_MOVE_REG];

> >    int num_reg = 0;

> > +  machine_mode elt_mode = DImode;

> >

> >    /* If this is not a fixed size move, just call memcpy */

> >    if (! constp)

> > @@ -2750,6 +2752,17 @@ expand_block_move (rtx operands[], bool might_overlap)

> >    if (bytes > rs6000_block_move_inline_limit)

> >      return 0;

> >

> > +  tree type = TREE_TYPE (MEM_EXPR (orig_dest));

>

> Declare elt_mode here as well?

>

> > +  if (TREE_CODE (type) == RECORD_TYPE

> > +      && rs6000_discover_homogeneous_aggregate (TYPE_MODE (type), type, NULL,

> > +                                             NULL))

> > +    {

> > +      tree field_type = TREE_TYPE (first_field (type));

> > +      if (field_type && TREE_CODE (field_type) == ARRAY_TYPE

> > +       && TREE_CODE (TREE_TYPE (field_type)) == REAL_TYPE)

> > +     elt_mode = TYPE_MODE (TREE_TYPE (field_type));

> > +    }

>

> Homogeneous aggregates only exist in the ELFv2 ABI, while the problem

> here is the SP float things.  You also noticed (elsewhere) that if the

> struct contains (say) SI, SF, SI, SF, then this does not help.

>

> Is there some better condition this could use, and maybe an expansion

> that works in more cases as well?

>

> And, it would be lovely if generic code could expand to something better

> already (not expand to a block move at all, certainly not for something

> as tiny as this).


And please don't refer to homogeneous aggregates outside of ELFv2 ABI
code because that will miss an optimization or generate incorrect code
on other PowerPC OSes and ABIs, such as AIX.

Thanks, David
Segher Boessenkool June 2, 2020, 11:18 p.m. | #5
On Tue, Jun 02, 2020 at 04:56:49PM -0400, David Edelsohn wrote:
> > > +  if (TREE_CODE (type) == RECORD_TYPE

> > > +      && rs6000_discover_homogeneous_aggregate (TYPE_MODE (type), type, NULL,

> > > +                                             NULL))

> > > +    {

> > > +      tree field_type = TREE_TYPE (first_field (type));

> > > +      if (field_type && TREE_CODE (field_type) == ARRAY_TYPE

> > > +       && TREE_CODE (TREE_TYPE (field_type)) == REAL_TYPE)

> > > +     elt_mode = TYPE_MODE (TREE_TYPE (field_type));

> > > +    }

> >

> > Homogeneous aggregates only exist in the ELFv2 ABI, while the problem

> > here is the SP float things.  You also noticed (elsewhere) that if the

> > struct contains (say) SI, SF, SI, SF, then this does not help.

> >

> > Is there some better condition this could use, and maybe an expansion

> > that works in more cases as well?

> >

> > And, it would be lovely if generic code could expand to something better

> > already (not expand to a block move at all, certainly not for something

> > as tiny as this).

> 

> And please don't refer to homogeneous aggregates outside of ELFv2 ABI

> code because that will miss an optimization or generate incorrect code

> other PowerPC OSes and ABIs, such as AIX.


Yes, rs6000_discover_homogeneous_aggregate always returns false if some
other ABI is in use, which means for this particular code that the
problem isn't solved for those other ABIs at all.


Segher
Kewen.Lin via Gcc-patches June 8, 2020, 6:22 a.m. | #6
Hi,

On 2020/6/3 04:32, Segher Boessenkool wrote:
> Hi Xiong Hu,

> 

> On Tue, Jun 02, 2020 at 04:41:50AM -0500, Xionghu Luo wrote:

>> Double array in structure as function arguments or return value is accessed

>> by BLKmode, they are stored to stack and load from stack with redundant

>> conversion from DF->DI->DF.  This patch checks the homogeneous type and

>> use the actual element type to do block move to by pass the conversions.

> 

>> @@ -2733,6 +2734,7 @@ expand_block_move (rtx operands[], bool might_overlap)

>>     rtx loads[MAX_MOVE_REG];

>>     rtx stores[MAX_MOVE_REG];

>>     int num_reg = 0;

>> +  machine_mode elt_mode = DImode;

>>   

>>     /* If this is not a fixed size move, just call memcpy */

>>     if (! constp)

>> @@ -2750,6 +2752,17 @@ expand_block_move (rtx operands[], bool might_overlap)

>>     if (bytes > rs6000_block_move_inline_limit)

>>       return 0;

>>   

>> +  tree type = TREE_TYPE (MEM_EXPR (orig_dest));

> 

> Declare elt_mode here as well?

> 

>> +  if (TREE_CODE (type) == RECORD_TYPE

>> +      && rs6000_discover_homogeneous_aggregate (TYPE_MODE (type), type, NULL,

>> +						NULL))

>> +    {

>> +      tree field_type = TREE_TYPE (first_field (type));

>> +      if (field_type && TREE_CODE (field_type) == ARRAY_TYPE

>> +	  && TREE_CODE (TREE_TYPE (field_type)) == REAL_TYPE)

>> +	elt_mode = TYPE_MODE (TREE_TYPE (field_type));

>> +    }

> 

> Homogeneous aggregates only exist in the ELFv2 ABI, while the problem

> here is the SP float things.  You also noticed (elsewhere) that if the

> struct contains (say) SI, SF, SI, SF, then this does not help.

> 

> Is there some better condition this could use, and maybe an expansion

> that works in more cases as well?

> 

> And, it would be lovely if generic code could expand to something better

> already (not expand to a block move at all, certainly not for something

> as tiny as this).


PR65421 concerns an array in a structure: the assignment is a plain
struct-to-struct copy and will not be split by SROA into element assignments,
unlike a struct that contains <SF, SF> or <SI, SF>.

pr65421.c.236t.optimized:
foo (const struct A a)
{
  struct A D.2909;

  <bb 2> [local count: 1073741824]:
  D.2909 = a;      // struct to struct.
  return D.2909;
}

pr69143.c.234t.optimized:
blah1 (struct foo1 a)
{
  struct foo1 D.2909;
  float _1;
  float _2;

  <bb 2> [local count: 1073741824]:
  _1 = a.y;
  _2 = a.x;
  D.2909.x = _1;    // element to element.
  D.2909.y = _2;    // element to element.
  return D.2909;
}

So the expander will choose different paths to expand them...

For pr65421, the arguments and return value are accessed in BLKmode after gimplification;
since there is no IPA pass, this never changes between the gimple passes and expand.

In the expander, the type conversion only happens in expand_assignment of "D.2909 = a;"
(arguments assigned to a local variable, stack to stack, generated by expand_block_move;
see insns #13~#20 below).  expand_function_start (insns #2~#9) already handles each element
in DF mode, so the DF to DI conversion in insns #13~#20 causes the later RTL passes to fail to
do forward propagation in 246r.fwprop1.  So my patch tries to use the actual element type for
an array in a structure here.  If rs6000_discover_homogeneous_aggregate is not allowed to be
used here, how about exposing and calling rs6000_aggregate_candidate directly?  It is not clear
to me why "Homogeneous aggregates only exist in the ELFv2 ABI", since a double array in a
structure is a common usage?

pr65421.c.238r.expand:
    1: NOTE_INSN_DELETED
   11: NOTE_INSN_BASIC_BLOCK 2
    2: r121:DF=%1:DF
    3: r122:DF=%2:DF
    4: r123:DF=%3:DF
    5: r124:DF=%4:DF
    6: [r112:DI+0x20]=r121:DF
    7: [r112:DI+0x28]=r122:DF
    8: [r112:DI+0x30]=r123:DF
    9: [r112:DI+0x38]=r124:DF
   10: NOTE_INSN_FUNCTION_BEG
   13: r125:DI=[r112:DI+0x20]
   15: r126:DI=[r112:DI+0x28]
   17: r127:DI=[r112:DI+0x30]
   19: r128:DI=[r112:DI+0x38]
   14: [r112:DI]=r125:DI
   16: [r112:DI+0x8]=r126:DI
   18: [r112:DI+0x10]=r127:DI
   20: [r112:DI+0x18]=r128:DI
   21: r129:DF=[r112:DI]
   22: r130:DF=[r112:DI+0x8]
   23: r131:DF=[r112:DI+0x10]
   24: r132:DF=[r112:DI+0x18]
   25: r117:DF=r129:DF
   26: r118:DF=r130:DF
   27: r119:DF=r131:DF
   28: r120:DF=r132:DF
   32: %1:DF=r117:DF
   33: %2:DF=r118:DF
   34: %3:DF=r119:DF
   35: %4:DF=r120:DF
   36: use %1:DF
   37: use %2:DF
   38: use %3:DF
   39: use %4:DF

Bypassing the block move requires a very generic code change: BLKmode is determined very early
in gimple, so removing BLKmode looks like a huge project across stor-layout.c, function.c, expr.c,
etc., and I am not sure other targets would like it.  ARM64 uses an OImode register to avoid
BLKmode/stack operations, while X86 expands to two TImode register assignments and returns the
result through a pointer.

Or do you mean some workaround that does not call emit_block_move, and so does not fall into
expand_block_move in rs6000-string.c, when expanding the assignment "D.2909 = a;" below?
rtx
store_expr (tree exp, rtx target, int call_param_p,
	    bool nontemporal, bool reverse)
{
...
      else if (GET_CODE (temp) == PARALLEL)
	emit_group_store (target, temp, TREE_TYPE (exp),
			  int_size_in_bytes (TREE_TYPE (exp)));
      else if (GET_MODE (temp) == BLKmode)
	emit_block_move (target, temp, expr_size (exp),
			 (call_param_p
			  ? BLOCK_OP_CALL_PARM : BLOCK_OP_NORMAL))
...
}


Thanks,
Xionghu

> 

> 

> Segher

>
Segher Boessenkool June 9, 2020, 1:42 a.m. | #7
Hi!

On Mon, Jun 08, 2020 at 02:22:23PM +0800, luoxhu wrote:
> On 2020/6/3 04:32, Segher Boessenkool wrote:

> > On Tue, Jun 02, 2020 at 04:41:50AM -0500, Xionghu Luo wrote:

> >> +  if (TREE_CODE (type) == RECORD_TYPE

> >> +      && rs6000_discover_homogeneous_aggregate (TYPE_MODE (type), type, NULL,

> >> +						NULL))

> >> +    {

> >> +      tree field_type = TREE_TYPE (first_field (type));

> >> +      if (field_type && TREE_CODE (field_type) == ARRAY_TYPE

> >> +	  && TREE_CODE (TREE_TYPE (field_type)) == REAL_TYPE)

> >> +	elt_mode = TYPE_MODE (TREE_TYPE (field_type));

> >> +    }

> > 

> > Homogeneous aggregates only exist in the ELFv2 ABI, while the problem

> > here is the SP float things.  You also noticed (elsewhere) that if the

> > struct contains (say) SI, SF, SI, SF, then this does not help.

> > 

> > Is there some better condition this could use, and maybe an expansion

> > that works in more cases as well?

> > 

> > And, it would be lovely if generic code could expand to something better

> > already (not expand to a block move at all, certainly not for something

> > as tiny as this).

> 

> This pr65421 is array in structure, the assignment is just struct to struct

> and won't be split by SROA to element assignment like struct contains <SF, SF>

> or <SI, SF>.


Yes, but it is a very small, fixed-length array (4 SP floats in PR65421).
So what we do currently is not very good.

> For pr65421, the arguments and return value are accessed by BLKmode after gimplify,

> since there is no IPA pass, it is never changed from pass gimple to expand.


And that is a problem (not only on Power, but everywhere that likes
registers more than it likes memory...  basically everywhere).

> In expander, the type conversion only happens on expand_assignment of "D.2909 = a;"

> (arguments assigned to local variable, stack to stack, generated by expand_block_move,

> insn #13~#20 as followed), the expand_function_start(insn #2~#9) load each element type

> to be DF already, DF to DI conversion in insn #13~#20 cause the later RTL passes fail to

> do forward propagation in 246r.fwprop1.  So my patch tries to use the actual type for

> array in structure here.  If rs6000_discover_homogeneous_aggregate is not allowed to be

> used here, how about expose and call rs6000_aggregate_candidate directly?


I think it would be best if generic code did not force this into memory
at all.

> Not clear why

> "Homogeneous aggregates only exist in the ELFv2 ABI" since double array in structure is a

> common usage?


Homogeneous arguments are a concept from the ELFv2 ABI.  This macro
returns false if not
  (TARGET_HARD_FLOAT
   && DEFAULT_ABI == ABI_ELFv2
   && type && AGGREGATE_TYPE_P (type))

On other ABIs we have the exact same problem, so a good solution would
fix it for all such ABIs (but even for all targets if possible).

> To bypass block move requires very generic code change,


Yes.

> the BLK mode is determined very early

> in gimple, remove BLKmode seems huge project in stor-layout.c\function.c\expr.c etc. and not sure

> other targets like it, the ARM64 use OImode register to avoid BLKmode/stack operations, while

> X86 expand to two TImode register assignment and pointer result return.


To get good code they will have to change it to the proper access mode
in some later pass.

> Or do you mean some workaround that don't call emit_block_move to fall in expand_block_move in

> rs6000-string.c when expand_assignment of "D.2909 = a;" below?

> rtx

> store_expr (tree exp, rtx target, int call_param_p,

> 	    bool nontemporal, bool reverse)

> {

> ...

>       else if (GET_CODE (temp) == PARALLEL)

> 	emit_group_store (target, temp, TREE_TYPE (exp),

> 			  int_size_in_bytes (TREE_TYPE (exp)));

>       else if (GET_MODE (temp) == BLKmode)

> 	emit_block_move (target, temp, expr_size (exp),

> 			 (call_param_p

> 			  ? BLOCK_OP_CALL_PARM : BLOCK_OP_NORMAL))

> ...

> }


We can certainly do workarounds in the target code if the generic code
cannot be fixed, or other targets are against this for some reason --
but let's try to do the proper thing first?

Thanks,


Segher

Patch

diff --git a/gcc/config/rs6000/rs6000-string.c b/gcc/config/rs6000/rs6000-string.c
index fe7177f10fd..ea217840d88 100644
--- a/gcc/config/rs6000/rs6000-string.c
+++ b/gcc/config/rs6000/rs6000-string.c
@@ -37,6 +37,7 @@ 
 #include "target.h"
 #include "profile-count.h"
 #include "predict.h"
+#include "rs6000-internal.h"
 
 /* Expand a block clear operation, and return 1 if successful.  Return 0
    if we should let the compiler generate normal code.
@@ -2733,6 +2734,7 @@  expand_block_move (rtx operands[], bool might_overlap)
   rtx loads[MAX_MOVE_REG];
   rtx stores[MAX_MOVE_REG];
   int num_reg = 0;
+  machine_mode elt_mode = DImode;
 
   /* If this is not a fixed size move, just call memcpy */
   if (! constp)
@@ -2750,6 +2752,17 @@  expand_block_move (rtx operands[], bool might_overlap)
   if (bytes > rs6000_block_move_inline_limit)
     return 0;
 
+  tree type = TREE_TYPE (MEM_EXPR (orig_dest));
+  if (TREE_CODE (type) == RECORD_TYPE
+      && rs6000_discover_homogeneous_aggregate (TYPE_MODE (type), type, NULL,
+						NULL))
+    {
+      tree field_type = TREE_TYPE (first_field (type));
+      if (field_type && TREE_CODE (field_type) == ARRAY_TYPE
+	  && TREE_CODE (TREE_TYPE (field_type)) == REAL_TYPE)
+	elt_mode = TYPE_MODE (TREE_TYPE (field_type));
+    }
+
   for (offset = 0; bytes > 0; offset += move_bytes, bytes -= move_bytes)
     {
       union {
@@ -2771,7 +2784,7 @@  expand_block_move (rtx operands[], bool might_overlap)
 	       && (align >= 64 || !STRICT_ALIGNMENT))
 	{
 	  move_bytes = 8;
-	  mode = DImode;
+	  mode = elt_mode;
 	  gen_func.mov = gen_movdi;
 	  if (offset == 0 && align < 64)
 	    {
diff --git a/gcc/testsuite/gcc.target/powerpc/pr65421.c b/gcc/testsuite/gcc.target/powerpc/pr65421.c
new file mode 100644
index 00000000000..ec8f4824de5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr65421.c
@@ -0,0 +1,17 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+typedef struct
+{
+  double a[4];
+} A;
+
+A
+foo (const A *a)
+{
+  return *a;
+}
+
+/* { dg-final { scan-assembler-not       {\mld\M}    } } */
+/* { dg-final { scan-assembler-not       {\mstd\M}   } } */
+/* { dg-final { scan-assembler-times     {\mlfd\M}  4 } } */