[09/10] Experiment with using optinfo in gimple-loop-interchange.cc

Message ID 1527626483-4723-10-git-send-email-dmalcolm@redhat.com
State New
Headers show
Series
  • RFC: Prototype of compiler-assisted performance analysis
Related show

Commit Message

David Malcolm May 29, 2018, 8:41 p.m.
This was an experiment to try to capture information on a
loop optimization.

gcc/ChangeLog:
	* gimple-loop-interchange.cc (should_interchange_loops): Add
	optinfo note when interchange gives better data locality behavior.
	(tree_loop_interchange::interchange): Add OPTINFO_SCOPE.
	Add optinfo for successful and unsuccessful interchanges.
	(prepare_perfect_loop_nest): Add OPTINFO_SCOPE.  Add
	optinfo note.
	(pass_linterchange::execute): Add OPTINFO_SCOPE.
---
 gcc/gimple-loop-interchange.cc | 36 +++++++++++++++++++++++++++++++++++-
 1 file changed, 35 insertions(+), 1 deletion(-)

-- 
1.8.5.3

Comments

Richard Biener June 1, 2018, 9:50 a.m. | #1
On Tue, May 29, 2018 at 10:33 PM David Malcolm <dmalcolm@redhat.com> wrote:
>

> This was an experiment to try to capture information on a

> loop optimization.

>

> gcc/ChangeLog:

>         * gimple-loop-interchange.cc (should_interchange_loops): Add

>         optinfo note when interchange gives better data locality behavior.

>         (tree_loop_interchange::interchange): Add OPTINFO_SCOPE.

>         Add optinfo for successful and unsuccessful interchanges.

>         (prepare_perfect_loop_nest): Add OPTINFO_SCOPE.  Add

>         optinfo note.

>         (pass_linterchange::execute): Add OPTINFO_SCOPE.

> ---

>  gcc/gimple-loop-interchange.cc | 36 +++++++++++++++++++++++++++++++++++-

>  1 file changed, 35 insertions(+), 1 deletion(-)

>

> diff --git a/gcc/gimple-loop-interchange.cc b/gcc/gimple-loop-interchange.cc

> index eb35263..cd32288 100644

> --- a/gcc/gimple-loop-interchange.cc

> +++ b/gcc/gimple-loop-interchange.cc

> @@ -1556,7 +1556,19 @@ should_interchange_loops (unsigned i_idx, unsigned o_idx,

>    ratio = innermost_loops_p ? INNER_STRIDE_RATIO : OUTER_STRIDE_RATIO;

>    /* Do interchange if it gives better data locality behavior.  */

>    if (wi::gtu_p (iloop_strides, wi::mul (oloop_strides, ratio)))

> -    return true;

> +    {

> +      if (optinfo_enabled_p ())

> +       OPTINFO_NOTE ((gimple *)NULL) // FIXME

> +         << "interchange gives better data locality behavior: "

> +         << "iloop_strides: "

> +         << decu (iloop_strides)

> +         << " > (oloop_strides: "

> +         << decu (oloop_strides)

> +         << " * ratio: "

> +         << decu (ratio)

> +         << ")";


Just randomly inside the thread.

NOOOOOOOOOO!

:/

Please do _not_ add more stream-like APIs.  How do you expect
translators to deal with those?

Yes, I'm aware of the graphite-* ones and I dislike those very much.

What's wrong with the existing dump API?

Richard.

> +      return true;

> +    }

>    if (wi::gtu_p (iloop_strides, oloop_strides))

>      {

>        /* Or it creates more invariant memory references.  */

> @@ -1578,6 +1590,8 @@ bool

>  tree_loop_interchange::interchange (vec<data_reference_p> datarefs,

>                                     vec<ddr_p> ddrs)

>  {

> +  OPTINFO_SCOPE ("tree_loop_interchange::interchange", m_loop_nest[0]);

> +

>    location_t loc = find_loop_location (m_loop_nest[0]);

>    bool changed_p = false;

>    /* In each iteration we try to interchange I-th loop with (I+1)-th loop.

> @@ -1628,6 +1642,10 @@ tree_loop_interchange::interchange (vec<data_reference_p> datarefs,

>             fprintf (dump_file,

>                      "Loop_pair<outer:%d, inner:%d> is interchanged\n\n",

>                      oloop.m_loop->num, iloop.m_loop->num);

> +         if (optinfo_enabled_p ())

> +           OPTINFO_SUCCESS (oloop.m_loop)

> +             << optinfo_printf ("Loop_pair<outer:%d, inner:%d> is interchanged",

> +                                oloop.m_loop->num, iloop.m_loop->num);

>

>           changed_p = true;

>           interchange_loops (iloop, oloop);

> @@ -1641,6 +1659,10 @@ tree_loop_interchange::interchange (vec<data_reference_p> datarefs,

>             fprintf (dump_file,

>                      "Loop_pair<outer:%d, inner:%d> is not interchanged\n\n",

>                      oloop.m_loop->num, iloop.m_loop->num);

> +         if (optinfo_enabled_p ())

> +           OPTINFO_FAILURE (oloop.m_loop)

> +             << optinfo_printf ("Loop_pair<outer:%d, inner:%d> is not interchanged",

> +                                oloop.m_loop->num, iloop.m_loop->num);

>         }

>      }

>    simple_dce_from_worklist (m_dce_seeds);

> @@ -1648,6 +1670,9 @@ tree_loop_interchange::interchange (vec<data_reference_p> datarefs,

>    if (changed_p)

>      dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc,

>                      "loops interchanged in loop nest\n");

> +  if (optinfo_enabled_p ())

> +    OPTINFO_SUCCESS (m_loop_nest[0])

> +      << "loops interchanged in loop nest";

>

>    return changed_p;

>  }

> @@ -1971,6 +1996,8 @@ static bool

>  prepare_perfect_loop_nest (struct loop *loop, vec<loop_p> *loop_nest,

>                            vec<data_reference_p> *datarefs, vec<ddr_p> *ddrs)

>  {

> +  OPTINFO_SCOPE ("prepare_perfect_loop_nest", loop);

> +

>    struct loop *start_loop = NULL, *innermost = loop;

>    struct loop *outermost = loops_for_fn (cfun)->tree_root;

>

> @@ -2029,6 +2056,12 @@ prepare_perfect_loop_nest (struct loop *loop, vec<loop_p> *loop_nest,

>           fprintf (dump_file,

>                    "\nConsider loop interchange for loop_nest<%d - %d>\n",

>                    start_loop->num, innermost->num);

> +       if (optinfo_enabled_p ())

> +         {

> +           OPTINFO_NOTE (start_loop)

> +             << optinfo_printf ("consider loop interchange for loop_nest<%d - %d>",

> +                                start_loop->num, innermost->num);

> +         }

>

>         if (loop != start_loop)

>           prune_access_strides_not_in_loop (start_loop, innermost, *datarefs);

> @@ -2061,6 +2094,7 @@ pass_linterchange::execute (function *fun)

>    struct loop *loop;

>    FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST)

>      {

> +      OPTINFO_SCOPE ("considering loop for interchange", loop);

>        vec<loop_p> loop_nest = vNULL;

>        vec<data_reference_p> datarefs = vNULL;

>        vec<ddr_p> ddrs = vNULL;

> --

> 1.8.5.3

>
David Malcolm June 1, 2018, 1:40 p.m. | #2
On Fri, 2018-06-01 at 11:50 +0200, Richard Biener wrote:
> On Tue, May 29, 2018 at 10:33 PM David Malcolm <dmalcolm@redhat.com>

> wrote:

> > 

> > This was an experiment to try to capture information on a

> > loop optimization.

> > 

> > gcc/ChangeLog:

> >         * gimple-loop-interchange.cc (should_interchange_loops):

> > Add

> >         optinfo note when interchange gives better data locality

> > behavior.

> >         (tree_loop_interchange::interchange): Add OPTINFO_SCOPE.

> >         Add optinfo for successful and unsuccessful interchanges.

> >         (prepare_perfect_loop_nest): Add OPTINFO_SCOPE.  Add

> >         optinfo note.

> >         (pass_linterchange::execute): Add OPTINFO_SCOPE.

> > ---

> >  gcc/gimple-loop-interchange.cc | 36

> > +++++++++++++++++++++++++++++++++++-

> >  1 file changed, 35 insertions(+), 1 deletion(-)

> > 

> > diff --git a/gcc/gimple-loop-interchange.cc b/gcc/gimple-loop-

> > interchange.cc

> > index eb35263..cd32288 100644

> > --- a/gcc/gimple-loop-interchange.cc

> > +++ b/gcc/gimple-loop-interchange.cc

> > @@ -1556,7 +1556,19 @@ should_interchange_loops (unsigned i_idx,

> > unsigned o_idx,

> >    ratio = innermost_loops_p ? INNER_STRIDE_RATIO :

> > OUTER_STRIDE_RATIO;

> >    /* Do interchange if it gives better data locality behavior.  */

> >    if (wi::gtu_p (iloop_strides, wi::mul (oloop_strides, ratio)))

> > -    return true;

> > +    {

> > +      if (optinfo_enabled_p ())

> > +       OPTINFO_NOTE ((gimple *)NULL) // FIXME

> > +         << "interchange gives better data locality behavior: "

> > +         << "iloop_strides: "

> > +         << decu (iloop_strides)

> > +         << " > (oloop_strides: "

> > +         << decu (oloop_strides)

> > +         << " * ratio: "

> > +         << decu (ratio)

> > +         << ")";

> 

> Just randomly inside the thread.

> 

> NOOOOOOOOOO!

> 

> :/


> Please do _not_ add more stream-like APIs.  How do you expect

> translators to deal with those?

> 

> Yes, I'm aware of the graphite-* ones and I dislike those very much.

> 

> What's wrong with the existing dump API?


The existing API suffers from a "wall of text" problem:

* although it's possible to filter based on various criteria (the
optgroup tags, specific passes, success vs failure), it's not possible
to filter base on code hotness: the -fopt-info API works purely in
terms of location_t.  So all of the messages about the hottest
functions in the workload are intermingled with all of the other
messages about all of the other functions.

* some of the text notes refer to function entry, but all of these are
emitted "at the same level": there's no way to see the nesting of these
function-entry logs, and where other notes are in relation to them. 
For example, in:

  test.c:8:3: note: === analyzing loop ===
  test.c:8:3: note: === analyze_loop_nest ===
  test.c:8:3: note: === vect_analyze_loop_form ===
  test.c:8:3: note: === get_loop_niters ===
  test.c:8:3: note: symbolic number of iterations is (unsigned int) n_9(D)
  test.c:8:3: note: not vectorized: loop contains function calls or data references that cannot be analyzed
  test.c:8:3: note: vectorized 0 loops in function

there's no way to tell that the "vect_analyze_loop_form" is in fact
inside the call to "analyze_loop_nest", and where the "symbolic number
of iterations" messages is coming from in relation to them.  This may
not seem significant here, but I'm quoting a small example;
vectorization typically leads to dozens of messages, with a deep
nesting structure (where that structure isn't visible in the -fopt-info 
output).

The existing API is throwing data away:

* as noted above, by working purely with a location_t, the execution
count isn't associated with the messages.  The output format purely
gives file/line/column information, but doesn't cover the inlining
chain.   For C++ templates it doesn't specify which instance of a
template is being optimized.

* there's no metadata about where the messages are coming from.  It's
easy to get at the current pass internally, but the messages don't
capture that.  Figuring out where a message came from requires grepping
the GCC source code.  The prototype I posted captures the __FILE__ and
__LINE__ within the gcc source for every message emitted, and which
pass instance emitted it.

* The current output format is of the form:
     "FILE:LINE:COLUMN: free-form text\n"
This is only machine-readable up to a point: if a program is parsing
it, all it has is the free-form text.  The prototype I posted captures
what kinds of things are in the text (statements, trees, symtab_nodes),
and captures location information for them, so that visualizations of
the dumps can provide useful links.

There's no API-level grouping of messages, beyond looking for newline
characters.

I'm probably re-hashing a lot of the material in the cover letter here:
"[PATCH 00/10] RFC: Prototype of compiler-assisted performance analysis"
  https://gcc.gnu.org/ml/gcc-patches/2018-05/msg01675.html


I'd like to provide a machine-readable output format that covers the
above - in particular the profile data (whilst retaining -fopt-info for
compatibility).  Separation of the data from its presentation.

Clearly you don't like the stream-like API from the prototype :)

So I'm wondering what the API ought to look like, one that would allow
for the kind of "richer" machine-readable output.

Consider this "-fopt-info" code (from "vect_create_data_ref_ptr"; this
is example 2 from the cover-letter):

  if (dump_enabled_p ())
     {
       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
       dump_printf_loc (MSG_NOTE, vect_location,
                        "create %s-pointer variable to type: ",
                        get_tree_code_name (TREE_CODE (aggr_type)));
       dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
         dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
       else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
         dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
       else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
         dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
       else
         dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
       dump_printf (MSG_NOTE, "\n");
     }

where the information is built up piecewise, with conditional logic.
(This existing code isn't particularly amenable to translation either).

One option would be for "vect_location" to become something other than
a location_t, from which a execution count can be gained (e.g. a gimple
*, from which the location_t and the BB can be accessed).  Then
"dump_printf_loc" would signify starting an optimization record, and
the final "\n" would signify the end of an optimization record; the
various dump_generic_expr and dump_printf would add structured
information and text entries to theoptimization record.

This has the advantage of retaining the look of the existing API (so
the existing callsites don't need changing), but it changes their
meaning, so that as well as writing to -fopt-info's destinations, it's
also in a sense parsing the dump_* calls and building optimization
records from them.

AIUI, dump_printf_loc can't be a macro, as it's variadic, so this
approach doesn't allow for capturing the location within GCC for where
the message is emitted.

Another approach: there could be something like:

  if (optinfo_enabled_p ())
    {
       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
       OPTINFO_VECT_NOTE note;
       note.printf ("create %s-pointer variable to type: ",
                    get_tree_code_name (TREE_CODE (aggr_type)));
       note.add_slim (aggr_type);
       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
         note.printf ("  vectorizing an array ref: ");
       else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
         note.printf (MSG_NOTE, "  vectorizing a vector ref: ");
       else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
         note.printf (MSG_NOTE, "  vectorizing a record based array ref: ");
       else
         note.printf (MSG_NOTE, "  vectorizing a pointer ref: ");
       note.add_slim (DR_BASE_OBJECT (dr));
    }

which uses a macro to get the gcc source location, and avoids the
special meaning for "\n".  This avoids the "<<" - but is kind of just
different syntax for the same - it's hard with this example to avoid
it.

Yet another approach: reworking things to support i18n via a pure
printf-style interface, using, say "%S" to mean "TDF_SLIM", it could be
like:

  if (optinfo_enabled_p ())
    {
       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
         OPTINFO_VECT_NOTE ("create %s-pointer variable to type: %S"
                            "  vectorizing an array ref: %S",
                            get_tree_code_name (TREE_CODE (aggr_type))
	                    aggr_type,
                            DR_BASE_OBJECT (dr));
      else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
         OPTINFO_VECT_NOTE ("create %s-pointer variable to type: %S"
                            "  vectorizing a vector ref: %S",
                            get_tree_code_name (TREE_CODE (aggr_type))
	                    aggr_type,
                            DR_BASE_OBJECT (dr));
      else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
         OPTINFO_VECT_NOTE ("create %s-pointer variable to type: %S"
                            "  vectorizing a record based array ref: %S",
                            get_tree_code_name (TREE_CODE (aggr_type))
	                    aggr_type,
                            DR_BASE_OBJECT (dr));
      else
         OPTINFO_VECT_NOTE ("create %s-pointer variable to type: %S"
                            "  vectorizing a pointer ref: %S",
                            get_tree_code_name (TREE_CODE (aggr_type))
	                    aggr_type,
                            DR_BASE_OBJECT (dr));
    }

or somesuch.  The %S would allow for the data to be captured when
emitting optimization records for the messages (not just plain text,
e.g. locations of the expressions).

So those are some ideas.  I'm sure there are other ways to fix the
issues with -fopt-info; I'm brainstorming here.

[...snip...]

Thoughts?

Thanks
Dave
Richard Biener June 1, 2018, 3:31 p.m. | #3
On June 1, 2018 3:40:15 PM GMT+02:00, David Malcolm <dmalcolm@redhat.com> wrote:
>On Fri, 2018-06-01 at 11:50 +0200, Richard Biener wrote:

>> On Tue, May 29, 2018 at 10:33 PM David Malcolm <dmalcolm@redhat.com>

>> wrote:

>> > 

>> > This was an experiment to try to capture information on a

>> > loop optimization.

>> > 

>> > gcc/ChangeLog:

>> >         * gimple-loop-interchange.cc (should_interchange_loops):

>> > Add

>> >         optinfo note when interchange gives better data locality

>> > behavior.

>> >         (tree_loop_interchange::interchange): Add OPTINFO_SCOPE.

>> >         Add optinfo for successful and unsuccessful interchanges.

>> >         (prepare_perfect_loop_nest): Add OPTINFO_SCOPE.  Add

>> >         optinfo note.

>> >         (pass_linterchange::execute): Add OPTINFO_SCOPE.

>> > ---

>> >  gcc/gimple-loop-interchange.cc | 36

>> > +++++++++++++++++++++++++++++++++++-

>> >  1 file changed, 35 insertions(+), 1 deletion(-)

>> > 

>> > diff --git a/gcc/gimple-loop-interchange.cc b/gcc/gimple-loop-

>> > interchange.cc

>> > index eb35263..cd32288 100644

>> > --- a/gcc/gimple-loop-interchange.cc

>> > +++ b/gcc/gimple-loop-interchange.cc

>> > @@ -1556,7 +1556,19 @@ should_interchange_loops (unsigned i_idx,

>> > unsigned o_idx,

>> >    ratio = innermost_loops_p ? INNER_STRIDE_RATIO :

>> > OUTER_STRIDE_RATIO;

>> >    /* Do interchange if it gives better data locality behavior.  */

>> >    if (wi::gtu_p (iloop_strides, wi::mul (oloop_strides, ratio)))

>> > -    return true;

>> > +    {

>> > +      if (optinfo_enabled_p ())

>> > +       OPTINFO_NOTE ((gimple *)NULL) // FIXME

>> > +         << "interchange gives better data locality behavior: "

>> > +         << "iloop_strides: "

>> > +         << decu (iloop_strides)

>> > +         << " > (oloop_strides: "

>> > +         << decu (oloop_strides)

>> > +         << " * ratio: "

>> > +         << decu (ratio)

>> > +         << ")";

>> 

>> Just randomly inside the thread.

>> 

>> NOOOOOOOOOO!

>> 

>> :/

>

>> Please do _not_ add more stream-like APIs.  How do you expect

>> translators to deal with those?

>> 

>> Yes, I'm aware of the graphite-* ones and I dislike those very much.

>> 

>> What's wrong with the existing dump API?

>

>The existing API suffers from a "wall of text" problem:

>

>* although it's possible to filter based on various criteria (the

>optgroup tags, specific passes, success vs failure), it's not possible

>to filter base on code hotness: the -fopt-info API works purely in

>terms of location_t.  So all of the messages about the hottest

>functions in the workload are intermingled with all of the other

>messages about all of the other functions.


True

>* some of the text notes refer to function entry, but all of these are

>emitted "at the same level": there's no way to see the nesting of these

>function-entry logs, and where other notes are in relation to them. 

>For example, in:

>

>  test.c:8:3: note: === analyzing loop ===

>  test.c:8:3: note: === analyze_loop_nest ===

>  test.c:8:3: note: === vect_analyze_loop_form ===

>  test.c:8:3: note: === get_loop_niters ===

>test.c:8:3: note: symbolic number of iterations is (unsigned int)

>n_9(D)

>test.c:8:3: note: not vectorized: loop contains function calls or data

>references that cannot be analyzed

>  test.c:8:3: note: vectorized 0 loops in function

>

>there's no way to tell that the "vect_analyze_loop_form" is in fact

>inside the call to "analyze_loop_nest", and where the "symbolic number

>of iterations" messages is coming from in relation to them.  This may

>not seem significant here, but I'm quoting a small example;

>vectorization typically leads to dozens of messages, with a deep

>nesting structure (where that structure isn't visible in the -fopt-info

>

>output).


True. The same issue exists for diagnostics BTW. Indeed, being able to collapse 'sections' in dump files, opt-info or diagnostics sounds useful. 

Note that for dump files and opt-info the level argument was sort of designed to do that. 

>

>The existing API is throwing data away:

>

>* as noted above, by working purely with a location_t, the execution

>count isn't associated with the messages.  The output format purely

>gives file/line/column information, but doesn't cover the inlining

>chain.   For C++ templates it doesn't specify which instance of a

>template is being optimized.


It might be useful to enhance the interface by allowing different kind of 'locations'. 

>* there's no metadata about where the messages are coming from.  It's

>easy to get at the current pass internally, but the messages don't

>capture that.  Figuring out where a message came from requires grepping

>the GCC source code.  The prototype I posted captures the __FILE__ and

>__LINE__ within the gcc source for every message emitted, and which

>pass instance emitted it.


The opt info group was supposed to captures this to the level interesting for a user. 

>* The current output format is of the form:

>     "FILE:LINE:COLUMN: free-form text\n"

>This is only machine-readable up to a point: if a program is parsing

>it, all it has is the free-form text.  The prototype I posted captures

>what kinds of things are in the text (statements, trees, symtab_nodes),

>and captures location information for them, so that visualizations of

>the dumps can provide useful links.

>

>There's no API-level grouping of messages, beyond looking for newline

>characters.

>

>I'm probably re-hashing a lot of the material in the cover letter here:

>"[PATCH 00/10] RFC: Prototype of compiler-assisted performance

>analysis"

>  https://gcc.gnu.org/ml/gcc-patches/2018-05/msg01675.html

>

>

>I'd like to provide a machine-readable output format that covers the

>above - in particular the profile data (whilst retaining -fopt-info for

>compatibility).  Separation of the data from its presentation.

>

>Clearly you don't like the stream-like API from the prototype :)


Yes :) I wasn't so much complaining about the content but the presentation /API. 

>So I'm wondering what the API ought to look like, one that would allow

>for the kind of "richer" machine-readable output.

>

>Consider this "-fopt-info" code (from "vect_create_data_ref_ptr"; this

>is example 2 from the cover-letter):

>

>  if (dump_enabled_p ())

>     {

>       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));

>       dump_printf_loc (MSG_NOTE, vect_location,

>                        "create %s-pointer variable to type: ",

>                        get_tree_code_name (TREE_CODE (aggr_type)));

>       dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);

>       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)

>         dump_printf (MSG_NOTE, "  vectorizing an array ref: ");

>       else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)

>         dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");

>       else if (TREE_CODE (dr_base_type) == RECORD_TYPE)

>    dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");

>       else

>         dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");

>       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));

>       dump_printf (MSG_NOTE, "\n");

>     }

>

>where the information is built up piecewise, with conditional logic.

>(This existing code isn't particularly amenable to translation either).

>

>One option would be for "vect_location" to become something other than

>a location_t, from which a execution count can be gained (e.g. a gimple

>*, from which the location_t and the BB can be accessed).


Yes. Like a container that can be initiated from other kind of contexts. 

  Then
>"dump_printf_loc" would signify starting an optimization record, and

>the final "\n" would signify the end of an optimization record; the

>various dump_generic_expr and dump_printf would add structured

>information and text entries to theoptimization record.


A push/pop style API would maybe work as well. (pushing a level with some meta data) 

>This has the advantage of retaining the look of the existing API (so

>the existing callsites don't need changing), but it changes their

>meaning, so that as well as writing to -fopt-info's destinations, it's

>also in a sense parsing the dump_* calls and building optimization

>records from them.

>

>AIUI, dump_printf_loc can't be a macro, as it's variadic, so this

>approach doesn't allow for capturing the location within GCC for where

>the message is emitted.


True, though we have __builtin_FILE and friends that can be used as default args. 

Note a push/pop level API can also buffer intermediate printf style output and annotate/indent it (supporting a dump file emacs/vim mode that can do collapse and expand) 

>Another approach: there could be something like:

>

>  if (optinfo_enabled_p ())

>    {

>       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));

>       OPTINFO_VECT_NOTE note;

>       note.printf ("create %s-pointer variable to type: ",

>                    get_tree_code_name (TREE_CODE (aggr_type)));

>       note.add_slim (aggr_type);

>       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)

>         note.printf ("  vectorizing an array ref: ");

>       else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)

>         note.printf (MSG_NOTE, "  vectorizing a vector ref: ");

>       else if (TREE_CODE (dr_base_type) == RECORD_TYPE)

>    note.printf (MSG_NOTE, "  vectorizing a record based array ref: ");

>       else

>         note.printf (MSG_NOTE, "  vectorizing a pointer ref: ");

>       note.add_slim (DR_BASE_OBJECT (dr));

>    }

>

>which uses a macro to get the gcc source location, and avoids the

>special meaning for "\n".  This avoids the "<<" - but is kind of just

>different syntax for the same - it's hard with this example to avoid

>it.


Maybe the following raii style that encapsulates the enabling /disabling checks? 

 If (optinfo o = push (msg_optimization,...))
  {
     O.print (...) ;
     Destructor of o 'pops' 
  } 

>

>Yet another approach: reworking things to support i18n via a pure

>printf-style interface, using, say "%S" to mean "TDF_SLIM", it could be

>like:

>

>  if (optinfo_enabled_p ())

>    {

>       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));

>       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)

>         OPTINFO_VECT_NOTE ("create %s-pointer variable to type: %S"

>                            "  vectorizing an array ref: %S",

>                            get_tree_code_name (TREE_CODE (aggr_type))

>	                    aggr_type,

>                            DR_BASE_OBJECT (dr));

>      else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)

>         OPTINFO_VECT_NOTE ("create %s-pointer variable to type: %S"

>                            "  vectorizing a vector ref: %S",

>                            get_tree_code_name (TREE_CODE (aggr_type))

>	                    aggr_type,

>                            DR_BASE_OBJECT (dr));

>      else if (TREE_CODE (dr_base_type) == RECORD_TYPE)

>         OPTINFO_VECT_NOTE ("create %s-pointer variable to type: %S"

>                          "  vectorizing a record based array ref: %S",

>                            get_tree_code_name (TREE_CODE (aggr_type))

>	                    aggr_type,

>                            DR_BASE_OBJECT (dr));

>      else

>         OPTINFO_VECT_NOTE ("create %s-pointer variable to type: %S"

>                            "  vectorizing a pointer ref: %S",

>                            get_tree_code_name (TREE_CODE (aggr_type))

>	                    aggr_type,

>                            DR_BASE_OBJECT (dr));

>    }

>

>or somesuch.  The %S would allow for the data to be captured when

>emitting optimization records for the messages (not just plain text,

>e.g. locations of the expressions).


Certainly when exposing things in opt-info we have to be more disciplined. The vectorizer is a bad example here. 
The original goal of having one set of outputs for both dump files and opt-info is good but I guess the result is less than optimal. Maybe additional detail levels would help here (MSG_Dumpfile_only?) 

>So those are some ideas.  I'm sure there are other ways to fix the

>issues with -fopt-info; I'm brainstorming here.


Likewise. As said I applaud the attempt improve the situation but I detest a stream API ;) 

Richard. 

>[...snip...]

>

>Thoughts?

>

>Thanks

>Dave
David Malcolm June 1, 2018, 10:22 p.m. | #4
On Fri, 2018-06-01 at 17:31 +0200, Richard Biener wrote:
> On June 1, 2018 3:40:15 PM GMT+02:00, David Malcolm <dmalcolm@redhat.

> com> wrote:

> > On Fri, 2018-06-01 at 11:50 +0200, Richard Biener wrote:

> > > On Tue, May 29, 2018 at 10:33 PM David Malcolm <dmalcolm@redhat.c

> > > om>

> > > wrote:

> > > > 

> > > > This was an experiment to try to capture information on a

> > > > loop optimization.

> > > > 

> > > > gcc/ChangeLog:

> > > >         * gimple-loop-interchange.cc

> > > > (should_interchange_loops):

> > > > Add

> > > >         optinfo note when interchange gives better data

> > > > locality

> > > > behavior.

> > > >         (tree_loop_interchange::interchange): Add

> > > > OPTINFO_SCOPE.

> > > >         Add optinfo for successful and unsuccessful

> > > > interchanges.

> > > >         (prepare_perfect_loop_nest): Add OPTINFO_SCOPE.  Add

> > > >         optinfo note.

> > > >         (pass_linterchange::execute): Add OPTINFO_SCOPE.

> > > > ---

> > > >  gcc/gimple-loop-interchange.cc | 36

> > > > +++++++++++++++++++++++++++++++++++-

> > > >  1 file changed, 35 insertions(+), 1 deletion(-)

> > > > 

> > > > diff --git a/gcc/gimple-loop-interchange.cc b/gcc/gimple-loop-

> > > > interchange.cc

> > > > index eb35263..cd32288 100644

> > > > --- a/gcc/gimple-loop-interchange.cc

> > > > +++ b/gcc/gimple-loop-interchange.cc

> > > > @@ -1556,7 +1556,19 @@ should_interchange_loops (unsigned

> > > > i_idx,

> > > > unsigned o_idx,

> > > >    ratio = innermost_loops_p ? INNER_STRIDE_RATIO :

> > > > OUTER_STRIDE_RATIO;

> > > >    /* Do interchange if it gives better data locality

> > > > behavior.  */

> > > >    if (wi::gtu_p (iloop_strides, wi::mul (oloop_strides,

> > > > ratio)))

> > > > -    return true;

> > > > +    {

> > > > +      if (optinfo_enabled_p ())

> > > > +       OPTINFO_NOTE ((gimple *)NULL) // FIXME

> > > > +         << "interchange gives better data locality behavior:

> > > > "

> > > > +         << "iloop_strides: "

> > > > +         << decu (iloop_strides)

> > > > +         << " > (oloop_strides: "

> > > > +         << decu (oloop_strides)

> > > > +         << " * ratio: "

> > > > +         << decu (ratio)

> > > > +         << ")";

> > > 

> > > Just randomly inside the thread.

> > > 

> > > NOOOOOOOOOO!

> > > 

> > > :/

> > > Please do _not_ add more stream-like APIs.  How do you expect

> > > translators to deal with those?

> > > 

> > > Yes, I'm aware of the graphite-* ones and I dislike those very

> > > much.

> > > 

> > > What's wrong with the existing dump API?

> > 

> > The existing API suffers from a "wall of text" problem:

> > 

> > * although it's possible to filter based on various criteria (the

> > optgroup tags, specific passes, success vs failure), it's not

> > possible

> > to filter base on code hotness: the -fopt-info API works purely in

> > terms of location_t.  So all of the messages about the hottest

> > functions in the workload are intermingled with all of the other

> > messages about all of the other functions.

> 

> True

> 

> > * some of the text notes refer to function entry, but all of these

> > are

> > emitted "at the same level": there's no way to see the nesting of

> > these

> > function-entry logs, and where other notes are in relation to

> > them. 

> > For example, in:

> > 

> >  test.c:8:3: note: === analyzing loop ===

> >  test.c:8:3: note: === analyze_loop_nest ===

> >  test.c:8:3: note: === vect_analyze_loop_form ===

> >  test.c:8:3: note: === get_loop_niters ===

> > test.c:8:3: note: symbolic number of iterations is (unsigned int)

> > n_9(D)

> > test.c:8:3: note: not vectorized: loop contains function calls or

> > data

> > references that cannot be analyzed

> >  test.c:8:3: note: vectorized 0 loops in function

> > 

> > there's no way to tell that the "vect_analyze_loop_form" is in fact

> > inside the call to "analyze_loop_nest", and where the "symbolic

> > number

> > of iterations" messages is coming from in relation to them.  This

> > may

> > not seem significant here, but I'm quoting a small example;

> > vectorization typically leads to dozens of messages, with a deep

> > nesting structure (where that structure isn't visible in the -fopt-

> > info

> > 

> > output).

> 

> True. The same issue exists for diagnostics BTW. Indeed, being able

> to collapse 'sections' in dump files, opt-info or diagnostics sounds

> useful. 

> 

> Note that for dump files and opt-info the level argument was sort of

> designed to do that. 


Are you referring to the indentation argument here?

> > 

> > The existing API is throwing data away:

> > 

> > * as noted above, by working purely with a location_t, the

> > execution

> > count isn't associated with the messages.  The output format purely

> > gives file/line/column information, but doesn't cover the inlining

> > chain.   For C++ templates it doesn't specify which instance of a

> > template is being optimized.

> 

> It might be useful to enhance the interface by allowing different

> kind of 'locations'. 


In patch 3 of the kit there's a class optinfo_location, which can be
constructed from:
  * a gimple *, or
  * a basic_block, or
  * a loop *
Hence you can pass in any of the above when an optinfo_location is
required.  Potentially there could also be a constructor taking an
rtx_insn * (though I'm primarily interested in gimple passes here,
especially inlining and vectorization).

Internally it's currently stored as a gimple *, but I guess it could be
, say, a basic_block and a location_t.

> > * there's no metadata about where the messages are coming

> > from.  It's

> > easy to get at the current pass internally, but the messages don't

> > capture that.  Figuring out where a message came from requires

> > grepping

> > the GCC source code.  The prototype I posted captures the __FILE__

> > and

> > __LINE__ within the gcc source for every message emitted, and which

> > pass instance emitted it.

> 

> The opt info group was supposed to captures this to the level

> interesting for a user. 


A question here is who the "user" is.  I'm aiming this both at GCC
developers, and technically-sophisticated end-users.  As a member of
the former category, I'd love to have an easy way to go from a message
to the line of code that emitted it.

The optinfo group seems to be *very* high level: "is this a "loop"
message, or a "vec" message?" etc.  That is useful too (one of the good
things about the existing system).

> > * The current output format is of the form:

> >     "FILE:LINE:COLUMN: free-form text\n"

> > This is only machine-readable up to a point: if a program is

> > parsing

> > it, all it has is the free-form text.  The prototype I posted

> > captures

> > what kinds of things are in the text (statements, trees,

> > symtab_nodes),

> > and captures location information for them, so that visualizations

> > of

> > the dumps can provide useful links.

> > 

> > There's no API-level grouping of messages, beyond looking for

> > newline

> > characters.

> > 

> > I'm probably re-hashing a lot of the material in the cover letter

> > here:

> > "[PATCH 00/10] RFC: Prototype of compiler-assisted performance

> > analysis"

> >  https://gcc.gnu.org/ml/gcc-patches/2018-05/msg01675.html

> > 

> > 

> > I'd like to provide a machine-readable output format that covers

> > the

> > above - in particular the profile data (whilst retaining -fopt-info 

> > for

> > compatibility).  Separation of the data from its presentation.

> > 

> > Clearly you don't like the stream-like API from the prototype :)

> 

> Yes :) I wasn't so much complaining about the content but the

> presentation /API. 


(nods)

> > So I'm wondering what the API ought to look like, one that would

> > allow

> > for the kind of "richer" machine-readable output.

> > 

> > Consider this "-fopt-info" code (from "vect_create_data_ref_ptr";

> > this

> > is example 2 from the cover-letter):

> > 

> >  if (dump_enabled_p ())

> >     {

> >       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));

> >       dump_printf_loc (MSG_NOTE, vect_location,

> >                        "create %s-pointer variable to type: ",

> >                        get_tree_code_name (TREE_CODE (aggr_type)));

> >       dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);

> >       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)

> >         dump_printf (MSG_NOTE, "  vectorizing an array ref: ");

> >       else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)

> >         dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");

> >       else if (TREE_CODE (dr_base_type) == RECORD_TYPE)

> >    dump_printf (MSG_NOTE, "  vectorizing a record based array ref:

> > ");

> >       else

> >         dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");

> >       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));

> >       dump_printf (MSG_NOTE, "\n");

> >     }

> > 

> > where the information is built up piecewise, with conditional

> > logic.

> > (This existing code isn't particularly amenable to translation

> > either).

> > 

> > One option would be for "vect_location" to become something other

> > than

> > a location_t, from which a execution count can be gained (e.g. a

> > gimple

> > *, from which the location_t and the BB can be accessed).

> 

> Yes. Like a container that can be initiated from other kind of

> contexts. 


(like the optinfo_location class described above).

So this might be something like:

extern void dump_printf_loc (optinfo_groups_t,
                             optinfo_location,
                             const char *fmt,
                             ...);

or somesuch.

>   Then

> > "dump_printf_loc" would signify starting an optimization record,

> > and

> > the final "\n" would signify the end of an optimization record; the

> > various dump_generic_expr and dump_printf would add structured

> > information and text entries to theoptimization record.

> 

> A push/pop style API would maybe work as well. (pushing a level with

> some meta data) 


Patch 3 in the kit has an OPTINFO_SCOPE macro which uses a RAII class
to automatically do push/pops.

> > This has the advantage of retaining the look of the existing API

> > (so

> > the existing callsites don't need changing), but it changes their

> > meaning, so that as well as writing to -fopt-info's destinations,

> > it's

> > also in a sense parsing the dump_* calls and building optimization

> > records from them.

> > 

> > AIUI, dump_printf_loc can't be a macro, as it's variadic, so this

> > approach doesn't allow for capturing the location within GCC for

> > where

> > the message is emitted.

> 

> True, though we have __builtin_FILE and friends that can be used as

> default args. 


If I'm understanding the idea, this means relying on implicit use of
default arguments.   I'm not sure how compatible that is with variadic
functions: the ellipsis has to come last.

I thought it was impossible to have default args with a variadic
function, but it seems that this is syntactically valid:

extern void dump_printf_loc (optinfo_groups_t,
                             optinfo_location,
                             const char *fmt,
	                     const char *impl_file = __builtin_FILE (),
	                     int impl_line = __builtin_LINE (),
                             ...);

That said, the above decl seems like a bad idea: a recipe for nasty
surprises (what happens if the format arguments expect a const char *
and an int?).

Idea: maybe the optinfo_location constructor could take the default
args for a __builtin_FILE and __builtin_LINE?  Or the pending_optinfo
class from patch 3 of the kit.  I'll experiment with this.

> Note a push/pop level API can also buffer intermediate printf style

> output and annotate/indent it (supporting a dump file emacs/vim mode

> that can do collapse and expand) 


Interesting idea.

I had a go at emitting text in Emacs outline-mode format.  An example,
showing the source location and pass/hotness metadata only for top-
level messages:
 https://dmalcolm.fedorapeople.org/gcc/2018-06-01/outline-elided.txt
and another example, showing them for all messages:
 https://dmalcolm.fedorapeople.org/gcc/2018-06-01/outline-unelided.txt

and it works as-is with Emacs outline-mode (though I don't know how
useful it is; would want a jump-to-source option etc etc).

In both cases, this is prioritizing the messages, sorting from hottest
code down to coldest code (or those messages emitted before the profile
data was loaded), similar to the HTML report here:
https://dmalcolm.fedorapeople.org/gcc/2018-05-18/pgo-demo-test/pgo-demo-test/

(These are all being generated by reading the saved optimization
records, rather than being emitted by the compiler itself)

> > Another approach: there could be something like:

> > 

> >  if (optinfo_enabled_p ())

> >    {

> >       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));

> >       OPTINFO_VECT_NOTE note;

> >       note.printf ("create %s-pointer variable to type: ",

> >                    get_tree_code_name (TREE_CODE (aggr_type)));

> >       note.add_slim (aggr_type);

> >       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)

> >         note.printf ("  vectorizing an array ref: ");

> >       else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)

> >         note.printf (MSG_NOTE, "  vectorizing a vector ref: ");

> >       else if (TREE_CODE (dr_base_type) == RECORD_TYPE)

> >    note.printf (MSG_NOTE, "  vectorizing a record based array ref:

> > ");

> >       else

> >         note.printf (MSG_NOTE, "  vectorizing a pointer ref: ");

> >       note.add_slim (DR_BASE_OBJECT (dr));

> >    }

> > 

> > which uses a macro to get the gcc source location, and avoids the

> > special meaning for "\n".  This avoids the "<<" - but is kind of

> > just

> > different syntax for the same - it's hard with this example to

> > avoid

> > it.

> 

> Maybe the following raii style that encapsulates the enabling

> /disabling checks? 

> 

>  If (optinfo o = push (msg_optimization,...))

>   {

>      O.print (...) ;

>      Destructor of o 'pops' 

>   } 


I'm assuming that:
* very few users turn on the dump feature, and 
* a goal is that we shouldn't slow down that common "no dumping" case
when implementing dumping.

If so, then presumably we need a really cheap test that can easily be
marked as cold, or optimized away.

How much work would be done by the call to:
   push (msg_optimization,...),
in particular evaluating the arguments?

I was thinking:
   if (optinfo_enabled_p ())
and have it be a very cheap boolean lookup (though it isn't in the
current patch kit), with everything else guarded by it.

Would a GCC_UNLIKELY(expr) macro be appropriate here?  (so that non-PGO 
builds of the compiler can have all this dump stuff moved into cold
sections)

> > Yet another approach: reworking things to support i18n via a pure

> > printf-style interface, using, say "%S" to mean "TDF_SLIM", it

> > could be

> > like:

> > 

> >  if (optinfo_enabled_p ())

> >    {

> >       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));

> >       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)

> >         OPTINFO_VECT_NOTE ("create %s-pointer variable to type: %S"

> >                            "  vectorizing an array ref: %S",

> >                            get_tree_code_name (TREE_CODE

> > (aggr_type))

> > 	                    aggr_type,

> >                            DR_BASE_OBJECT (dr));

> >      else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)

> >         OPTINFO_VECT_NOTE ("create %s-pointer variable to type: %S"

> >                            "  vectorizing a vector ref: %S",

> >                            get_tree_code_name (TREE_CODE

> > (aggr_type))

> > 	                    aggr_type,

> >                            DR_BASE_OBJECT (dr));

> >      else if (TREE_CODE (dr_base_type) == RECORD_TYPE)

> >         OPTINFO_VECT_NOTE ("create %s-pointer variable to type: %S"

> >                          "  vectorizing a record based array ref:

> > %S",

> >                            get_tree_code_name (TREE_CODE

> > (aggr_type))

> > 	                    aggr_type,

> >                            DR_BASE_OBJECT (dr));

> >      else

> >         OPTINFO_VECT_NOTE ("create %s-pointer variable to type: %S"

> >                            "  vectorizing a pointer ref: %S",

> >                            get_tree_code_name (TREE_CODE

> > (aggr_type))

> > 	                    aggr_type,

> >                            DR_BASE_OBJECT (dr));

> >    }

> > 

> > or somesuch.  The %S would allow for the data to be captured when

> > emitting optimization records for the messages (not just plain

> > text,

> > e.g. locations of the expressions).

> 

> Certainly when exposing things in opt-info we have to be more

> disciplined. The vectorizer is a bad example here. 

> The original goal of having one set of outputs for both dump files

> and opt-info is good but I guess the result is less than optimal.

> Maybe additional detail levels would help here (MSG_Dumpfile_only?) 

> 

> > So those are some ideas.  I'm sure there are other ways to fix the

> > issues with -fopt-info; I'm brainstorming here.

> 

> Likewise. As said I applaud the attempt improve the situation but I

> detest a stream API ;) 


Thanks for the feedback.  I'll continue to try prototyping ideas.

Dave

> Richard. 

> 

> > [...snip...]

> > 

> > Thoughts?

> > 

> > Thanks

> > Dave

> 

>
Richard Biener June 4, 2018, 1:20 p.m. | #5
On Sat, Jun 2, 2018 at 12:22 AM David Malcolm <dmalcolm@redhat.com> wrote:
>

> On Fri, 2018-06-01 at 17:31 +0200, Richard Biener wrote:

> > On June 1, 2018 3:40:15 PM GMT+02:00, David Malcolm <dmalcolm@redhat.

> > com> wrote:

> > > On Fri, 2018-06-01 at 11:50 +0200, Richard Biener wrote:

> > > > On Tue, May 29, 2018 at 10:33 PM David Malcolm <dmalcolm@redhat.c

> > > > om>

> > > > wrote:

> > > > >

> > > > > This was an experiment to try to capture information on a

> > > > > loop optimization.

> > > > >

> > > > > gcc/ChangeLog:

> > > > >         * gimple-loop-interchange.cc

> > > > > (should_interchange_loops):

> > > > > Add

> > > > >         optinfo note when interchange gives better data

> > > > > locality

> > > > > behavior.

> > > > >         (tree_loop_interchange::interchange): Add

> > > > > OPTINFO_SCOPE.

> > > > >         Add optinfo for successful and unsuccessful

> > > > > interchanges.

> > > > >         (prepare_perfect_loop_nest): Add OPTINFO_SCOPE.  Add

> > > > >         optinfo note.

> > > > >         (pass_linterchange::execute): Add OPTINFO_SCOPE.

> > > > > ---

> > > > >  gcc/gimple-loop-interchange.cc | 36

> > > > > +++++++++++++++++++++++++++++++++++-

> > > > >  1 file changed, 35 insertions(+), 1 deletion(-)

> > > > >

> > > > > diff --git a/gcc/gimple-loop-interchange.cc b/gcc/gimple-loop-

> > > > > interchange.cc

> > > > > index eb35263..cd32288 100644

> > > > > --- a/gcc/gimple-loop-interchange.cc

> > > > > +++ b/gcc/gimple-loop-interchange.cc

> > > > > @@ -1556,7 +1556,19 @@ should_interchange_loops (unsigned

> > > > > i_idx,

> > > > > unsigned o_idx,

> > > > >    ratio = innermost_loops_p ? INNER_STRIDE_RATIO :

> > > > > OUTER_STRIDE_RATIO;

> > > > >    /* Do interchange if it gives better data locality

> > > > > behavior.  */

> > > > >    if (wi::gtu_p (iloop_strides, wi::mul (oloop_strides,

> > > > > ratio)))

> > > > > -    return true;

> > > > > +    {

> > > > > +      if (optinfo_enabled_p ())

> > > > > +       OPTINFO_NOTE ((gimple *)NULL) // FIXME

> > > > > +         << "interchange gives better data locality behavior:

> > > > > "

> > > > > +         << "iloop_strides: "

> > > > > +         << decu (iloop_strides)

> > > > > +         << " > (oloop_strides: "

> > > > > +         << decu (oloop_strides)

> > > > > +         << " * ratio: "

> > > > > +         << decu (ratio)

> > > > > +         << ")";

> > > >

> > > > Just randomly inside the thread.

> > > >

> > > > NOOOOOOOOOO!

> > > >

> > > > :/

> > > > Please do _not_ add more stream-like APIs.  How do you expect

> > > > translators to deal with those?

> > > >

> > > > Yes, I'm aware of the graphite-* ones and I dislike those very

> > > > much.

> > > >

> > > > What's wrong with the existing dump API?

> > >

> > > The existing API suffers from a "wall of text" problem:

> > >

> > > * although it's possible to filter based on various criteria (the

> > > optgroup tags, specific passes, success vs failure), it's not

> > > possible

> > > to filter base on code hotness: the -fopt-info API works purely in

> > > terms of location_t.  So all of the messages about the hottest

> > > functions in the workload are intermingled with all of the other

> > > messages about all of the other functions.

> >

> > True

> >

> > > * some of the text notes refer to function entry, but all of these

> > > are

> > > emitted "at the same level": there's no way to see the nesting of

> > > these

> > > function-entry logs, and where other notes are in relation to

> > > them.

> > > For example, in:

> > >

> > >  test.c:8:3: note: === analyzing loop ===

> > >  test.c:8:3: note: === analyze_loop_nest ===

> > >  test.c:8:3: note: === vect_analyze_loop_form ===

> > >  test.c:8:3: note: === get_loop_niters ===

> > > test.c:8:3: note: symbolic number of iterations is (unsigned int)

> > > n_9(D)

> > > test.c:8:3: note: not vectorized: loop contains function calls or

> > > data

> > > references that cannot be analyzed

> > >  test.c:8:3: note: vectorized 0 loops in function

> > >

> > > there's no way to tell that the "vect_analyze_loop_form" is in fact

> > > inside the call to "analyze_loop_nest", and where the "symbolic

> > > number

> > > of iterations" messages is coming from in relation to them.  This

> > > may

> > > not seem significant here, but I'm quoting a small example;

> > > vectorization typically leads to dozens of messages, with a deep

> > > nesting structure (where that structure isn't visible in the -fopt-

> > > info

> > >

> > > output).

> >

> > True. The same issue exists for diagnostics BTW. Indeed, being able

> > to collapse 'sections' in dump files, opt-info or diagnostics sounds

> > useful.

> >

> > Note that for dump files and opt-info the level argument was sort of

> > designed to do that.

>

> Are you referring to the indentation argument here?


No, to MSG_NOTE vs. MSG_MISSED_OPTIMIZATION , etc.

> > >

> > > The existing API is throwing data away:

> > >

> > > * as noted above, by working purely with a location_t, the

> > > execution

> > > count isn't associated with the messages.  The output format purely

> > > gives file/line/column information, but doesn't cover the inlining

> > > chain.   For C++ templates it doesn't specify which instance of a

> > > template is being optimized.

> >

> > It might be useful to enhance the interface by allowing different

> > kind of 'locations'.

>

> In patch 3 of the kit there's a class optinfo_location, which can be

> constructed from:

>   * a gimple *, or

>   * a basic_block, or

>   * a loop *

> Hence you can pass in any of the above when an optinfo_location is

> required.  Potentially there could also be a constructor taking an

> rtx_insn * (though I'm primarily interested in gimple passes here,

> especially inlining and vectorization).

>

> Internally it's currently stored as a gimple *, but I guess it could be

> , say, a basic_block and a location_t.


Ok, so that indeed sounds what I had in mind - change the dump_*_loc
interface to take this kind of class rather than (only) a location_t.

> > > * there's no metadata about where the messages are coming

> > > from.  It's

> > > easy to get at the current pass internally, but the messages don't

> > > capture that.  Figuring out where a message came from requires

> > > grepping

> > > the GCC source code.  The prototype I posted captures the __FILE__

> > > and

> > > __LINE__ within the gcc source for every message emitted, and which

> > > pass instance emitted it.

> >

> > The opt info group was supposed to captures this to the level

> > interesting for a user.

>

> A question here is who the "user" is.  I'm aiming this both at GCC

> developers, and technically-sophisticated end-users.  As a member of

> the former category, I'd love to have an easy way to go from a message

> to the line of code that emitted it.

>

> The optinfo group seems to be *very* high level: "is this a "loop"

> message, or a "vec" message?" etc.  That is useful too (one of the good

> things about the existing system).


It somewhat mimics what other compilers offer.  Then there's the level
from above but it is very coarse-grained and mixing things like
detail (MSG_NOTE) with kind (OPTIMIZED_LOCATIONS vs. UNOPTIMIZED one).

So what would be useful I guess is to somehow split this so we can have
a level MSG_ANALYSIS (and drop MSG_UNOPTIMIZED_LOCATIONS?) and
a separate detail level.  So we could start information about an analysis
phase (thus grouping related info), dump what failed, dump details.

The hard part here is of course taking existing verbose dumps and try to
turn them into sth usable for users.  The vectorizer dumpfile is not
really informative unless you look at the source and mix&match the
dumping code with the surroundings...

> > > * The current output format is of the form:

> > >     "FILE:LINE:COLUMN: free-form text\n"

> > > This is only machine-readable up to a point: if a program is

> > > parsing

> > > it, all it has is the free-form text.  The prototype I posted

> > > captures

> > > what kinds of things are in the text (statements, trees,

> > > symtab_nodes),

> > > and captures location information for them, so that visualizations

> > > of

> > > the dumps can provide useful links.

> > >

> > > There's no API-level grouping of messages, beyond looking for

> > > newline

> > > characters.

> > >

> > > I'm probably re-hashing a lot of the material in the cover letter

> > > here:

> > > "[PATCH 00/10] RFC: Prototype of compiler-assisted performance

> > > analysis"

> > >  https://gcc.gnu.org/ml/gcc-patches/2018-05/msg01675.html

> > >

> > >

> > > I'd like to provide a machine-readable output format that covers

> > > the

> > > above - in particular the profile data (whilst retaining -fopt-info

> > > for

> > > compatibility).  Separation of the data from its presentation.

> > >

> > > Clearly you don't like the stream-like API from the prototype :)

> >

> > Yes :) I wasn't so much complaining about the content but the

> > presentation /API.

>

> (nods)

>

> > > So I'm wondering what the API ought to look like, one that would

> > > allow

> > > for the kind of "richer" machine-readable output.

> > >

> > > Consider this "-fopt-info" code (from "vect_create_data_ref_ptr";

> > > this

> > > is example 2 from the cover-letter):

> > >

> > >  if (dump_enabled_p ())

> > >     {

> > >       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));

> > >       dump_printf_loc (MSG_NOTE, vect_location,

> > >                        "create %s-pointer variable to type: ",

> > >                        get_tree_code_name (TREE_CODE (aggr_type)));

> > >       dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);

> > >       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)

> > >         dump_printf (MSG_NOTE, "  vectorizing an array ref: ");

> > >       else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)

> > >         dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");

> > >       else if (TREE_CODE (dr_base_type) == RECORD_TYPE)

> > >    dump_printf (MSG_NOTE, "  vectorizing a record based array ref:

> > > ");

> > >       else

> > >         dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");

> > >       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));

> > >       dump_printf (MSG_NOTE, "\n");

> > >     }

> > >

> > > where the information is built up piecewise, with conditional

> > > logic.

> > > (This existing code isn't particularly amenable to translation

> > > either).

> > >

> > > One option would be for "vect_location" to become something other

> > > than

> > > a location_t, from which a execution count can be gained (e.g. a

> > > gimple

> > > *, from which the location_t and the BB can be accessed).

> >

> > Yes. Like a container that can be initiated from other kind of

> > contexts.

>

> (like the optinfo_location class described above).


Yes.

> So this might be something like:

>

> extern void dump_printf_loc (optinfo_groups_t,

>                              optinfo_location,

>                              const char *fmt,

>                              ...);

>

> or somesuch.


Yeah.

> >   Then

> > > "dump_printf_loc" would signify starting an optimization record,

> > > and

> > > the final "\n" would signify the end of an optimization record; the

> > > various dump_generic_expr and dump_printf would add structured

> > > information and text entries to theoptimization record.

> >

> > A push/pop style API would maybe work as well. (pushing a level with

> > some meta data)

>

> Patch 3 in the kit has an OPTINFO_SCOPE macro which uses a RAII class

> to automatically do push/pops.

>

> > > This has the advantage of retaining the look of the existing API

> > > (so

> > > the existing callsites don't need changing), but it changes their

> > > meaning, so that as well as writing to -fopt-info's destinations,

> > > it's

> > > also in a sense parsing the dump_* calls and building optimization

> > > records from them.

> > >

> > > AIUI, dump_printf_loc can't be a macro, as it's variadic, so this

> > > approach doesn't allow for capturing the location within GCC for

> > > where

> > > the message is emitted.

> >

> > True, though we have __builtin_FILE and friends that can be used as

> > default args.

>

> If I'm understanding the idea, this means relying on implicit use of

> default arguments.   I'm not sure how compatible that is with variadic

> functions: the ellipsis has to come last.


True.

> I thought it was impossible to have default args with a variadic

> function, but it seems that this is syntactically valid:

>

> extern void dump_printf_loc (optinfo_groups_t,

>                              optinfo_location,

>                              const char *fmt,

>                              const char *impl_file = __builtin_FILE (),

>                              int impl_line = __builtin_LINE (),

>                              ...);

>

> That said, the above decl seems like a bad idea: a recipe for nasty

> surprises (what happens if the format arguments expect a const char *

> and an int?).


I'm surprised the above works ;)  Does it do what we expect?

> Idea: maybe the optinfo_location constructor could take the default

> args for a __builtin_FILE and __builtin_LINE?  Or the pending_optinfo

> class from patch 3 of the kit.  I'll experiment with this.


Ah, nice idea, yes.

> > Note a push/pop level API can also buffer intermediate printf style

> > output and annotate/indent it (supporting a dump file emacs/vim mode

> > that can do collapse and expand)

>

> Interesting idea.

>

> I had a go at emitting text in Emacs outline-mode format.  An example,

> showing the source location and pass/hotness metadata only for top-

> level messages:

>  https://dmalcolm.fedorapeople.org/gcc/2018-06-01/outline-elided.txt

> and another example, showing them for all messages:

>  https://dmalcolm.fedorapeople.org/gcc/2018-06-01/outline-unelided.txt

>

> and it works as-is with Emacs outline-mode (though I don't know how

> useful it is; would want a jump-to-source option etc etc).

>

> In both cases, this is prioritizing the messages, sorting from hottest

> code down to coldest code (or those messages emitted before the profile

> data was loaded), similar to the HTML report here:

> https://dmalcolm.fedorapeople.org/gcc/2018-05-18/pgo-demo-test/pgo-demo-test/

>

> (These are all being generated by reading the saved optimization

> records, rather than being emitted by the compiler itself)

>

> > > Another approach: there could be something like:

> > >

> > >  if (optinfo_enabled_p ())

> > >    {

> > >       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));

> > >       OPTINFO_VECT_NOTE note;

> > >       note.printf ("create %s-pointer variable to type: ",

> > >                    get_tree_code_name (TREE_CODE (aggr_type)));

> > >       note.add_slim (aggr_type);

> > >       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)

> > >         note.printf ("  vectorizing an array ref: ");

> > >       else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)

> > >         note.printf (MSG_NOTE, "  vectorizing a vector ref: ");

> > >       else if (TREE_CODE (dr_base_type) == RECORD_TYPE)

> > >    note.printf (MSG_NOTE, "  vectorizing a record based array ref:

> > > ");

> > >       else

> > >         note.printf (MSG_NOTE, "  vectorizing a pointer ref: ");

> > >       note.add_slim (DR_BASE_OBJECT (dr));

> > >    }

> > >

> > > which uses a macro to get the gcc source location, and avoids the

> > > special meaning for "\n".  This avoids the "<<" - but is kind of

> > > just

> > > different syntax for the same - it's hard with this example to

> > > avoid

> > > it.

> >

> > Maybe the following raii style that encapsulates the enabling

> > /disabling checks?

> >

> >  If (optinfo o = push (msg_optimization,...))

> >   {

> >      O.print (...) ;

> >      Destructor of o 'pops'

> >   }

>

> I'm assuming that:

> * very few users turn on the dump feature, and

> * a goal is that we shouldn't slow down that common "no dumping" case

> when implementing dumping.


Yes.

> If so, then presumably we need a really cheap test that can easily be

> marked as cold, or optimized away.

>

> How much work would be done by the call to:

>    push (msg_optimization,...),

> in particular evaluating the arguments?


I'd expect it to bail out quickly when !enabled()

> I was thinking:

>    if (optinfo_enabled_p ())

> and have it be a very cheap boolean lookup (though it isn't in the

> current patch kit), with everything else guarded by it.


Yeah.

> Would a GCC_UNLIKELY(expr) macro be appropriate here?  (so that non-PGO

> builds of the compiler can have all this dump stuff moved into cold

> sections)


Not sure, we could certainly experiment with that.

> > > Yet another approach: reworking things to support i18n via a pure

> > > printf-style interface, using, say "%S" to mean "TDF_SLIM", it

> > > could be

> > > like:

> > >

> > >  if (optinfo_enabled_p ())

> > >    {

> > >       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));

> > >       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)

> > >         OPTINFO_VECT_NOTE ("create %s-pointer variable to type: %S"

> > >                            "  vectorizing an array ref: %S",

> > >                            get_tree_code_name (TREE_CODE

> > > (aggr_type))

> > >                         aggr_type,

> > >                            DR_BASE_OBJECT (dr));

> > >      else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)

> > >         OPTINFO_VECT_NOTE ("create %s-pointer variable to type: %S"

> > >                            "  vectorizing a vector ref: %S",

> > >                            get_tree_code_name (TREE_CODE

> > > (aggr_type))

> > >                         aggr_type,

> > >                            DR_BASE_OBJECT (dr));

> > >      else if (TREE_CODE (dr_base_type) == RECORD_TYPE)

> > >         OPTINFO_VECT_NOTE ("create %s-pointer variable to type: %S"

> > >                          "  vectorizing a record based array ref:

> > > %S",

> > >                            get_tree_code_name (TREE_CODE

> > > (aggr_type))

> > >                         aggr_type,

> > >                            DR_BASE_OBJECT (dr));

> > >      else

> > >         OPTINFO_VECT_NOTE ("create %s-pointer variable to type: %S"

> > >                            "  vectorizing a pointer ref: %S",

> > >                            get_tree_code_name (TREE_CODE

> > > (aggr_type))

> > >                         aggr_type,

> > >                            DR_BASE_OBJECT (dr));

> > >    }

> > >

> > > or somesuch.  The %S would allow for the data to be captured when

> > > emitting optimization records for the messages (not just plain

> > > text,

> > > e.g. locations of the expressions).

> >

> > Certainly when exposing things in opt-info we have to be more

> > disciplined. The vectorizer is a bad example here.

> > The original goal of having one set of outputs for both dump files

> > and opt-info is good but I guess the result is less than optimal.

> > Maybe additional detail levels would help here (MSG_Dumpfile_only?)

> >

> > > So those are some ideas.  I'm sure there are other ways to fix the

> > > issues with -fopt-info; I'm brainstorming here.

> >

> > Likewise. As said I applaud the attempt improve the situation but I

> > detest a stream API ;)

>

> Thanks for the feedback.  I'll continue to try prototyping ideas.


Thanks a lot.

Note I think we can get incremental improvements, like committing
the enum change and a wrapper class for the location while keeping
the rest of the opt-info API.

Richard.

> Dave

>

> > Richard.

> >

> > > [...snip...]

> > >

> > > Thoughts?

> > >

> > > Thanks

> > > Dave

> >

> >

Patch

diff --git a/gcc/gimple-loop-interchange.cc b/gcc/gimple-loop-interchange.cc
index eb35263..cd32288 100644
--- a/gcc/gimple-loop-interchange.cc
+++ b/gcc/gimple-loop-interchange.cc
@@ -1556,7 +1556,19 @@  should_interchange_loops (unsigned i_idx, unsigned o_idx,
   ratio = innermost_loops_p ? INNER_STRIDE_RATIO : OUTER_STRIDE_RATIO;
   /* Do interchange if it gives better data locality behavior.  */
   if (wi::gtu_p (iloop_strides, wi::mul (oloop_strides, ratio)))
-    return true;
+    {
+      if (optinfo_enabled_p ())
+	OPTINFO_NOTE ((gimple *)NULL) // FIXME
+	  << "interchange gives better data locality behavior: "
+	  << "iloop_strides: "
+	  << decu (iloop_strides)
+	  << " > (oloop_strides: "
+	  << decu (oloop_strides)
+	  << " * ratio: "
+	  << decu (ratio)
+	  << ")";
+      return true;
+    }
   if (wi::gtu_p (iloop_strides, oloop_strides))
     {
       /* Or it creates more invariant memory references.  */
@@ -1578,6 +1590,8 @@  bool
 tree_loop_interchange::interchange (vec<data_reference_p> datarefs,
 				    vec<ddr_p> ddrs)
 {
+  OPTINFO_SCOPE ("tree_loop_interchange::interchange", m_loop_nest[0]);
+
   location_t loc = find_loop_location (m_loop_nest[0]);
   bool changed_p = false;
   /* In each iteration we try to interchange I-th loop with (I+1)-th loop.
@@ -1628,6 +1642,10 @@  tree_loop_interchange::interchange (vec<data_reference_p> datarefs,
 	    fprintf (dump_file,
 		     "Loop_pair<outer:%d, inner:%d> is interchanged\n\n",
 		     oloop.m_loop->num, iloop.m_loop->num);
+	  if (optinfo_enabled_p ())
+	    OPTINFO_SUCCESS (oloop.m_loop)
+	      << optinfo_printf ("Loop_pair<outer:%d, inner:%d> is interchanged",
+				 oloop.m_loop->num, iloop.m_loop->num);
 
 	  changed_p = true;
 	  interchange_loops (iloop, oloop);
@@ -1641,6 +1659,10 @@  tree_loop_interchange::interchange (vec<data_reference_p> datarefs,
 	    fprintf (dump_file,
 		     "Loop_pair<outer:%d, inner:%d> is not interchanged\n\n",
 		     oloop.m_loop->num, iloop.m_loop->num);
+	  if (optinfo_enabled_p ())
+	    OPTINFO_FAILURE (oloop.m_loop)
+	      << optinfo_printf ("Loop_pair<outer:%d, inner:%d> is not interchanged",
+				 oloop.m_loop->num, iloop.m_loop->num);
 	}
     }
   simple_dce_from_worklist (m_dce_seeds);
@@ -1648,6 +1670,9 @@  tree_loop_interchange::interchange (vec<data_reference_p> datarefs,
   if (changed_p)
     dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc,
 		     "loops interchanged in loop nest\n");
+  if (optinfo_enabled_p ())
+    OPTINFO_SUCCESS (m_loop_nest[0])
+      << "loops interchanged in loop nest";
 
   return changed_p;
 }
@@ -1971,6 +1996,8 @@  static bool
 prepare_perfect_loop_nest (struct loop *loop, vec<loop_p> *loop_nest,
 			   vec<data_reference_p> *datarefs, vec<ddr_p> *ddrs)
 {
+  OPTINFO_SCOPE ("prepare_perfect_loop_nest", loop);
+
   struct loop *start_loop = NULL, *innermost = loop;
   struct loop *outermost = loops_for_fn (cfun)->tree_root;
 
@@ -2029,6 +2056,12 @@  prepare_perfect_loop_nest (struct loop *loop, vec<loop_p> *loop_nest,
 	  fprintf (dump_file,
 		   "\nConsider loop interchange for loop_nest<%d - %d>\n",
 		   start_loop->num, innermost->num);
+	if (optinfo_enabled_p ())
+	  {
+	    OPTINFO_NOTE (start_loop)
+	      << optinfo_printf ("consider loop interchange for loop_nest<%d - %d>",
+				 start_loop->num, innermost->num);
+	  }
 
 	if (loop != start_loop)
 	  prune_access_strides_not_in_loop (start_loop, innermost, *datarefs);
@@ -2061,6 +2094,7 @@  pass_linterchange::execute (function *fun)
   struct loop *loop;
   FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST)
     {
+      OPTINFO_SCOPE ("considering loop for interchange", loop);
       vec<loop_p> loop_nest = vNULL;
       vec<data_reference_p> datarefs = vNULL;
       vec<ddr_p> ddrs = vNULL;