perl/regen/HeaderParser.pm
Karl Williamson c0955b6422 regen/HeaderParser: _reduce_conds: Return more than a bool
This changes this function to stringify the result into a preprocessor
conditional expression, instead of just a bool 0 or 1.  This gives the
caller more information.

This doesn't change the outcome of callers who are expecting a boolean,
as any string now returned evaluates to true.
2026-01-22 09:55:58 -07:00


package HeaderParser;
use strict;
use warnings;
BEGIN {
if ($] < 5.010) {
die <<EOFMSG;
The regen code in the perl repo needs at least version 5.10 to run.
You are currently using $], we recommend you install a more modern
version of perl before you try to build this version of perl.
EOFMSG
}
}
# these are required below in BEGIN statements, we can't have a
# hard dependency on them as they might not be available when
# we run as part of autodoc.pl
#
# use Data::Dumper;
# use Storable qw(dclone);
#
use Carp qw(confess);
use Text::Tabs qw(expand unexpand);
use Text::Wrap qw(wrap);
# The style of this file is determined by:
#
# perltidy -w -ple -bbb -bbc -bbs -nolq -l=80 -noll -nola -nwls='=' \
# -isbc -nolc -otr -kis -ci=4 -se -sot -sct -nsbl -pt=2 -fs \
# -fsb='#start-no-tidy' -fse='#end-no-tidy' -cpb -bfvt=2
my (
%unop, # unary operators and their precedence
%binop, # binary operators and their precedence
%is_right_assoc, # operators which are right associative
%precedence, # precedence of all operators.
%associative, # associative operators
%commutative, # commutative operators
%cmpop, # comparison operators
$unop_pat, # pattern to match unary operators
$binop_pat, # pattern to match binary operators
%op_names, # map of op to description, used in error messages
$tokenize_pat # a pattern which can tokenize an expression
);
BEGIN {
# this is initialization for the operator precedence expression parser
# we use for handling preprocessor conditions.
%op_names= (
'==' => 'equality',
'!=' => 'inequality',
'<<' => 'bit-shift-left',
'>>' => 'bit-shift-right',
'+' => 'addition',
'-' => 'subtraction',
'*' => 'multiplication',
'/' => 'division',
'%' => 'modulo',
'||' => 'logical-or', # Lowest precedence
'&&' => 'logical-and',
'|' => 'binary-or',
'^' => 'binary-xor',
'&' => 'binary-and',
'<' => 'less-than', # split on spaces, all with equal precedence
'>' => 'greater-than',
'<=' => 'less-than-or-equal',
'>=' => 'greater-than-or-equal',
);
my @cmpop= (
'== !=', # listed in lowest to highest precedence
'< > <= >=', # split on spaces, all with equal precedence
);
my @binop= (
'||', # Lowest precedence
'&&',
'|',
'^',
'&',
@cmpop, # include the numerical comparison operators.
'<< >>',
'+ -',
'* / %', # highest precedence operators.
);
my @unop= qw( ! ~ + - );
%unop= map { $_ => 1 } @unop;
%cmpop= map { $_ => 1 } map { split /\s+/, $_ } @cmpop;
%binop= map { $_ => 1 } map { split /\s+/, $_ } @binop;
my $make_pat= sub {
my $pat= join "|", sort { length($b) <=> length($a) || $a cmp $b }
map quotemeta($_), @_;
return qr/$pat/;
};
$unop_pat= $make_pat->(@unop);
foreach my $ix (0 .. $#binop) {
my $sym= $binop[$ix];
$precedence{$_}= (1 + $ix) * 10 for split /\s+/, $sym;
}
$is_right_assoc{"?"}= 1;
$is_right_assoc{":"}= 1;
$precedence{"?"}= 1;
$precedence{":"}= 0;
$associative{$_}++
for qw( || && + *); # we leave '==' out so we don't reorder terms
$commutative{$_}++ for qw( || && + *);
$binop_pat= $make_pat->(keys %precedence);
# Note below that we don't use the 'multichar' capture currently
# but, in a future patch we will add support for warning about
# non-portable constructs like multichar constants, so I have added
# the tokenizer support for it here so it is ready later.
my $sq_const_pat = qr/[^\\']|\\(?:['"\\?abfnrtv]|[0-7]{1,3}|x[0-9A-Fa-f]+)/;
$tokenize_pat= qr/
^(?:
(?<comment> \/\*.*?\*\/ )
| (?<ws> \s+ )
| (?<term>
(?<literal>
(?<define> defined\(\w+\) )
| (?<func> \w+\s*\(\s*\w+(?:\s*,\s*\w+)*\s*\) )
| (?<const> (?:0x[a-fA-F0-9]+
|-?\d+[LUlu]*
|'$sq_const_pat
(?<multichar>$sq_const_pat+)?'
)
)
| (?<sym> \w+ )
)
| (?<op> $binop_pat | $unop_pat )
| (?<paren> [\(\)] )
)
)
/xs;
}
# dump the arguments with Data::Dumper. wraps loading Dumper
# as we may be executed by miniperl where Dumper isn't available
sub dd {
my $self= shift;
local $self->{orig_content};
my $ret= "(dump not available)";
eval {
require Data::Dumper;
$ret= Data::Dumper->new(\@_)->Indent(1)->Sortkeys(1)->Useqq(1)->Dump();
};
return $ret;
}
my $has_storable;
# same story here, in miniperl we use slow perl code,
# in real perl we can use Storable and speed things up.
BEGIN { eval "use Storable; \$has_storable=1;" }
# recursively copy an AoAoA...
sub copy_aoa {
my ($aoa)= @_;
if ($has_storable) {
return Storable::dclone($aoa);
}
else {
return _copy_aoa($aoa);
}
}
sub _copy_aoa {
my ($thing)= @_;
if (ref $thing) {
return [ map { ref($_) ? _copy_aoa($_) : $_ } @$thing ];
}
else {
return $thing;
}
}
# return the number of characters that should go in between a '#' and
# the name of a C preprocessor directive. Returns 0 spaces for level
# 0, and 2 * ($level - 1) + 1 spaces for the rest (1, 3, 5, etc).
# This might sound weird, but consider these are tab *stops* and the
# '#' is included in the total, which means indents of 2, 4, 6 etc.
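# e.g. (illustrative) indent_chars(0) is "", indent_chars(1) is " " (one
# space), indent_chars(2) is "   " (three spaces), and so on.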
sub indent_chars {
my ($self, $level)= @_;
my $ind= "";
$ind .= " " if $level;
$ind .= "  " x ($level - 1) if $level > 1;
return $ind;
}
# we use OO to store state, etc.
sub new {
my ($class, %args)= @_;
$args{add_commented_expr_after} //= 10;
$args{max_width} //= 78;
$args{min_break_width} //= 70;
$args{indent_define} //= 1;
$args{hug_define} //= 0;
return bless \%args, $class;
}
# this parses the expression into an array of tokens.
# this is somewhat crude, we could do this incrementally
# if we wanted and avoid the overhead, but it makes it
# easier to debug the tokenizer.
sub _tokenize_expr {
my ($self, $expr)= @_;
delete $self->{tokens};
delete $self->{parse_tree};
$self->{original_expr}= $expr;
my @tokens;
while ($expr =~ s/$tokenize_pat//xs) {
push @tokens, {%+} if defined $+{'term'};
}
$self->{tokens}= \@tokens;
warn $self->dd($self) if $self->{debug};
if (length $expr) {
confess "Failed to tokenize_expr: $expr\n";
}
return \@tokens;
}
sub _count_ops {
my ($self, $term)= @_;
my $count= 0;
$count++ while $term =~ m/(?: \|\| | \&\& | \? )/gx;
return $count;
}
# sort terms in an expression in a way that puts things
# in a sensible order. Anything starting with PERL_IN_
# should be on the left in alphabetical order. Digits
# should be on the right (eg 0), and ties are resolved
# by stripping non-alpha-numerics, thus removing underbars,
# parens, spaces, logical operators, etc, and then by
# lc comparison of the result.
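# e.g. (illustrative, untested) the terms ('defined(FOO)', '0', 'PERL_IN_TOKE_C')
# would come back roughly as ('PERL_IN_TOKE_C', 'defined(FOO)', '0').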
sub _sort_terms {
my $self= shift;
my (@terms)= map {
[
$_, # 0: raw
lc($_) =~ s/[^a-zA-Z0-9]//gr, # 1: "_" stripped and caseless
$_ =~ m/PERL_IN_/ ? 1 : 0, # 2: PERL_IN_ labeled define
$_ =~ m/^\d/ ? 1 : 0, # 3: digit
$_ =~ m/DEBUGGING/ ? 1 : 0, # 4: DEBUGGING?
$self->_count_ops($_), # 5: Number of ops (||, &&)
]
} @_;
my %seen;
#start-no-tidy
@terms= map { $seen{ $_->[0] }++ ? () : $_->[0] }
sort {
$a->[5] <=> $b->[5] || # least number of ops
$b->[2] <=> $a->[2] || # PERL_IN before others
$a->[3] <=> $b->[3] || # digits after others
$a->[4] <=> $b->[4] || # DEBUGGING after all else
$a->[1] cmp $b->[1] || # stripped caseless cmp
lc($a->[0]) cmp lc($b->[0]) || # caseless cmp
$a->[0] cmp $b->[0] || # exact cmp
0
} @terms;
#end-no-tidy
return @terms;
}
# normalize a condition expression by parsing it and then stringifying
# the parse tree.
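# e.g. (illustrative, untested) $hp->tidy_cond("defined(FOO)&&(defined(BAR))")
# would return something like "defined(BAR) && defined(FOO)": redundant parens
# are dropped and commutative terms are sorted.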
sub tidy_cond {
my ($self, $expr)= @_;
my $ret= $self->{_tidy_cond_cache}{$expr} //= do {
$self->parse_expr($expr) if defined $expr;
my $text= $self->_pt_as_str();
$text;
};
$self->{last_expr}= $ret;
return $ret;
}
# convert a parse tree structure to a string recursively.
#
# Parse trees are currently made up of arrays, with the count
# of items in the object determining the type of op it represents.
# 1 argument: literal value of some sort.
# 2 arguments: unary operator: 0 slot is the operator, 1 is a parse tree
# : ternary: 0 slot holds '?', 1 is an array holding three
# parse trees: cond, true, false
# 3 arguments or more: binary operator. 0 slot is the op. 1..n are parse trees
# : note, this is n-ary (variadic) for commutative operators like
# : "+", "*", "&&" and "||", so an expr
# : like "A && B && !C" would be represented as:
# : [ "&&", ["A"], ["B"], [ "!",["C"] ] ]
#
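# For example (illustrative) the ternary "A ? B : C" is represented as:
#   [ '?', [ ["A"], ["B"], ["C"] ] ]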
sub _pt_as_str {
my ($self, $node, $parent_op, $depth)= @_;
$node ||= $self->{parse_tree}
or confess "No parse tree?";
$depth ||= 0;
if (@$node == 1) {
# it's a literal
return $node->[0];
}
elsif (@$node == 2) {
# is this a ternary or an unop?
if ($node->[0] eq '?') {
# ternary, the three "parts" are tucked away in
# an array in the payload slot
my $expr=
$self->_pt_as_str($node->[1][0], "?", $depth + 1) . " ? "
. $self->_pt_as_str($node->[1][1], "?", $depth + 1) . " : "
. $self->_pt_as_str($node->[1][2], "?", $depth + 1);
# stick parens on if this is a subexpression
$expr= "( " . $expr . " )" if $depth;
return $expr;
}
else {
if ( $node->[0] eq "!"
and @{ $node->[1] } == 2
and $node->[1][0] eq "!")
{
# normalize away !! in expressions.
return $self->_pt_as_str($node->[1][1], $parent_op, $depth);
}
# unop - the payload is an optree
return $node->[0]
. $self->_pt_as_str($node->[1], $node->[0], $depth + 1);
}
}
# if we get here we are dealing with a binary operator
# the nodes are not necessarily binary, as we "collect"
# the terms into a list, thus: A && B && C && D -> ['&&',A,B,C,D]
my ($op, @terms)= @$node;
# convert the terms to strings
@terms= map { $self->_pt_as_str($_, $op, $depth + 1) } @terms;
# sort them to normalize the subexpression
my $expr=
join " $op ", $associative{$op}
? $self->_sort_terms(@terms)
: @terms;
# stick parens on if this is a subexpression
$expr= "( " . $expr . " )" if $depth and !$cmpop{$op};
# and we are done.
return $expr;
}
# Returns the precedence of an operator. Returns 0 if there is no token
# or the next token is not an op, and confesses if it encounters an op it does not
# know.
sub _precedence {
my $self= shift;
my $token= shift // return 0;
my $op= (ref $token ? $token->{op} : $token) // return 0;
return $precedence{$op} // confess "Unknown op '$op'";
}
# entry point into parsing the tokens, checks that we actually parsed everything
# and didn't leave anything in the token stream (possible from a malformed expression)
# Performs some minor textual cleanups using regexes, but then does a proper parse
# of the expression.
sub parse_expr {
my ($self, $expr)= @_;
if (defined $expr) {
$expr =~ s/\\\n//g;
$expr =~ s/\bdefined\s+\(/defined(/g;
$expr =~ s/\bdefined\s+(\w+)/defined($1)/g;
$self->_tokenize_expr($expr);
}
my $ret= $self->_parse_expr();
if (@{ $self->{tokens} }) {
# if all was well with parsing we should not get here.
confess "Unparsed tokens: ", $self->dd($self->{tokens});
}
$self->{parse_tree}= $ret;
return $ret;
}
# this is just a wrapper around _parse_expr_assoc() which handles
# parsing an arbitrary expression.
sub _parse_expr {
my ($self)= @_;
return $self->_parse_expr_assoc($self->_parse_expr_primary(), 1);
}
# This handles extracting from the token stream
# - simple literals
# - unops (assumed to be right associative)
# - parens (which reset the precedence acceptable to the parser)
#
sub _parse_expr_primary {
my ($self)= @_;
my $tokens= $self->{tokens}
or confess "No tokens in _parse_expr_primary?";
my $first= $tokens->[0]
or confess "No primary?";
if ($first->{paren} and $first->{paren} eq "(") {
shift @$tokens;
my $expr= $self->_parse_expr();
$first= $tokens->[0];
if (!$first->{paren} or $first->{paren} ne ")") {
confess "Expecting close paren", $self->dd($tokens);
}
shift @$tokens;
return $expr;
}
elsif ($first->{op} and $unop{ $first->{op} }) {
my $op_token= shift @$tokens;
return [ $op_token->{op}, $self->_parse_expr_primary() ];
}
elsif (defined $first->{literal}) {
shift @$tokens;
return [ $first->{literal} ];
}
else {
die sprintf
"Unexpected token '%s', expecting literal, unary, or expression.\n",
$first->{term};
}
}
# This is the heart of the expression parser. It uses
# a pair of nested loops to avoid excessive recursion during parsing,
# which should be a bit faster than other strategies. It should only
# recurse when the precedence level changes.
sub _parse_expr_assoc {
my ($self, $lhs, $min_precedence)= @_;
my $tokens= $self->{tokens}
or confess "No tokens in _parse_expr_assoc";
my $la= $tokens->[0]; # lookahead
my $la_pr= $self->_precedence($la); # lookahead precedence
while ($la && $la_pr >= $min_precedence) {
my $op_token= shift @$tokens;
my $op_pr= $la_pr; # op precedence
if ($op_token->{op} eq "?") {
my $mid= $self->_parse_expr();
if (@$tokens and $tokens->[0]{op} and $tokens->[0]{op} eq ":") {
shift @$tokens;
my $tail= $self->_parse_expr();
return [ '?', [ $lhs, $mid, $tail ] ];
}
confess "Panic: expecting ':'", $self->dd($tokens);
}
my $rhs;
eval { $rhs= $self->_parse_expr_primary(); }
or die "Error in $op_names{$op_token->{op}} expression: $@";
$la= $tokens->[0];
$la_pr= $self->_precedence($la);
while (
$la_pr > $op_pr || # any and larger
( $is_right_assoc{ $op_token->{op} }
and $la_pr == $op_pr) # right and equal
) {
my $new_precedence= $op_pr + ($la_pr > $op_pr ? 1 : 0);
$rhs= $self->_parse_expr_assoc($rhs, $new_precedence);
$la= $tokens->[0];
$la_pr= $self->_precedence($la);
}
if ( @$lhs >= 3
&& $lhs->[0] eq $op_token->{op}
&& $commutative{ $op_token->{op} })
{
push @$lhs, $rhs;
}
else {
my @lt= ($lhs);
my @rt= ($rhs);
# if we have '( a && b ) && ( c && d)'
# turn it into 'a && b && c && d'
if (@$lhs > 2 && $lhs->[0] eq $op_token->{op}) {
(undef,@lt)= @$lhs; # throw away op.
}
if (@$rhs > 2 && $rhs->[0] eq $op_token->{op}) {
(undef,@rt)= @$rhs; # throw away op.
}
$lhs= [ $op_token->{op}, @lt, @rt ];
}
}
return $lhs;
}
#entry point for normalizing an #if/#elif statement;
#returns the line and condition in normalized form.
sub normalize_if_elif {
my ($self, $line, $line_info)= @_;
if (my $dat= $self->{cache_normalize_if_elif}{$line}) {
return $dat->{line}, $dat->{cond};
}
my ($cond);
eval {
($line, $cond)= $self->_normalize_if_elif($line);
1;
} or die sprintf "Error at line %d\nLine %d: %s\n%s",
($line_info->start_line_num()) x 2, $line, $@;
$self->{cache_normalize_if_elif}{$line}= { line => $line, cond => $cond };
return ($line, $cond);
}
#guts of the normalize_if_elif() - cleans up the line, extracts
#the condition, and then tidies it with tidy_cond().
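# e.g. (illustrative, untested) '#ifdef FOO' becomes
# ('#if defined(FOO)', 'defined(FOO)'), and '#elifndef BAR' becomes
# ('#elif !defined(BAR)', '!defined(BAR)').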
sub _normalize_if_elif {
my ($self, $line)= @_;
my $nl= "";
$nl= $1 if $line =~ s/(\n+)\z//;
$line =~ s/\s+\z//;
my @comment;
push @comment, $1 while $line =~ s!\s*(/\*.*?\*/)\z!!;
$line =~ s/defined\s*\(\s*(\w+)\s*\)/defined($1)/g;
$line =~ s/!\s+defined/!defined/g;
if ($line =~ /^#((?:el)?if)(n?)def\s+(\w+)/) {
my $if= $1;
my $not= $2 ? "!" : "";
$line= "#$if ${not}defined($3)";
}
$line =~ s/#((?:el)?if)\s+//
or confess "Bad cond: $line";
my $if= $1;
$line =~ s/!\s+/!/g;
my $old_cond= $line;
my $cond= $self->tidy_cond($old_cond);
warn "cond - $old_cond\ncond + $cond\n"
if $old_cond ne $cond and $self->{debug};
$line= "#$if $cond";
$line .= " " . join " ", reverse @comment if @comment;
$line .= $nl;
return ($line, $cond);
}
# parses a text buffer as though it was a file on disk
# calls parse_fh()
sub parse_text {
my ($self, $text)= @_;
local $self->{parse_source}= "(buffer)";
open my $fh, "<", \$text
or die "Failed to open buffer for read: $!";
return $self->parse_fh($fh);
}
# takes a readable filehandle and parses whatever content is
# returned by reading it. Returns an array of HeaderLine objects.
# this is the main routine for parsing a header file.
sub parse_fh {
my ($self, $fh)= @_;
my @lines;
my @cond;
my @cond_line;
my $last_cond;
local $self->{parse_source}= $self->{parse_source} || "(unknown)";
my $cb= $self->{pre_process_content};
$self->{orig_content}= "";
my $line_num= 1;
while (defined(my $line= readline($fh))) {
my $start_line_num= $line_num++;
$self->{orig_content} .= $line;
while ($line =~ /\\\n\z/ or $line =~ m</(?:\\\n)*\*(?:(?!\*(?:\\\n)*/).)*\s*\z>s) {
defined(my $read_line= readline($fh))
or last;
$self->{orig_content} .= $read_line;
$line_num++;
$line .= $read_line;
}
while ($line =~ m!/(?:\\\n)*\*(.*?)(\*(?:\\\n)*/|\z)!gs) {
my ($inner, $tail)= ($1, $2);
if ($tail eq "") {
confess
"Unterminated comment starting at line $start_line_num\n";
}
elsif ($inner =~ m!/(?:\\\n)*\*!) {
confess
"Nested/broken comment starting at line $start_line_num\n";
}
}
my $raw= $line;
my $type= "content";
my $sub_type= "text";
my $level= @cond;
my $do_pop= 0;
my $flat= $line;
$flat =~ s/\\\n//g;
$flat =~ s!/\*.*?\*/! !gs;
$flat =~ s/\s+/ /g;
$flat =~ s/\s+\z//;
$flat =~ s/^\s*#\s*/#/g;
my $line_info=
HeaderLine->new(start_line_num => $start_line_num, raw => $raw);
my $do_cond_line;
if ($flat =~ /^#/) {
if ($flat =~ m/^(#(?:el)?if)(n?)def\s+(\w+)/) {
my $if= $1;
my $not= $2 ? "!" : "";
my $sym= $3;
$flat =~
s/^(#(?:el)?if)(n?)def\s+(\w+)/$if ${not}defined($sym)/;
}
my $cond; # used in various expressions below
if ($flat =~ /^#endif\b/) {
if (!@cond) {
confess "Not expecting $flat";
}
$do_pop= 1;
$level--;
$type= "cond";
$sub_type= "#endif";
}
elsif ($flat =~ /^#if\b/) {
($flat, $cond)= $self->normalize_if_elif($flat, $line_info);
push @cond, [$cond];
push @cond_line, $line_info;
$type= "cond";
$sub_type= "#if";
}
elsif ($flat =~ /^#elif\b/) {
if (!@cond) {
confess "No if for $flat";
}
$level--;
($flat, $cond)= $self->normalize_if_elif($flat, $line_info);
$cond[-1][-1]= $self->tidy_cond("!($cond[-1][-1])");
$cond_line[-1]= $line_info;
push @{ $cond[-1] }, $cond;
$type= "cond";
$sub_type= "#elif";
}
elsif ($flat =~ /^#else\b/) {
if (!@cond) {
confess "No if for $flat";
}
$level--;
$cond[-1][-1]= $self->tidy_cond("!($cond[-1][-1])");
$cond_line[-1]= $line_info;
$type= "cond";
$sub_type= "#else";
}
elsif ($flat =~ /^#undef\b/) {
$type= "content";
$sub_type= "#undef";
}
elsif ($flat =~ /^#pragma\b/) {
$type= "content";
$sub_type= "#pragma";
}
elsif ($flat =~ /^#include\b/) {
$type= "content";
$sub_type= "#include";
}
elsif ($flat =~ /^#define\b/) {
$type= "content";
$sub_type= "#define";
}
elsif ($flat =~ /^#error\b/) {
$type= "content";
$sub_type= "#error";
}
elsif ($flat =~ /^#\s*\z/) {
# deal with the null directive
# see: https://en.cppreference.com/w/c/preprocessor
# and: https://stackoverflow.com/questions/35207515
$type= "content";
$sub_type= "text";
}
else {
confess "Do not know what to do with $line";
}
if ($type eq "cond") {
# normalize conditional lines
$line= $flat;
$last_cond= $line_info;
}
}
$line =~ s/\n?\z/\n/;
%$line_info= (
cond => copy_aoa(\@cond),
type => $type,
sub_type => $sub_type,
raw => $raw,
flat => $flat,
line => $line,
level => $level,
source => $self->{parse_source},
start_line_num => $start_line_num,
n_lines => $line_num - $start_line_num,
);
push @lines, $line_info;
if ($do_pop) {
$line_info->{inner_lines}=
$line_info->start_line_num - $cond_line[-1]->start_line_num;
pop @cond;
pop @cond_line;
}
if ($type eq "content" and $cb) {
$cb->($self, $lines[-1]);
}
}
if (@cond_line) {
my $msg= "Unterminated conditional block starting line "
. $cond_line[-1]->start_line_num();
$msg .=
" with last conditional operation at line "
. $last_cond->start_line_num()
if $cond_line[-1] != $last_cond;
confess $msg;
}
$self->{lines}= \@lines;
return \@lines;
}
# returns the last lines we parsed.
sub lines { $_[0]->{lines} }
# assuming a line looks like an embed.fnc entry, parse it
# and normalize it, and create an EmbedLine object from it.
sub tidy_embed_fnc_entry {
my ($self, $line_data)= @_;
my $line= $line_data->{line};
return $line if $line =~ /^\s*:/; # Don't tidy comments
return $line unless $line_data->{type} eq "content"; # Nor #if-like
return $line unless $line =~ /\|/; # Nor non-entries
$line =~ s/\s*\\\n/ /g; # Embedded \n to blank
$line =~ s/\s+\z//; # No trailing white space
($line)= expand($line); # No tabs
# Remove any assertions, and save them. This must be done before the
# split because the assertions can contain '|'
$line =~ s/ \b ( assert \s* \( .* ) \z //x;
my $assertions = $1;
# Split into fields
my ($flags, $ret, $name, @args)= split /\s*\|\s*/, $line;
# Sort and remove duplicate flags. Alpha flags are sorted first
my %flag_seen;
$flags = join "", grep !$flag_seen{$_}++,
sort {
my $a_is_word = $a =~ /\w/;
my $b_is_word = $b =~ /\w/;
return $a cmp $b if $a_is_word == $b_is_word;
return -1 if $a_is_word;
return 1;
} split //, $flags;
if ($flags eq "#") { # Could be an attempt at a conditional
die "Not allowed to use only '#' for flags"
. " in 'embed.fnc' at line $line_data->{start_line_num}";
}
if (!$flags) {
die "Missing flags in function definition"
. " in 'embed.fnc' at line $line_data->{start_line_num}\n"
. "Did you forget a line continuation on the previous line?\n";
}
# Normalize the return type and arguments
for ($ret, @args) {
s/(\w)\*/$1 */g;
s/\*\s+(\w)/*$1/g;
s/\*const/* const/g;
}
# Start the output; fields are left justified
my $head= sprintf "%-8s|%-7s", $flags, $ret;
$head .= sprintf "|%*s", -(31 - length($head)), $name;
# Start first argument on next line if $head already extends too far to
# the right
if (@args and length($head) > 32) {
$head .= "\\\n";
$head .= " " x 32;
}
# Add each argument on a separate line
foreach my $ix (0 .. $#args) {
my $arg= $args[$ix];
$head .= "|$arg";
$head .= "\\\n" . (" " x 32) if $ix < $#args;
}
my @assertions;
if ($assertions) {
# Put each assertion into a separate array element
@assertions = split / \s* assert \s* \( /x, $assertions;
shift @assertions; # The split leaves an empty first element
# Trim each assertion, including any trailing semicolon
foreach my $this_assertion (@assertions) {
$this_assertion =~ s/ ^ \s+ //x;
$this_assertion =~ s/ \s+ \z //x;
$this_assertion =~ s/ ; \z //x;
# Restore the delimiter removed by the split
$this_assertion = "assert($this_assertion";
# Each assertion is on a separate line (for now, anyway)
$head .= "\\\n" . (" " x 32);
$head .= $this_assertion;
}
}
$line= $head . "\n";
# Make all lines in this entry the same length; minimum 72
if ($line =~ /\\\n/) {
my @lines= split /\s*\\\n/, $line;
my $len= length($lines[0]);
$len < length($_) and $len= length($_) for @lines;
$len= int(($len + 7) / 8) * 8;
$len= 72 if $len < 72;
$line= join("\\\n",
(map { sprintf "%*s", -$len, $_ } @lines[ 0 .. $#lines - 1 ]),
$lines[-1]);
}
($line)= unexpand($line); # Back to using tabs
$line_data->{embed}= EmbedLine->new(
flags => $flags,
return_type => $ret,
name => $name,
args => \@args,
assertions => \@assertions,
start_line_num => $line_data->{start_line_num},
);
$line =~ s/\s+\z/\n/;
$line_data->{line}= $line;
return $line;
}
# line up the text in a multiline string by a given $fragment
# of text, inserting whitespace in front or behind the $fragment
# to get the text to line up. Returns the text. This is used by
# _wrap_and_line_up_cond() to wrap long conditions and comments
# in the generated code.
sub _line_up_frag {
my ($self, $str, $fragment)= @_;
die "has tabs?!" if $str =~ /\t/;
my @lines= split /\n/, $str;
my $changed= 1;
while ($changed) {
$changed= 0;
foreach my $ix (0 .. $#lines - 1) {
my $f_index= 0;
my $n_index= 0;
while (1) {
$f_index= index($lines[$ix], $fragment, $f_index);
$n_index= index($lines[ $ix + 1 ], $fragment, $n_index);
if ($f_index == -1 or $n_index == -1) {
last;
}
if ($f_index < $n_index) {
my $f_idx= $f_index;
$f_idx-- while substr($lines[$ix], $f_idx, 1) ne " ";
substr($lines[$ix], $f_idx, 0, " " x ($n_index - $f_index));
$changed++;
last;
}
elsif ($n_index < $f_index) {
my $n_idx= $n_index;
$n_idx-- while substr($lines[ $ix + 1 ], $n_idx, 1) ne " ";
substr($lines[ $ix + 1 ],
$n_idx, 0, " " x ($f_index - $n_index));
$changed++;
last;
}
$f_index++;
$n_index++;
}
}
}
my $ret= join "", map { "$_\n" } @lines;
return $ret;
}
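# _fixup_indent: best-effort cleanup of a wrapped directive: repeatedly strip
# a column of whitespace (just after the directive name) that is common to
# every wrapped line, and, when the directive line opens with a '(', pull a
# leading '(' on continuation lines left so the parens line up.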
sub _fixup_indent {
my ($self, $line)= @_;
my @lines= split /\n/, $line;
if ($lines[0]=~/^(#\s*\w+(?:\s*\/\*)?\s)(\s+)/) {
my $first_left_len = length $1;
while (1) {
my $ok = 1;
for (@lines) {
/^.{$first_left_len} /
or do { $ok = 0; last; };
}
if ($ok) {
s/^(.{$first_left_len}) /$1/ for @lines;
} else {
last;
}
}
}
if ($lines[0]=~/^(#\s*\w+\s+)\(/) {
my $len = length($1);
for my $idx (1..$#lines) {
$lines[$idx]=~s/^([ ]{$len})(\s+)(\()/$1$3$2/;
}
}
my $ret= join "", map { "$_\n" } @lines;
return $ret;
}
# this is the workhorse for _break_line_at_op().
sub __break_line_at_op {
my ($self, $limit, $line, $blank_prefix)= @_;
my @lines= ("");
while (length $line) {
my $part;
if ($line =~ s/^(.*?(?:\|\||&&)\s+)//) {
$part= $1;
}
else {
$part= $line;
$line= "";
}
if (length($lines[-1]) + length($part) < $limit) {
$lines[-1] .= $part;
}
else {
push @lines, $blank_prefix . $part;
}
}
return \@lines;
}
# Break a condition line into parts, while trying to keep the last
# token on each line being an operator like || or && or ? or : We try
# to keep each line at $limit characters, however, we also try to
# ensure that each line has the same number of operators on it such
# that across all the lines there are only two counts of operators (eg,
# we either way each line to have two operators on it, or 0, or 1 or 0,
# or 2 or 1, and so on.) If we cannot meet this requirement we reduce
# the limit by 1 and try again, until we meet the objective, or the
# limit ends up at 70 chars or less.
sub _break_line_at_op {
my ($self, $limit, $line, $blank_prefix)= @_;
my $lines;
while (1) {
$lines= $self->__break_line_at_op($limit, $line, $blank_prefix);
my %op_counts;
foreach my $line_idx (0 .. $#$lines) {
my $line= $lines->[$line_idx];
my $count= 0;
$count++ while $line =~ /(\|\||&&|\?|:)/g;
$op_counts{$count}++;
}
if ($limit <= $self->{min_break_width} || keys(%op_counts) <= 2) {
last;
}
$limit--;
}
s/\s*\z/\n/ for @$lines;
return join "", @$lines;
}
sub _max { # can't use List::Util::max here (miniperl), so we roll our own
my $max= shift;
$max < $_ and $max= $_ for @_;
return $max;
}
# take a condition, split into $type and $rest
# wrap it, and try to line up operators and defined() functions
# that it contains. This is rather horrible code, but it does a
# reasonable job applying the heuristics we need to lay out the
# conditions in a reasonable way.
sub _wrap_and_line_up_cond {
my ($self, $type, $rest)= @_;
my $limit= $self->{max_width};
# extract the expression part of the line, and normalize it, we do
# this here even though it might be duplicative as it is possible
# that the caller code has munged the expression in some way, and we
# might want to simplify the expression first. Eg:
# 'defined(FOO) && (defined(BAR) && defined(BAZ))' should be turned into
# 'defined(FOO) && defined(BAR) && defined(BAZ)' if possible.
my $rest_head= "";
my $rest_tail= "";
if ($rest =~ s!(if\s+)!!) {
$rest_head= $1;
}
if ($rest =~ s!(\s*/\*.*?\*/)\s*\z!! || $rest =~ s!(\s*\*/\s*)\z!!) {
$rest_tail= $1;
}
if ($rest) {
$rest= $self->tidy_cond($rest);
$rest= $rest_head . $rest . $rest_tail;
}
my $l= length($type);
my $line= $type;
$line .= $rest if length($rest);
my $blank_prefix= " " x $l;
# at this point we have a single line with the entire expression on it
# if it fits on one line we are done, we can return it right away.
if (length($line) <= $limit) {
$line =~ s/\s*\z/\n/;
return $line;
}
my $rest_copy= $rest;
my @fragments;
my $op_pat= qr/(?:\|\||&&|[?:])/;
# does the $rest contain a parenthesized group? If it does then
# there is a mixture of different ops being used, since if it were all
# the same op there would not be a parenthesized group.
# If it does then we handle it differently, and try to put the
# different parts of the expression on their own line.
if ($rest_copy =~ /$op_pat\s*\(/) {
my @parts;
while (length $rest_copy) {
if ($rest_copy =~ s/^(.*?$op_pat)(\s*!?\()/$2/) {
push @parts, $1;
} else {
#$rest_copy=~s/^\s+//;
push @parts, $rest_copy;
last;
}
}
$parts[0]= $type . $parts[0];
$parts[$_]= $blank_prefix . $parts[$_] for 1 .. $#parts;
foreach my $line (@parts) {
if (length($line) > $limit) {
$line= $self->_break_line_at_op($limit, $line, $blank_prefix);
}
}
s/\s*\z/\n/ for @parts;
$line= join "", @parts;
@fragments= ("defined", "||");
}
else {
# the expression consists of just one opcode type, so we can use
# simpler logic to break it apart with the objective of ensuring
# that the lines are similarly formed with trailing operators on
# each line but the last.
@fragments= ("||", "defined");
$line= $self->_break_line_at_op($limit, $type . $rest, $blank_prefix);
}
# try to line up the text on different lines. We stop after
# the first $fragment that modifies the text. The order
# of fragments we try is determined above based on the type
# of condition this is.
my $pre_line= $line;
foreach my $fragment (@fragments) {
$line= $self->_line_up_frag($line, $fragment);
last if $line ne $pre_line;
}
# if we have lined up by "defined" in _line_up_frag()
# then we may have "||      defined(...)" type expressions;
# convert these to "      || defined(...)" as it looks better.
$line =~ s/( )(\|\||&&|[()?:])([ ]{2,})(!?defined)/$3$2$1$4/g;
$line =~ s/(\|\||&&|[()?:])[ ]{10,}/$1 /g;
# add back the line continuations. this is all pretty inefficient,
# but it works nicely.
my @lines= split /\n/, $line;
my $last= pop @lines;
my $max_len= _max(map { length $_ } @lines);
$_= sprintf "%*s \\\n", -$max_len, $_ for @lines;
$last .= "\n";
$line= join "", @lines, $last;
# remove line continuations that are inside of a comment,
# we may have a variable number of lines of the expression
# or parts of lines of the expression in a comment, so
# we do this last.
$line =~ s!/\* (.*) \*/
!"/*"._strip_line_cont("$1")."*/"!xsge;
return $self->_fixup_indent($line);
}
#remove line continuations from the argument.
sub _strip_line_cont {
my ($string)= @_;
$string =~ s/\s*\\\n/\n/g;
return $string;
}
# Takes an array of HeaderLine objects produced by parse_fh()
# or by group_content(), and turns it into a string.
sub lines_as_str {
my ($self, $lines, $post_process_content)= @_;
$lines ||= $self->{lines};
my $ret;
$post_process_content ||= $self->{post_process_content};
my $filter= $self->{filter_content};
my $last_line= "";
#warn $self->dd($lines);
foreach my $line_data (@$lines) {
my $line= $line_data->{line};
my $is_define = $line_data->is_define();
if (
$line_data->{type} ne "content"
or $line_data->{sub_type} ne "text"
or $is_define
)
{
my $level= $line_data->{level};
my $ind= $self->indent_chars($level);
if ($self->{indent_define} and $self->{hug_define} and $is_define) {
$line =~ s/^\s*#(\s*)/$ind#/;
} elsif (!$is_define or $self->{indent_define}) {
$line =~ s/^\s*#(\s*)/#$ind/;
}
}
if ($line_data->{type} eq "cond") {
my $add_commented_expr_after= $self->{add_commented_expr_after};
if ($line_data->{sub_type} =~ /#(?:else|endif)/) {
my $joined= join " && ",
map { "($_)" } @{ $line_data->{cond}[-1] };
my $cond_txt= $self->tidy_cond($joined);
$cond_txt= "if $cond_txt" if $line_data->{sub_type} eq "#else";
$line =~ s!\s*\z! /* $cond_txt */\n!
if ($line_data->{inner_lines}||0) >= $add_commented_expr_after;
}
elsif ($line_data->{sub_type} eq "#elif") {
my $last_frame= $line_data->{cond}[-1];
my $joined= join " && ",
map { "($_)" } @$last_frame[ 0 .. ($#$last_frame - 1) ];
my $cond_txt= $self->tidy_cond($joined);
$line =~ s!\s*\z! /* && $cond_txt */\n!
if ($line_data->{inner_lines}||0) >= $add_commented_expr_after;
}
}
$line =~ s/\s*\z/\n/;
if ($last_line eq "\n" and $line eq "\n") {
next;
}
$last_line= $line;
if ($line_data->{type} eq "cond") {
$line =~ m!(^\s*#\s*\w+[ ]*)([^/].*?\s*)?(/\*.*)?\n\z!
or die "Failed to split cond line: $line";
my ($type, $cond, $comment)= ($1, $2, $3);
$comment //= "";
$cond //= "";
my $new_line;
if (!length($cond) and $comment) {
$comment =~ s!^(/\*\s+)!!
and $type .= $1;
}
$line= $self->_wrap_and_line_up_cond($type, $cond . $comment);
}
$line_data->{line}= $line;
if ($post_process_content and $line_data->{type} eq "content") {
$post_process_content->($self, $line_data);
}
if ($filter and $line_data->{type} eq "content") {
$filter->($self, $line_data) or next;
}
$ret .= $line_data->{line};
}
return $ret;
}
# Text::Wrap::wrap has an odd api, so hide it behind a wrapper
# sub which sets things up properly.
sub _my_wrap {
my ($head, $rest, $line)= @_;
local $Text::Wrap::unexpand= 0;
local $Text::Wrap::huge= "overflow";
local $Text::Wrap::columns= 78;
unless (length $line) { return $head }
$line= wrap $head, $rest, $line;
return $line;
}
# recursively extract the && expressions from a parse tree,
# returning the result as strings.
# if $node is not a '&&' op then it returns $node as a string,
# otherwise it returns the string form of the arguments to the
# '&&' op, recursively flattening any '&&' nodes that it might
# contain.
sub _and_clauses {
my ($self, $node)= @_;
my @ret;
if (@$node < 3 or $node->[0] ne "&&") {
return $self->_pt_as_str($node);
}
foreach my $idx (1 .. $#$node) {
push @ret, $self->_and_clauses($node->[$idx]);
}
return @ret;
}
# recursively walk a parse tree, and return the literal
# terms it contains, ignoring any operators in the optree.
sub _terms {
my ($self, $node)= @_;
if (@$node == 1) {
return $self->_pt_as_str($node);
}
my @ret;
if (@$node == 2) {
if ($node->[0] eq "?") {
push @ret, map { $self->_terms($_) } @{ $node->[1] };
}
else {
push @ret, $self->_terms($node->[1]);
}
}
else {
foreach my $i (1 .. $#$node) {
push @ret, $self->_terms($node->[$i]);
}
}
return @ret;
}
# takes a HeaderLine "cond" AoA and flattens it into
# a single expression, and then extracts all the and clauses
# it contains. Thus [['defined(A)'],['defined(B)']] and
# [['defined(A) && defined(B)']] both end up as ['defined(A)','defined(B)']
sub _flatten_cond {
my ($self, $cond_ary)= @_;
my $expr= join " && ", map {
map { "($_)" }
@$_
} @$cond_ary;
return [] unless $expr;
my $tree= $self->parse_expr($expr);
my %seen;
my @and_clause= grep { !$seen{$_}++ } $self->_and_clauses($tree);
return \@and_clause;
}
# Find the best path into a tree of conditions, such that
# we reuse the maximum number of existing branches. Returns
# two arrays: the first contains the parts of $cond_array that
# make up the best path, in best-path order, and the second contains
# the remaining items in the initial order they were provided.
# Thus if we have previously stored only the path "A", "B", "C"
# into the tree, and want to find the best path for
# ["E","D","C","B","A"] we should return: ["A","B","C"],["E","D"],
#
# This is used to reduce the number of conditions in the grouped content,
# and is especially helpful when dealing with DEBUGGING-related
# functionality. It is coupled with careful control over the order
# that we add paths and conditions to the tree.
sub _best_path {
my ($self, $tree_node, $cond_array, @path)= @_;
my $best= \@path;
my $rest= $cond_array;
foreach my $cond (@$cond_array) {
if ($tree_node->{$cond}) {
my ($new_best, $new_rest)=
$self->_best_path($tree_node->{$cond},
[ grep $_ ne $cond, @$cond_array ],
@path, $cond);
if (@$new_best > @$best) {
($best, $rest)= ($new_best, $new_rest);
}
}
}
if (@$best == @path) {
foreach my $cond (@$cond_array) {
my $not_cond= $self->tidy_cond("!($cond)");
if ($tree_node->{$not_cond}) {
$best= [ @path, $cond ];
$rest= [ grep $_ ne $cond, @$cond_array ];
last;
}
}
}
return ($best, $rest);
}
sub HeaderLine::reduce_conds {
# Reduce the preprocessor conditionals that guard the HeaderLine $self
# object, given a hash that says the values of certain preprocessor
# conditions, and a corresponding regular expression pattern that matches
# any of those conditions.
#
# This returns 0 if the conditionals as a whole evaluate to false; 1 if they
# evaluate to true given the constraints; and otherwise the reduced,
# still-unresolved, conditional expression (which is also a true value).
my ($self, $constraints_pat, $constraints_ref) = @_;
# Short cut the common case of there being no conditions in effect.
return 1 if $self->{cond}->@* == 0;
# We need a copy so we don't destroy the input.
my @cond = copy_aoa($self->{cond});
return _reduce_conds(\@cond, $constraints_pat, $constraints_ref);
}
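# Illustrative (untested) sketch of the above: for a line whose {cond} is
#   [ ['defined(A)'], ['defined(B) || defined(C)'] ]
# and constraints saying defined(A) is known true and defined(B) known false:
#   my $pat = qr/(defined\(A\)|defined\(B\))/;
#   my $ref = { 'defined(A)' => 1, 'defined(B)' => 0 };
#   my $reduced = $hl->reduce_conds($pat, $ref);  # yields "defined(C)"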
sub _reduce_conds {
my ($cond_ref, $constraints_pat, $constraints_ref, $recursed) = @_;
# This does the heavy lifting for HeaderLine::reduce_conds(). It
# recursively descends the array $cond_ref of conditions. These have been
# normalized by HeaderParser so that each element is ANDed with all the
# other ones. Hence if any element evaluates to false, the whole array
# must.
#
# Each leaf node will have a series of conditionals (typically linked by
# '||'), each of which has been normalized by HeaderParser to look like
# either of:
# defined(foo)
# !defined(foo)
# All the ones of these that match $constraints_pat are substituted by
# their corresponding value in $constraints_ref, which will be either 0 or
# 1. The result is looked at to see if it has to evaluate to 0 or not.
# Any term whose definedness isn't known is left as-is, so the result of
# this function is a string of those, with the known ones removed. In
# many cases, given the typical inputs we have, the result will be reduced
# to just '0' or '1'.
# At leaf nodes, just change all defined(foo) whose values of foo are
# known to be those values, reducing them to "#if 0" or "#if 1". The
# unknown values are left as-is, that is, as strings
if (ref $cond_ref ne "ARRAY") {
die "Expecting an array at top-level call" unless $recursed;
$cond_ref =~ s/$constraints_pat/$constraints_ref->{$1}/g;
reduce_ones_and_zeros(\$cond_ref);
return $cond_ref;
}
# Here, the conditions are in an array. We recurse to handle each
# element of the array.
my $return = "";
for (my $i = 0; $i < $cond_ref->@*; $i++) {
my $cond = $cond_ref->[$i];
my $this_return = _reduce_conds($cond,
$constraints_pat,
$constraints_ref,
($recursed // 0) + 1 # is recursed
);
# Each element is ANDed with the others; so if this is 0, the result
# will also be 0.
return $this_return if $this_return eq '0';
if ($return =~ / ^ 1? $ /x) {
# Here, value so far contributes nothing to the final result;
# replace it with the new one.
$return = $this_return;
}
elsif ($this_return ne '1') {
# When $this_return is 1, it contributes nothing; so do nothing.
# Otherwise we will have to AND this new value with what exists.
# Parentheses are needed if this contains lower precedence '||'
$return .= " && ";
if ($this_return =~ /\Q||/) {
$return .= "( $this_return )";
}
else {
$return .= $this_return;
}
}
}
# Didn't do an early return. Nothing conclusively evaluated to 0.
reduce_ones_and_zeros(\$return);
return $return;
}
sub reduce_ones_and_zeros {
my $ref = shift;
# This reduces as much as possible a symbolic C preprocessor expression
# that hopefully has terms that are either 0 or 1. We know that anything
# ANDed with 0 is 0 and anything OR'd with 1 is 1, for example. By
# repeating these and other rules, until nothing more changes, we reduce
# it as much as possible, given what is known. In many cases that this is
# called in, the value gets down to simply 0 or 1.
#
# This is simplistic, not knowing about precedence, for example. khw took
# it from Devel::PPPort. But it works well enough. HeaderParser could be
# enlisted to make it work better.
my $any_changed = 0;
if (ref $ref eq 'ARRAY') {
foreach my $element ($ref->@*) {
$any_changed |= reduce_ones_and_zeros(\$element)
if $element =~ / ^ \# \s* (?: if | el ) /x;
}
return $any_changed;
}
my $changed;
do {
$changed = 0;
# (0) -> 0
$changed |= $$ref =~ s/ \( \s* 0 \s* \) /0/xg;
# !0 -> 1
$changed |= $$ref =~ s/ ! \s* 0 \b /1/xg;
# '|| 0 ||' -> ||
$changed |= $$ref =~ s/ \s* \|\| \s* 0 \s* \|\| /||/xg;
# '^ 0 || foo' -> foo
# '(0 || foo' -> (foo
$changed |= $$ref =~ s/ ^ \s* 0 \s* \|\| \s* //xg;
$changed |= $$ref =~ s/ \( \s* 0 \s* \|\| \s* /(/xg;
# 'foo || 0 ) ' -> foo
# 'foo || 0 $ ' -> foo
$changed |= $$ref =~ s/ \s* \|\| \s* 0 \s* (?= $ | \) ) //xg;
# '^ 0 && foo' doesn't work because of precedence: '0 && anything || 1'
# Similarly for 'foo && 0 $'
# (1) -> 1
$changed |= $$ref =~ s/ \( \s* 1 \s* \) /1/xg;
# !1 -> 0
$changed |= $$ref =~ s/ ! \s* 1 \b /0/xg;
# '&& 1 &&' -> &&
$changed |= $$ref =~ s/ \s* && \s* 1 \s* && /&&/xg;
# '^ 1 && foo' -> foo
# '^ 1 || foo' -> 1 # Works cause || lower precedence than &&
$changed |= $$ref =~ s/ ^ \s* 1 \s* && \s* //xg;
#$changed |= $$ref =~ s/ ^ \s* 1 \s* \|\| .* /1/xg;
# '(1 && foo' -> (foo
$changed |= $$ref =~ s/ \( \s* 1 \s* && \s* /(/xg;
# 'foo && 1 [ )$ ] ' -> foo
$changed |= $$ref =~ s/ \s* && \s* 1 \s* (?= $ | \) ) //xg;
# There are other things that could be reduced, but looking for
# just the defined(foo) case doesn't involve fancy parsing,
# and catches just about everything
# 'defined(foo) && 0' -> 0
$changed |= $$ref
=~ s/ (?: ! \s*)? \b defined \s* \(\w+\)
\s* && \s* 0 \b /0/xg;
# 'defined(foo) || 1' -> 1
$changed |= $$ref
=~ s/ (?: ! \s*)? \b defined \s* \(\w+\)
\s* \|\| \s* 1 \b /1/xg;
# '0 && defined(foo)' -> 0
$changed |= $$ref
=~ s/ \b 0 \s* && \s* (?: ! \s*)? defined
\s* \(\w+\) /0/xg;
# '1 || defined(foo)' -> 1
$changed |= $$ref
=~ s/ \b 1 \s* \|\| \s* (?: ! \s*)? defined
\s* \(\w+\) /1/xg;
$any_changed |= $changed;
} while ($changed);
return $any_changed;
}
# This builds a group content tree from a set of lines. Each content line in
# the original file is added to the tree based on the conditions that apply to
# the content.
#
# The tree is made up of nested HoH's with keys in the HoH being normalized
# clauses from the {cond} data in the HeaderLine objects.
#
# Care is taken to minimize the number of pathways and to reorder clauses to
# reuse existing pathways and minimize the total number of conditions in the
# file.
#
# The '' key of a hash contains an array of the lines that are part of the
# condition that lead to that key. Thus lines with no conditions are in
# @{$tree{''}}, lines with the condition "defined(A) && defined(B)" would be
# in $tree{"defined(A)"}{"defined(B)"}{""}.
#
# The result of this sub is normally passed into __recurse_group_content_tree()
# which converts it back into a set of HeaderLine objects.
#
sub _build_group_content_tree {
my ($self, $lines)= @_;
$lines ||= $self->{lines};
my $filter= $self->{filter_content};
my %seen_normal;
foreach my $line_data (@$lines) {
next if $line_data->{type} ne "content";
next if $filter and !$filter->($self, $line_data);
my $cond_frames= $line_data->{cond};
my $cond_frame= $self->_flatten_cond($cond_frames);
my $flat_merged= join " && ", map "($_)", @$cond_frame;
my $normalized;
if (@$cond_frame) {
$normalized= $self->tidy_cond($flat_merged);
}
else {
$normalized= $flat_merged; # empty string
}
push @{ $seen_normal{$normalized} }, $line_data;
}
my @debugging;
my @non_debugging;
foreach my $key (keys %seen_normal) {
if ($key =~ /DEBUGGING/) {
push @debugging, $key;
}
else {
push @non_debugging, $key;
}
}
@non_debugging=
sort { length($a) <=> length($b) || $a cmp $b } @non_debugging;
@debugging= sort { length($b) <=> length($a) || $a cmp $b } @debugging;
my %tree;
foreach my $normal_expr (@non_debugging, @debugging) {
my $all_line_data= $seen_normal{$normal_expr};
my $cond_frame=
(length $normal_expr)
? $self->_flatten_cond([ [$normal_expr] ])
: [];
@$cond_frame= $self->_sort_terms(@$cond_frame);
my $node= \%tree;
my ($best, $rest)= $self->_best_path($node, $cond_frame);
die sprintf "Woah: %d %d %d", 0 + @$best, 0 + @$rest, 0 + @$cond_frame
unless @$best + @$rest == @$cond_frame;
foreach my $cond (@$best, @$rest) {
$node= $node->{$cond} ||= {};
}
push @{ $node->{''} }, @$all_line_data;
}
warn $self->dd(\%tree) if $self->{debug};
$self->{tree}= \%tree;
return \%tree;
}
sub _recurse_group_content_tree {
my ($self, $node, @path)= @_;
my @ret;
local $self->{rgct_ret}= \@ret;
local $self->{line_by_depth}= [];
$self->__recurse_group_content_tree($node, @path);
return \@ret;
}
# take a tree of conditions constructed by _build_group_content_tree()
# and turn it into a set of HeaderLine objects that represents it. Performs the
# appropriate steps required to reconstitute an if/elif/elif/else sequence
# by calling _handle_else().
sub __recurse_group_content_tree {
my ($self, $node, @path)= @_;
my $depth= 0 + @path;
my $ind= $self->indent_chars($depth);
my $ret= $self->{rgct_ret};
if ($node->{''}) {
if (my $cb= $self->{post_process_grouped_content}) {
$cb->($self, $node->{''}, \@path);
}
if (my $cb= $self->{post_process_content}) {
$cb->($self, $_, \@path) for @{ $node->{''} };
}
push @$ret, map {
HeaderLine->new(
%$_,
cond => [@path],
level => $depth,
start_line_num => 0 + @$ret
)
} @{ $node->{''} };
}
my %skip;
foreach my $expr (
map { $_->[0] }
sort { $a->[1] cmp $b->[1] || $a->[0] cmp $b->[0] }
map { [ $_, lc($_) =~ s/[^A-Za-z0-9]+//gr ] } keys %$node
) {
next unless length $expr; # ignore payload
my $not= $self->tidy_cond("!($expr)");
if ($skip{$expr} or ($not !~ /^!/ and $node->{$not})) {
next;
}
my $kid= $node->{$expr};
while (!$node->{$not} and keys(%$kid) == 1 and !$kid->{''}) {
my ($kid_key)= keys(%$kid);
$expr= $self->tidy_cond("($expr) && ($kid_key)");
$kid= $kid->{$kid_key};
my $new_not= $self->tidy_cond("!($expr)");
if ($node->{$new_not}) {
$not= $new_not;
$skip{$not}++;
}
}
my $raw= "#${ind}if $expr\n";
my $hl= HeaderLine->new(
type => "cond",
sub_type => "#if",
raw => $raw,
line => $raw,
level => $depth,
cond => [ @path, [$expr] ],
start_line_num => 0 + @$ret,
);
$self->{line_by_depth}[$depth]= 0 + @$ret;
push @$ret, $hl;
$self->__recurse_group_content_tree($kid, @path, [$expr]);
if ($node->{$not}) {
$skip{$not}++;
$self->_handle_else($not, $node->{$not}, $ind, $depth, @path,
[$not]);
}
# and finally the #endif
$raw= "#${ind}endif\n";
# we need to extract the condition information from the last line in @ret,
# as we don't know which condition we are ending here. It could be an elif
# from deep in the parse tree for instance.
# So we need to extract the last frame from the cond structure in the last
# line-info in @ret.
# BUT if this last line is itself an #endif, then we need to take the second
# to last line instead, as the endif would have "popped" that frame off the
# condition stack.
my $last_ret= $ret->[-1];
my $idx=
($last_ret->{type} eq "cond" && $last_ret->{sub_type} eq "#endif")
? -2
: -1;
my $end_line= HeaderLine->new(
type => "cond",
sub_type => "#endif",
raw => $raw,
line => $raw,
level => $depth,
cond => [ @path, $last_ret->{cond}[$idx] ],
start_line_num => 0 + @$ret,
inner_lines => @$ret - $self->{line_by_depth}[$depth],
);
undef $self->{line_by_depth}[$depth];
push @$ret, $end_line;
}
return $ret;
}
# this handles the specific case of an else clause, detecting
# when an elif can be constructed, may recursively call itself
# to deal with if/elif/elif/else chains. Calls back into
# __recurse_group_content_tree().
sub _handle_else {
my ($self, $not, $kid, $ind, $depth, @path)= @_;
# extract the first 3 keys - from this we can detect
# which of the three scenarios we have to handle.
my ($k1, $k2, $k3)=
sort { length($a) <=> length($b) || $a cmp $b } keys %$kid;
my $not_k1;
if (length($k1) and defined($k2) and !defined($k3)) {
# if we do not have a payload (length($k1)) and we have exactly
# two keys (defined($k2) and !defined($k3)) we need to compute
# the inverse of $k1, which we will use later.
$not_k1= $self->tidy_cond("!($k1)");
}
my $ret= $self->{rgct_ret};
if (length($k1) and !defined($k2)) {
# only one child, no payload -> elsif $k1
my $sub_expr;
do {
$sub_expr=
!$sub_expr
? $k1
: $self->tidy_cond("($sub_expr) && ($k1)");
$kid= $kid->{$k1};
($k1, $k2)=
sort { length($a) <=> length($b) || $a cmp $b } keys %$kid;
} while length($k1) and !defined $k2;
my $raw= "#${ind}elif $sub_expr\n";
push @{ $path[-1] }, $sub_expr;
my $hl= HeaderLine->new(
type => "cond",
sub_type => "#elif",
raw => $raw,
line => $raw,
level => $depth,
cond => [ map { [@$_] } @path ],
start_line_num => 0 + @$ret,
inner_lines => @$ret - $self->{line_by_depth}[$depth],
);
$self->{line_by_depth}[$depth]= 0 + @$ret;
push @$ret, $hl;
$self->__recurse_group_content_tree($kid, @path);
}
elsif (defined($not_k1) and $not_k1 eq $k2) {
# two children which are complementary, no payload -> elif $k1 else..
my $raw= "#${ind}elif $k1\n";
push @{ $path[-1] }, $k1;
my $hl= HeaderLine->new(
type => "cond",
sub_type => "#elif",
raw => $raw,
line => $raw,
level => $depth,
cond => [ map { [@$_] } @path ],
start_line_num => 0 + @$ret,
inner_lines => @$ret - $self->{line_by_depth}[$depth],
);
$self->{line_by_depth}[$depth]= 0 + @$ret;
push @$ret, $hl;
$self->__recurse_group_content_tree($kid->{$k1}, @path);
$path[-1][-1]= $k2;
$self->_handle_else($k2, $kid->{$k2}, $ind, $depth, @path);
}
else {
# payload, 3+ children, or 2 which are not complementary -> else
my $raw= "#${ind}else\n";
my $hl= HeaderLine->new(
type => "cond",
sub_type => "#else",
raw => $raw,
line => $raw,
level => $depth,
cond => [ map { [@$_] } @path ],
start_line_num => 0 + @$ret,
inner_lines => @$ret - $self->{line_by_depth}[$depth],
);
$self->{line_by_depth}[$depth]= 0 + @$ret;
push @$ret, $hl;
$self->__recurse_group_content_tree($kid, @path);
}
return $ret;
}
# group the content in lines by the conditions that apply to them.
# returns a set of lines representing the new structure
sub group_content {
my ($self, $lines, $filter)= @_;
$lines ||= $self->{lines};
local $self->{filter_content}= $filter || $self->{filter_content};
my $tree= $self->_build_group_content_tree($lines);
return $self->_recurse_group_content_tree($tree);
}
#read a file by name - opens the file and passes the fh into parse_fh().
sub read_file {
my ($self, $file_name, $callback)= @_;
$self= $self->new() unless ref $self;
local $self->{parse_source}= $file_name;
open my $fh, "<", $file_name
or confess "Failed to open '$file_name' for read: $!";
my $lines= $self->parse_fh($fh);
if ($callback) {
foreach my $line (@$lines) {
$callback->($self, $line);
}
}
return $self;
}
# These are utility methods for the HeaderLine objects.
sub HeaderLine::new {
my ($class, %self)= @_;
return bless \%self, $class;
}
sub HeaderLine::cond { $_[0]->{cond} } # AoA
sub HeaderLine::type { $_[0]->{type} }
sub HeaderLine::type_is { return $_[0]->type eq $_[1] ? 1 : 0 }
sub HeaderLine::sub_type { $_[0]->{sub_type} }
sub HeaderLine::sub_type_is { return $_[0]->sub_type eq $_[1] ? 1 : 0 }
sub HeaderLine::raw { $_[0]->{raw} }
sub HeaderLine::flat { $_[0]->{flat} }
sub HeaderLine::line { $_[0]->{line} }
sub HeaderLine::level { $_[0]->{level} }
sub HeaderLine::is_content { return $_[0]->type_is("content") }
sub HeaderLine::is_cond { return $_[0]->type_is("cond") }
sub HeaderLine::is_define { return $_[0]->sub_type_is("#define") }
sub HeaderLine::line_num { $_[0]->{start_line_num} }
sub HeaderLine::inner_lines { $_[0]->{inner_lines} }
sub HeaderLine::n_lines { $_[0]->{n_lines} }
sub HeaderLine::embed { $_[0]->{embed} }
*HeaderLine::start_line_num= *HeaderLine::line_num;
# these are methods for EmbedLine objects
*EmbedLine::new= *HeaderLine::new;
sub EmbedLine::flags { $_[0]->{flags} }
sub EmbedLine::return_type { $_[0]->{return_type} }
sub EmbedLine::name { $_[0]->{name} }
sub EmbedLine::args { $_[0]->{args} } # array ref
sub EmbedLine::line_num { $_[0]->{start_line_num} }
1;
__END__
=head1 NAME
HeaderParser - A minimal header file parser that can be hooked by other porting
scripts.
=head1 SYNOPSIS
my $o= HeaderParser->new();
my $lines= $o->parse_fh($fh);
=head1 DESCRIPTION
HeaderParser is a tool to parse C preprocessor header files. The tool
understands the syntax of preprocessor conditions, and is capable of creating
a parse tree of the expressions involved, and normalizing them as well.
C preprocessor files are a bit tricky to parse properly, especially with a
"line by line" model. There are a few issues that must be dealt with:
=over 4
=item Line Continuations
Any line ending in "\\\n" (that is backslash newline) is considered to be part
of a longer string which continues on the next line. Processors should delete
the "\\\n" early on when converting to a "real" line, before doing any further
parsing.
=item Comments Acting As A Line Continuation
The rules for header files stipulate that C-style comments are stripped
before processing other content; this means that comments can serve as a form
of line continuation:
#if defined(foo) /*
*/ && defined(bar)
is the same as
#if defined(foo) && defined(bar)
This type of comment usage is often overlooked by people writing header file
parsers for the first time.
=item Indented preprocessor directives
It is easy to forget that there may be multiple spaces between the "#"
character and the directive. It is also easy to forget that there may be spaces
in I<front> of the "#" character. Both of these cases are often overlooked.
=back
The main idea of this module is to provide a single framework for correctly
parsing the content of our header files in a consistent manner. A secondary
purpose is to make various tasks we want to do easier, such as normalizing
content or preprocessor expressions, or just extracting the real "content" of
the file properly.
=head2 new
Construct a new HeaderParser. Options are as follows
=over 4
=item add_commented_expr_after
Specifies how many lines a conditional block must span before a comment is
generated on the closing line of the block showing what expression that close
is for.
=item max_width
Maximum number of columns expected per line.
=item min_break_width
If a conditional clause is longer than this width, HeaderParser will try to
rearrange its terms so that each line is not longer than this.
=item indent_define
Should #define lines be indented when contained in a conditional block that
is indented? The default is yes (1).
=item hug_define
Should the # hug the define or not? When not set (the default) an indented #define
looks like this:
#if whatever
# define X
#endif
When set it looks like this:
#if whatever
#define X
#endif
That is the # is indented, and the define comes immediately afterwards.
=back
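For example (illustrative only; the values shown are the defaults used by
new()), a parser might be constructed like this:
  my $hp= HeaderParser->new(
      add_commented_expr_after => 10,
      max_width                => 78,
      min_break_width          => 70,
      indent_define            => 1,
      hug_define               => 0,
  );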
=head2 parse_fh
This function parses a filehandle into a set of lines. Each line is represented by a hash
based object which contains the following fields:
bless {
cond => [['defined(a)'],['defined(b)']],
type => "content",
sub_type => "#undef",
raw => $raw_content_of_line,
line => $normalized_content_of_line,
level => $level,
source => $filename_or_string,
start_line_num => $line_num_for_first_line,
n_lines => $line_num - $line_num_for_first_line,
}, "HeaderLine"
A "line" in this context is a logical line, and because of line continuations
and comments may contain more than one physical line, and thus more than
one newline, but will always include at least one and will always end with one
(unless there is no newline at the end of the file). Thus
before /*
this is a comment
*/ after \
and continues
will be treated as a single logical line even though the content is
spread over four lines.
=over 4
=item cond
An array of arrays containing the normalized expressions of any C preprocessor
conditional blocks which include the line. Each line currently has its own
copy of the conditions that apply to it, but that may change, so don't alter
this data. The inner arrays may contain more than one element. If so then the
line is part of an "#else" or "#elif" and the clauses should be considered to
be a conjunction when asking "when is this line included?"; however, when
considered as part of an if/elif/else chain, each added clause represents the
most recent condition. In the following you can see how:
before /* cond => [ ] */
#if A /* cond => [ ['A'] ] */
do-a /* cond => [ ['A'] ] */
#elif B /* cond => [ ['!A', 'B'] ] */
do-b /* cond => [ ['!A', 'B'] ] */
#else /* cond => [ ['!A', '!B'] ] */
do-c /* cond => [ ['!A', '!B'] ] */
# if D /* cond => [ ['!A', '!B'], ['D'] ] */
do-d /* cond => [ ['!A', '!B'], ['D'] ] */
# endif /* cond => [ ['!A', '!B'], ['D'] ] */
#endif /* cond => [ ['!A', '!B'] ] */
after /* cond => [ ] */
So in the above we can see how the three clauses of the if produce
a single "frame" in the cond array, but that frame "grows" and changes
as additional else clauses are added. When an entirely new if block
is started (D) it gets its own block. Each endif includes the clause
it terminates.
=item type
This value indicates the type of the line. This may be either 'content' or
'cond'. The sub_type gives finer detail.
=item sub_type
This value gives more detail on the type of the line.
Type | Sub Type
--------+----------
content | text
| #include
| #define
| #error
| #pragma
| #undef
cond | #if
| #elif
| #else
| #endif
Note that there are no '#ifdef' or '#elifndef' or similar expressions. All
expressions of that form are normalized into the '#if defined' form to
simplify processing.
For all sub_types except C<#endif>, the C<cond> array gives the conditions
after the line is executed.
=item raw
This was the raw original text before HeaderParser performed any modifications
to it.
=item line
This is the normalized and modified text after HeaderParser or any callbacks
have processed it.
=item level
This is the "indent level" of a line and corresponds to the number of blocks
that the line is within, not including any blocks that might be created by
the line itself.
before /* level => 0 */
#if A /* level => 0 */
do-a /* level => 1 */
#elif B /* level => 0 */
do-b /* level => 1 */
#else /* level => 0 */
do-c /* level => 1 */
# if D /* level => 1 */
do-d /* level => 2 */
# endif /* level => 1 */
#endif /* level => 0 */
after /* level => 0 */
=back
parse_fh() will throw an exception if it encounters a malformed expression
or input it cannot handle.
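As an illustrative sketch (not part of the module's own examples), the parsed
lines can be walked using the HeaderLine accessors defined at the end of this
file:
  my $lines= $o->parse_fh($fh);
  foreach my $hl (@$lines) {
      next unless $hl->is_content();
      printf "%4d %s", $hl->line_num(), $hl->line();
  }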
=head2 lines_as_str
This function will return a string representation of the lines it is provided
via an array of HeaderLine objects produced by parse_fh() or by group_content().
=head2 group_content
This function will group the text in the file by the conditions which contain
it. This is only useful for files where the content is essentially a list and
where changing the order that lines are output in will not break the resulting
file.
Each content line will be grouped into a structure of nested if/else blocks
(elif will produce a new nested block) such that the content under the control
of a given set of normalized condition clauses is grouped together in the order
they occurred in the file, such that each combined conditional clause is output
only once.
This means a file like this:
#if A
A
#elif K
AK
#else
ZA
#endif
#if B && Q
B
#endif
#if Q && B
BC
#endif
#if A
AD
#endif
#if !A
ZZ
#endif
Will end up looking roughly like this:
#if A
A
AD
#else
ZZ
# if K
AK
# else
ZA
# endif
#endif
#if B && Q
B
BC
#endif
Content at a given block level always goes before conditional clauses
at the same nesting level.
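A minimal usage sketch (illustrative; when called with no arguments
group_content() operates on the most recently parsed lines):
  my $grouped= $o->group_content();
  print $o->lines_as_str($grouped);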
=head2 HOOKS
There are several hooks available: C<pre_process_content>,
C<post_process_content>, and C<post_process_grouped_content>. All of these
hooks will be called with the HeaderParser object as the first argument.
The "process_content" callbacks will be called with a line hash as the second
argument, and C<post_process_grouped_content> will be called with an
array of line hashes for the content in that group, so that the array may be
modified or sorted. Callbacks called from inside of C<group_content()>
(that is, C<post_process_content> and C<post_process_grouped_content>) will be
called with an additional argument containing an array specifying the actual
conditional "path" to the content (which may differ somewhat from the data in
a line's "cond" property).
These hooks may do what they like, but generally they will modify the
"line" property of the line hash to change the final output returned
by C<lines_as_str()> or C<group_content()>.
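For example, a C<pre_process_content> hook that records every C<#define>
line seen during parsing might look like this (an illustrative sketch; the
file name is hypothetical):
  my @defines;
  my $o= HeaderParser->new(
      pre_process_content => sub {
          my ($self, $line_data)= @_;
          push @defines, $line_data->line() if $line_data->is_define();
      },
  );
  $o->read_file("some_header.h");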
=head2 FORMATTING AND INDENTING
HeaderParser tries hard to produce neat and readable output with a consistent
style and form. For example:
#if defined(FOO)
# define HAS_FOO
# if defined(BAR)
# define HAS_FOO_AND_BAR
# else /* !defined(BAR) */
# define HAS_FOO_NO_BAR
# endif /* !defined(BAR) */
#endif /* defined(FOO) */
HeaderParser uses two space tab stops for indenting C preprocessor
directives. It puts the spaces between the "#" and the directive. The "#" is
considered "part" of the indent, even though the space comes after it. This
means the first indent level "looks" like one space, and following indents
look like 2. This should match what a sensible editor would do with two space
tab stops. The C<indent_chars()> method can be used to convert an indent level
into a string that contains the appropriate number of spaces to go in between
the "#" and the directive.
When emitting "#endif", "#elif" and "#else" directives, comments will also be
emitted to show the conditions that apply. These comments may be wrapped
to cover multiple lines. Some effort is made to get these comments to line up
visually, but it uses heuristics which may not always produce the best result.
=cut