# /u/sy/beebe/bibnet/keywords.awk, Fri Jul  8 09:49:42 2005
# Edit by Nelson H. F. Beebe <beebe@math.utah.edu>
# Add zero penalty after slash to allow line breaks and reduce overfull boxes.
#
# /u/sy/beebe/tex/bib/keywords.awk, Sat Oct  5 16:40:04 1996
# Edit by Nelson H. F. Beebe <beebe@plot79.math.utah.edu>
# Add argument declarations for a few local variables.
#
# /u/sy/beebe/tex/bib/keywords.awk, Sat Jan 27 09:07:48 1996
# Edit by Nelson H. F. Beebe <beebe@plot79.math.utah.edu>
# Add \path to list of ignored words.
#
# /u/sy/beebe/tex/bib/keywords.awk, Mon Jan  8 18:59:18 1996
# Edit by Nelson H. F. Beebe <beebe@sundown.math.utah.edu>
# In simplify(), strip leading and trailing quotes.
#
# /u/sy/beebe/tex/bib/keywords.awk, Mon Jan  8 08:56:08 1996
# Edit by Nelson H. F. Beebe <beebe@sundown.math.utah.edu>
# In do_title(), strip leading and trailing quotes.  In strip_math(),
# protect non-math dollar signs before splitting.
#
# /u/sy/beebe/tex/bib/keywords.awk, Mon May 22 13:07:41 1995
# Edit by Nelson H. F. Beebe <beebe@plot79.math.utah.edu>
# In brace_balance(), recognize backslashed braces
#
# /u/sy/beebe/tex/bib/keywords.awk, Sat Oct  1 18:19:30 1994
# Edit by Nelson H. F. Beebe <beebe@plot79.math.utah.edu>
# In outword(), strip font changes from sortkey, and add two
# more fonts to ignore[] list.
#
# /u/sy/beebe/tex/bib/keywords.awk, Fri Sep 23 01:11:53 1994
# Edit by Nelson H. F. Beebe <beebe@sunrise>
# Update with selection of several key-specific indexes
#
# /u/sy/beebe/tex/bib/keywords.awk, Tue Mar 15 15:43:25 1994
# Edit by Nelson H. F. Beebe <beebe@plot79.math.utah.edu>
# Add strip_escapes() to handle {\"word\"} in value strings
#
# /u/sy/beebe/tex/bib/keywords.awk, Mon Oct 11 19:35:58 1993
# Edit by Nelson H. F. Beebe <beebe@plot79.math.utah.edu>
#
# ======================================================================
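# Filter the output of bibparse, producing a sorted list of quadruples
# of sort keys, indexed words, their citation tags, and the names of
# the fields in which they occur (title, author, editor, institution,
# journal, publisher, or address), removing unimportant words from the
# list.  The output is suitable for input to fmtwords.awk for
# incorporation in a bibliography LaTeX wrapper file.
#
# Usage (the egrep pattern selects on the last column, so replace
# entrytype by the name of the field to be indexed, e.g., title):
#    bibparse foo.bib | \
#		gawk -f keywords.awk | \
#		egrep 'entrytype$' | \
#		gawk -f fmtwords.awk >foo.ltx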
#
# [05-Oct-1996]
# ======================================================================
# Program setup: input is tab-separated, matching is case-insensitive
# (gawk only), output lines are piped through sort, and the ignore[]
# table of words that must never be indexed is loaded.
BEGIN {
    FS = "\t"
    IGNORECASE = 1		# for gawk, not nawk

    # Earlier sort commands, kept for reference:
    #   sort -u -f -t'<TAB>' +0 -1 +2 -3
    #   sort -u -f -t'<TAB>' -k1,1 -k2,2
    SORTPIPE = "sort -u -f -t'	' -k1,1 -k2,2 -k3,3"

    ## [22-Mar-2018] add patterns that produce bogus \cite{} command arguments
    ignore_words("/ - \\slash")

    ignore_words("\\& {\\&}")

    ignore_words("\\bf \\em \\sf \\sl \\tt")

    # The remaining ignore entries need only be in lower case and without
    # accents, since the sortkey is also matched against this list.
    # Notice that the ignore list contains words from several languages
    # used in the UofUtah bibliography collection, and includes all the
    # words in the bibindex/biblook.h BADWORDS list.

    ignore_words("bf em it rm sc sf sl tt")

    ignore_words("a ab aber about all als an and another ar are around as at au auf aus avec az")
    ignore_words("bei bir but by")
    ignore_words("da das dat de dei delle dem den denne der des det dette di die dos du")
    ignore_words("e een eene egy ei ein eine einen einer eines eit el eller en er es et ett eyn eyne")
    ignore_words("for from fuer fur")
    ignore_words("gl gli")
    ignore_words("ha haben had hai has hat have he heis hen hena henas hers het")
    ignore_words("hin hinar hinir hinn his hith ho hoi how")
    ignore_words("i il ili in inc ind inside into is ist it its")
    ignore_words("k ka ke")
    ignore_words("la las le les lo los")
    ignore_words("met mia mit more")
    ignore_words("na ne new nji not now")
    ignore_words("och oder of og on or os others ou out outside over")
    ignore_words("paa par pas per po")
    ignore_words("recent")
    ignore_words("she sie sind so su sur")
    ignore_words("ta than that the then they this through to toward towards")
    ignore_words("uber ud um uma un una und under une uno unter up us")
    ignore_words("van von")
    ignore_words("we what when why with")
    ignore_words("y yet you your yr")
    ignore_words("zu")
}

# Mark every blank-separated word of list as ignored: ignore[word] = 1.
# k, n, and words are local variables.
function ignore_words(list, k,n,words)
{
    n = split(list, words, " ")
    for (k = 1; k <= n; ++k)
        ignore[words[k]] = 1
}

# Track whether the parser is inside an @String{...} definition:
# a STRING token sets type, and a closing RBRACE token clears it.
($2 == "STRING") || ($2 == "RBRACE") \
{
    type = ($2 == "STRING") ? "STRING" : ""
}

# Remember the current citation key, without quotation marks.
$2 == "KEY"	{ key = $3;   gsub(/["]/,"",key) }

# Remember the current field name, without quotation marks.
$2 == "FIELD"	{ field = $3; gsub(/["]/,"",field) }

# Handle a field value, given either as a literal (VALUE) or as the
# name of an @String abbreviation (ABBREV).
#
# Inside @String{name = "value"} the ABBREV token names the
# abbreviation and the following VALUE token supplies its expansion.
# Elsewhere the value (or the expansion of the abbreviation) is passed
# to the indexing function for the current field; fields other than
# those listed below are not indexed.
#
# BibTeX ignores letter case in abbreviation names, so names are
# lowercased both when stored and when looked up.  Field names are
# lowercased before comparison so that the dispatch does not depend on
# gawk's IGNORECASE.
($2 == "VALUE") || ($2 == "ABBREV") \
{
    if (type == "STRING")	# parsing @String{name = "value"}
    {
	if ($2 == "ABBREV")
	    abbrev = tolower($3)
	else
	    expansion[abbrev] = $3
    }
    else			# parsing @Name{label, key = "value", ...}
    {
	if ($2 != "ABBREV")
	    s = $3
	else if (tolower($3) in expansion)
	    s = expansion[tolower($3)]
	else			# undefined abbreviation: nothing to index,
	    s = ""		# and no empty expansion[] entry is created

	lcfield = tolower(field)
	if (lcfield == "title")
	    do_title(s)
	else if ((lcfield == "author") || (lcfield == "editor"))
	    do_author_editor(s)
	else if (lcfield == "institution")
	    do_institution(s)
	else if (lcfield == "journal")
	    do_journal(s)
	else if (lcfield == "publisher")
	    do_publisher(s)
	else if (lcfield == "address")
	    do_address(s)
    }
}

# Return 1 if word w is worth indexing, and 0 otherwise.
#
# Several of the tests below repeat what the ignore[] table already
# covers (\&, two-letter font switches); they were added as a second
# filter because the table lookup was observed to miss such words.
function accept(w)
{
    if (w ~ /^[{]*\\[A-Za-z]+[}]*$/)	# discard "{{\booktitle}}" et al
        return 0
    if (tolower(w) in ignore)		# stop word or ignored macro
        return 0
    if (w ~ /^[0-9.]+$/)		# only digits and periods
        return 0
    if (w ~ /^\\[a-z][a-z]$/)		# two-letter control word
        return 0
    if (w ~ /^ *$/)			# empty or blank
        return 0
    if (w ~ /\\fguill/)
        return 0
    if (w ~ /\\oguill/)
        return 0
    if (w ~ /\\footnote/)
        return 0
    if (w ~ /\\path/)
        return 0
    if (w ~ /\\verb/)
        return 0
    if (w ~ /\\ldots/)
        return 0
    if (index(w,"%") != 0)		# eliminates \verb|!%@| in Frey and Adams book
        return 0
    if (w ~ /^\\&$/)
        return 0
    return 1
}

# Return s with enough braces added to make the counts of "{" and "}"
# equal: missing close braces are appended, and missing open braces
# are prepended.  A brace immediately preceded by a backslash is not
# counted.  All arguments after s are local variables.
function brace_balance(s, pos,out,opens,closes,ch,len)
{
    opens = 0
    closes = 0
    len = length(s)
    for (pos = 1; pos <= len; ++pos)
    {
        ch = substr(s,pos,1)
        if ((ch != "{") && (ch != "}"))
            continue
        if ((pos > 1) && (substr(s,pos-1,1) == "\\"))
            continue		# ignore backslashed braces
        if (ch == "{")
            opens++
        else
            closes++
    }

    out = s
    for (; closes < opens; closes++)
        out = out "}"
    for (; opens < closes; opens++)
        out = "{" out
    return (out)
}

# Index the words of an address value s by passing it unchanged to
# do_wordlist().  The other non-title field handlers in this file call
# this function.
function do_address(s)
{
# Disabled code that would strip a surrounding pair of quotation marks
# before indexing:
#    if (s ~ /^\"[^\\]/)
#	sub(/^\"/,"",s)
#    if (s ~ /[^\"]\"$/)
#	sub(/\"$/,"",s)
    do_wordlist(s)		# simple code for now
}

# Index the words of an author or editor value s.  Currently this
# just delegates to do_address().
function do_author_editor(s)
{
    do_address(s)		# simple code for now
}

# Index the words of an institution value s.  Currently this just
# delegates to do_address().
function do_institution(s)
{
    do_address(s)		# simple code for now
}

# Index the words of a journal value s.  Currently this just
# delegates to do_address().
function do_journal(s)
{
    do_address(s)		# simple code for now
}

# Index the words of a publisher value s.  Currently this just
# delegates to do_address().
function do_publisher(s)
{
    do_address(s)		# simple code for now
}

# Index the words of a title value s.
#
# The value is cleaned with strip_math() and strip_escapes(), colons
# become spaces, and a surrounding "\"...\"" is removed.  Each
# blank-separated word is simplified and, if accept() approves it,
# written with outword().  An accepted word that contains no dollar
# sign (that is, no math) is also split at hyphens and then at
# slashes, and each acceptable part is indexed as well.
# All arguments after s are local variables.
function do_title(s, i,j,p,nparts,nseps,nwords,parts,seps,text,w,words)
{
    text = strip_escapes(strip_math(s))

    if (debug) print "DEBUGvv: [",text,"]" >"/dev/tty"

    gsub(/:/," ",text)		# change colons to spaces (needed in KEY)
    sub(/^"\\"/,"",text)	# remove surrounding "\"...\""
    sub(/\\""$/,"",text)	# on title string

    if (debug) print "DEBUGww: [",text,"]" >"/dev/tty"

    nseps = split("- /",seps," ")	# compound-word separators, in order
    nwords = split(text,words," ")
    for (i = 1; i <= nwords; ++i)
    {
        w = simplify(words[i])

        if (debug) print "DEBUGxx: [",words[i],"] -> [",w,"]" >"/dev/tty"

        if (!accept(w))
            continue
        outword(w)

        if (index(w,"$") != 0)	# math word: never split into parts
            continue

        for (j = 1; j <= nseps; ++j)
        {
            if (index(w,seps[j]) == 0)
                continue
            # compound non-math word, so index its parts too
            nparts = split(w,parts,seps[j])
            for (p = 1; p <= nparts; ++p)
            {
                if (accept(parts[p]))
                    outword(simplify(parts[p]))
            }
        }
    }
}

# Index the words of a non-title value s.
#
# Similar to do_title(), except that hyphens do not split words and
# single letters (usually initials) do not get indexed.  A word that
# contains a slash but no dollar sign is also split at the slashes and
# each acceptable part is indexed; the single-letter rule applies to
# those parts as well, so that "A/S" does not produce an entry for "S".
# All arguments after s are local variables.
function do_wordlist(s, k,m,n,np,p,parts,value,w,words)
{
    value = strip_escapes(strip_math(s))
#    gsub(/:/,";",value)	# change colons to semicolons (needed in KEY)
    n = split(value,words," ")
    for (k = 1; k <= n; ++k)
    {
        w = simplify(words[k])
        if ((length(w) > 1) && accept(w))
        {
            outword(w)
            if ((index(w,"$") == 0) && (index(w,"/") > 0))
            {	# then have compound word, so index parts
                np = split(w,parts,"/")
                for (m = 1; m <= np; ++m)
                {
                    p = simplify(parts[m])
                    # same single-letter exclusion as for whole words
                    if ((length(p) > 1) && accept(parts[m]))
                        outword(p)
                }
            }
        }
    }

}

# Write one index line for word w to the sort pipe:
#     sortkey <TAB> word <TAB> citation key <TAB> field name
# The sort key is the lowercased word with font changes, braces,
# backslashes, quotes, ties, and spaces removed.  Nothing is written
# when the sort key is empty or is in the ignore[] table.
# skey and raw are local variables.
function outword(w, skey,raw)
{
    raw = w
    gsub(/\023/," ",w)		# restore math mode spaces
    skey = tolower(w)

    if (debug) print "DEBUGyy: [",raw,"] -> [",w,"]" >"/dev/tty"

    gsub(/{\\bf|{\\em|{\\it|{\\rm|{\\sc|{\\sf|{\\sl|{\\tt/,"{",skey) # eliminate font changes
    gsub(/[\\{}"'`~ ]/,"",skey)

    if (length(skey) == 0)
        return
    if (skey in ignore)
        return
    printf("%-15s\t%-15s\t%s\t%s\n",skey,w,key,field) | SORTPIPE
}

# Return a simplified form of word s for indexing.
#
# Words that contain a {\tt, {\em, or {\bf font change are returned
# unchanged.  Otherwise braces are balanced, doubled backslashes are
# reduced, trailing punctuation and surrounding quotes are removed,
# the leading articles l' and all' are dropped, and control words and
# accents with braced arguments are reduced to the bare argument.
# out is a local variable.
function simplify(s, out)
{
    if (debug) print "ONE: simplify: " s >"/dev/tty"

    if ((s ~ /{\\tt/) || (s ~ /{\\em/) || (s ~ /{\\bf/))
        return s

    out = brace_balance(s)	# eliminate unnecessary braces
    gsub(/{}}/,"}",out)		# reduce {\TeX{}} to {\TeX}
    gsub(/\\\\["]/,"\\\"",out)	# awk's input doubled backslashes: reduce them
    gsub(/\\\\/,"\\",out)	# awk's input doubled backslashes: reduce them

    # Each of the next patterns is anchored at one end of the string,
    # so it can match at most once.
    sub(/[.;:,!?]$/,"",out)	# eliminate trailing punctuation
    sub(/^l'/,"",out)		# change French l'usage to usage
    sub(/^all'/,"",out)		# change Italian all'uso to uso
    sub(/^[`]+/,"",out)		# remove leading quotes
    sub(/[']+$/,"",out)		# remove trailing quotes

    # [26-Jan-2012]: strip all TeX control words of two or more
    # letters with braced arguments, and then TeX accents
    gsub(/\\[A-Za-z][A-Za-z]+{/, "{", out)
    gsub(/\\[a-zH]{/, "{", out)

    if (debug) print "TWO: simplify: " out >"/dev/tty"
    return out
}

# Return s with the escaped quotation marks just inside braces
# removed, so that {\"word\"} becomes {word}.  An escaped quote that
# is not adjacent to a brace is left alone.
function strip_escapes(s, t)
{
    t = s
    gsub(/{\\"/,"{",t)		# drop \" after an open brace
    gsub(/\\"}/,"}",t)		# drop \" before a close brace
    return (t)
}

# Return s prepared for word splitting.
#
# The string is split at dollar signs.  Odd-numbered segments are text
# and are cleaned with strip_word(); even-numbered segments are math,
# whose spaces are replaced by \023 so that each formula survives as a
# single word.  Segments are rejoined with " $" before math and "$ "
# after it.  Backslashed dollar signs are hidden as \001 during the
# split and come back as \$.
# All arguments after s are local variables.
function strip_math(s, i,nseg,out,seg,work)
{
    work = s
    gsub(/\\\\\$/,"\001",work)	# hide literal dollar signs
    if (debug) print "DEBUGaa: [",s,"] -> [",work,"]" >"/dev/tty"

    nseg = split(work,seg,"$")
    for (i = 1; i <= nseg; i++)
    {
        if (i % 2 == 0)		# math mode: protect spaces
        {
            gsub(/ /,"\023",seg[i])
            out = out " $" seg[i]
        }
        else if (i == 1)	# leading non-math part
            out = strip_word(seg[i])
        else			# later non-math part: close the math first
            out = out "$ " strip_word(seg[i])
    }
    gsub(/\001/,"\\$",out)	# restore literal dollar signs

    if (debug)			# set debug=1 to check stripping
    {
        print "ONE:   strip_math: " s >"/dev/tty"
        print "TWO:   strip_math: " work >"/dev/tty"
        print "THREE: strip_math: " out >"/dev/tty"
    }
    return out
}

# Return a copy of the non-math text s with TeX markup and punctuation
# that should not appear in index words removed or turned into spaces.
# The substitutions are applied in the order written, and several of
# them depend on that order (for example, the \c c and \~ sequences
# are protected before a later step would damage them, and restored
# afterwards).  t is a local variable.
#
# NOTE(review): some patterns below match a doubled backslash
# (\\\\slash, \\\\-) and others a single one (\\emdash, \\kern, \\/).
# Presumably this reflects how bibparse writes backslashes; confirm
# against real bibparse output before changing any pattern.
function strip_word(s, t)
{
    t = s
    gsub(/^ +/,"",t)		# eliminate leading space
    gsub(/ +$/,"",t)		# eliminate trailing space
    gsub(/\\c c/,"\\c_c",t)	# le{\c c}ons -> le{\c_c}ons temporarily
    gsub(/\\c{c}/,"\\c_c",t)	# le{\c{c}}ons -> le{\c_c}ons temporarily
    gsub(/\\emdash[{][}]/," ",t)	# change \emdash{} separators to space
    gsub(/\\emdash/," ",t)	# change \emdash separators to space
    gsub(/\\kern/," ",t)	# replace the \kern control word by a space
    gsub(/\\tubissue/," ",t)	# replace the \tubissue control word by a space
    gsub(/---/," ",t)		# change emdash to space
    gsub(/--/," ",t)		# change endash to space

    # [08-Jul-2005] Add zero penalty after slash to allow line breaks and reduce overfull boxes.
    # (The penalty variant is disabled; the plain slash is what is used.)
    # gsub(/\\\\slash */,"/\\penalty0{}",t)	# change \slash separators to /
    gsub(/\\\\slash */,"/",t)	# change \slash separators to /

    gsub(/^["]\\["]/,"",t)	# eliminate leading "\" of surrounding quotation marks
    gsub(/\\["]["]$/,"",t)	# eliminate trailing \"" of surrounding quotation marks
    gsub(/\\\\-/,"",t)		# eliminate discretionary hyphens

    # [26-Oct-2010] Do we want these two steps?  I think not, but there is still a problem with bogus TeX output
    # gsub(/\\\\['^`~=]/, "", t)		# eliminate common accents
    # gsub(/\\\\[cdeHktuvz][{]/, "{", t)	# eliminate less-common accents

    # NB: the next patterns must NOT reduce any standard TeX accents, so EXCLUDE ['^`=cdktuv]
    gsub(/\\\\[;?!,()<>|+@%\]\[]/, " ", t)	# expand control sequences to space
    gsub(/[;?!,()<>|+@%\]\[]/, " ", t)		# expand punctuation to space

    # Change bare ties (~) to blanks without touching the tilde accent:
    # \~ is hidden as \tilde, the remaining ties become spaces, and
    # then each \tilde is turned back into a tilde.
    # NOTE(review): the last step writes "~" rather than "\~", so one
    # backslash is consumed here -- TODO confirm that this is intended.
    gsub(/\\~/,"\\tilde",t)
    gsub(/[~]/," ",t)
    gsub(/\\tilde/,"~",t)

    gsub(/'s /," ",t)		# eliminate possessives
    gsub(/'s$/," ",t)		# eliminate possessives
    gsub(/s' /," ",t)		# eliminate possessives
    gsub(/s'$/," ",t)		# eliminate possessives
    gsub(/``/," ",t)		# eliminate opening double quotes
    gsub(/''/," ",t)		# eliminate closing double quotes
    gsub(/\\\//,"",t)		# eliminate italic correction
    gsub(/\\c_c/,"\\c{c}",t)	# restore le{\c_c}ons -> le{\c{c}}ons
    gsub(/\\ /," ",t)		# eliminate \<space>
    if (debug)			# set debug=1 to check stripping
    {
	print "ONE: " s >"/dev/tty"
	print "TWO: " t >"/dev/tty"
    }
    return t
}