# /u/sy/beebe/bibnet/keywords.awk, Fri Jul  8 09:49:42 2005
# Edit by Nelson H. F. Beebe
# Add zero penalty after slash to allow line breaks and reduce overfull boxes.
#
# /u/sy/beebe/tex/bib/keywords.awk, Sat Oct  5 16:40:04 1996
# Edit by Nelson H. F. Beebe
# Add argument declarations for a few local variables.
#
# /u/sy/beebe/tex/bib/keywords.awk, Sat Jan 27 09:07:48 1996
# Edit by Nelson H. F. Beebe
# Add \path to list of ignored words.
#
# /u/sy/beebe/tex/bib/keywords.awk, Mon Jan  8 18:59:18 1996
# Edit by Nelson H. F. Beebe
# In simplify(), strip leading and trailing quotes.
#
# /u/sy/beebe/tex/bib/keywords.awk, Mon Jan  8 08:56:08 1996
# Edit by Nelson H. F. Beebe
# In do_title(), strip leading and trailing quotes.  In strip_math(),
# protect non-math dollar signs before splitting.
#
# /u/sy/beebe/tex/bib/keywords.awk, Mon May 22 13:07:41 1995
# Edit by Nelson H. F. Beebe
# In brace_balance(), recognize backslashed braces
#
# /u/sy/beebe/tex/bib/keywords.awk, Sat Oct  1 18:19:30 1994
# Edit by Nelson H. F. Beebe
# In outword(), strip font changes from sortkey, and add two
# more fonts to ignore[] list.
#
# /u/sy/beebe/tex/bib/keywords.awk, Fri Sep 23 01:11:53 1994
# Edit by Nelson H. F. Beebe
# Update with selection of several key-specific indexes
#
# /u/sy/beebe/tex/bib/keywords.awk, Tue Mar 15 15:43:25 1994
# Edit by Nelson H. F. Beebe
# Add strip_escapes() to handle {\"word\"} in value strings
#
# /u/sy/beebe/tex/bib/keywords.awk, Mon Oct 11 19:35:58 1993
# Edit by Nelson H. F. Beebe
#
# ======================================================================
# Filter the output of bibparse, producing a sorted list of quadruples
# of sort keys, title words, their citation tags, and their entry types,
# removing unimportant words from the list.  The output is suitable for
# input to fmtwords.awk for incorporation in a bibliography LaTeX
# wrapper file.
#
# Usage:
#       bibparse foo.bib | \
#               gawk -f keywords.awk | \
#                       egrep 'entrytype$' | \
#                               gawk -f fmtwords.awk >foo.ltx
#
# [05-Oct-1996]
# ======================================================================

BEGIN \
{
    FS = "\t"
    IGNORECASE = 1                      # for gawk, not nawk

    # Output records are tab-separated (see the printf in outword()),
    # so sort must split fields on TAB.  The "\t" escape is expanded by
    # awk, so the shell receives a real TAB between the quotes.
    # NOTE(review): the delimiter is reconstructed as TAB to match the
    # record format -- confirm against the pristine script.
    # SORTPIPE = "sort -u -f -t' ' +0 -1 +2 -3"
    SORTPIPE = "sort -u -f -t'\t' -k1,1 -k2,2"

    # Non-printing placeholders used while a value is being tokenized.
    # They are written as octal escapes so that they survive editors and
    # file transfers that drop raw control characters.
    MATH_SPACE    = "\001"              # a blank inside $...$ math mode
    HIDDEN_DOLLAR = "\002"              # a literal (backslashed) dollar sign

    ignore["\\&"] = 1
    ignore["{\\&}"] = 1
    ignore["\\bf"] = 1
    ignore["\\em"] = 1
    ignore["\\sf"] = 1
    ignore["\\sl"] = 1
    ignore["\\tt"] = 1

    # The remaining ignore entries need only be in lower case and without
    # accents, since the sortkey is also matched against this list

    # Notice that the ignore list contains words from several languages
    # used in the UofUtah bibliography collection, and includes all the
    # words in the bibindex/biblook.h BADWORDS list.

    ignore_words("bf em it rm sc sf sl tt")

    ignore_words("a ab aber about all als an and another ar are around as at au auf aus avec az")
    ignore_words("bei bir but by")
    ignore_words("da das dat de dei delle dem den denne der des det dette di die dos du")
    ignore_words("e een eene egy ei ein eine einen einer eines eit el eller en er es et ett eyn eyne")
    ignore_words("for from fuer fur")
    ignore_words("gl gli")
    ignore_words("ha haben had hai has hat have he heis hen hena henas hers het")
    ignore_words("hin hinar hinir hinn his hith ho hoi how")
    ignore_words("i il ili in inc ind inside into is ist it its")
    ignore_words("k ka ke")
    ignore_words("la las le les lo los")
    ignore_words("met mia mit more")
    ignore_words("na ne new nji not now")
    ignore_words("och oder of og on or os others ou out outside over")
    ignore_words("paa par pas per po")
    ignore_words("recent")
    ignore_words("she sie sind so su sur")
    ignore_words("ta than that the then they this through to toward towards")
    ignore_words("uber ud um uma un una und under une uno unter up us")
    ignore_words("van von")
    ignore_words("we what when why with")
    ignore_words("y yet you your yr")
    ignore_words("zu")
}

# Track whether we are inside an @String{...} definition.
$2 == "STRING"  { type = "STRING" }

$2 == "RBRACE"  { type = "" }

# Remember the citation tag of the current entry (quotes removed).
$2 == "KEY" \
{
    key = $3
    gsub(/["]/,"",key)
}

# Remember the name of the field whose value follows (quotes removed).
$2 == "FIELD" \
{
    field = $3
    gsub(/["]/,"",field)
}

($2 == "VALUE") || ($2 == "ABBREV") \
{
    if (type == "STRING")       # parsing @String{name = "value"}
    {
        if ($2 == "ABBREV")
            abbrev = $3
        else
            expansion[abbrev] = $3
    }
    else                        # parsing @Name{label, key = "value", ...}
    {
        # An abbreviation is replaced by the expansion recorded earlier.
        s = ($2 == "ABBREV") ? expansion[$3] : $3
        if (field == "title")
            do_title(s)
        else if ((field == "author") || (field == "editor"))
            do_author_editor(s)
        else if (field == "institution")
            do_institution(s)
        else if (field == "journal")
            do_journal(s)
        else if (field == "publisher")
            do_publisher(s)
        else if (field == "address")
            do_address(s)
    }
}

# Return 1 if word w should be indexed, and 0 if it is an ignored word,
# a pure number, blank, or contains one of the rejected TeX constructs.
function accept(w)
{
    # I don't know why the ignore[] list test fails to
    # catch \&, \tt, et al, so filter them a second time

    # The index() call eliminates \verb|!%@| in Frey and Adams book
    if (!(tolower(w) in ignore) && \
        (w !~ /^[0-9.]+$/) && \
        (w !~ /^\\[a-z][a-z]$/) && \
        (w !~ /^ *$/) && \
        (w !~ /\\fguill/) && \
        (w !~ /\\oguill/) && \
        (w !~ /\\footnote/) && \
        (w !~ /\\path/) && \
        (w !~ /\\verb/) && \
        (w !~ /\\ldots/) && \
        (index(w,"%") == 0) && \
        (w !~ /^\\&$/))
        return 1
    else
        return 0
}

# Return s with closing braces appended, or opening braces prepended,
# until the counts of { and } agree.  A brace immediately preceded by a
# backslash is not counted.
function brace_balance(s, k,t,nopen,nclose)
{
    nopen = 0
    nclose = 0
    for (k = 1; k <= length(s); ++k)
    {
        if ((k > 1) && (substr(s,k-1,1) == "\\"))
            ;                   # ignore backslashed braces
        else if (substr(s,k,1) == "{")
            nopen++
        else if (substr(s,k,1) == "}")
            nclose++
    }
    t = s
    while (nopen > nclose)
    {
        t = t "}"
        nclose++
    }
    while (nopen < nclose)
    {
        t = "{" t
        nopen++
    }
    return (t)
}

# Index the words of an address value.
function do_address(s)
{
    # if (s ~ /^\"[^\\]/)
    #     sub(/^\"/,"",s)
    # if (s ~ /[^\"]\"$/)
    #     sub(/\"$/,"",s)
    do_wordlist(s)              # simple code for now
}

# Index the words of an author or editor value.
function do_author_editor(s)
{
    do_address(s)               # simple code for now
}

# Index the words of an institution value.
function do_institution(s)
{
    do_address(s)               # simple code for now
}

# Index the words of a journal value.
function do_journal(s)
{
    do_address(s)               # simple code for now
}

# Index the words of a publisher value.
function do_publisher(s)
{
    do_address(s)               # simple code for now
}

# Index every acceptable word of title value s, and additionally the
# hyphen- and slash-separated parts of compound non-math words.
function do_title(s, k,m,n,np,parts,value,w,words)
{
    value = strip_escapes(strip_math(s))
    if (debug) print "DEBUGvv: [",value,"]" >"/dev/tty"
    gsub(/:/," ",value)         # change colons to spaces (needed in KEY)
    sub(/^"\\"/,"",value)       # remove surrounding "\"...\""
    sub(/\\""$/,"",value)       # on title string
    if (debug) print "DEBUGww: [",value,"]" >"/dev/tty"
    n = split(value,words," ")
    for (k = 1; k <= n; ++k)
    {
        w = simplify(words[k])
        if (debug) print "DEBUGxx: [",words[k],"] -> [",w,"]" >"/dev/tty"
        if (accept(w))
        {
            outword(w)
            index_parts(w,"-")  # compound non-math word: index parts
            index_parts(w,"/")  # compound word: index parts
        }
    }
}

# Index the words of value s.
function do_wordlist(s, k,m,n,np,parts,value,w,words)
{
    # similar to do_title(), except hyphens don't split words
    # and single letters (usually initials) don't get indexed
    value = strip_escapes(strip_math(s))
    # gsub(/:/,";",value)       # change colons to semicolons (needed in KEY)
    n = split(value,words," ")
    for (k = 1; k <= n; ++k)
    {
        w = simplify(words[k])
        if ((length(w) > 1) && accept(w))
        {
            outword(w)
            index_parts(w,"/")  # compound word: index parts
        }
    }
}

# Add each blank-separated word in list to the ignore[] table.
function ignore_words(list, k,n,words)
{
    n = split(list,words," ")
    for (k = 1; k <= n; ++k)
        ignore[words[k]] = 1
}

# If w contains separator sep and no dollar sign (i.e., no math), split
# it at sep and index each acceptable part in simplified form.
function index_parts(w,sep, m,np,parts)
{
    if ((index(w,"$") > 0) || (index(w,sep) == 0))
        return
    np = split(w,parts,sep)
    for (m = 1; m <= np; ++m)
    {
        if (accept(parts[m]))
            outword(simplify(parts[m]))
    }
}

# Send one record "sortkey <TAB> word <TAB> key <TAB> field" to the sort
# pipe, unless the sort key is empty or is itself an ignored word.
function outword(w, sortkey,w_orig)
{
    w_orig = w
    gsub(MATH_SPACE," ",w)      # restore math mode spaces
    sortkey = tolower(w)
    if (debug) print "DEBUGyy: [",w_orig,"] -> [",w,"]" >"/dev/tty"
    gsub(/{\\bf|{\\em|{\\it|{\\rm|{\\sc|{\\sf|{\\sl|{\\tt/,"{",sortkey) # eliminate font changes
    gsub(/[\\{}\"'`~ ]/,"",sortkey)
    if ((length(sortkey) > 0) && !(sortkey in ignore))
        printf("%-15s\t%-15s\t%s\t%s\n",sortkey,w,key,field) | SORTPIPE
}

# Return word s with braces balanced, doubled backslashes reduced,
# trailing punctuation, leading l' / all', and surrounding quotes
# removed.  Words containing a {\tt, {\em, or {\bf font change are
# returned unchanged.
function simplify(s, t)
{
    # eliminate unnecessary braces
    if (debug) print "ONE: simplify: " s >"/dev/tty"
    if (s ~ /{\\tt/)
        return s
    else if (s ~ /{\\em/)
        return s
    else if (s ~ /{\\bf/)
        return s
    else
    {
        t = brace_balance(s)
        gsub(/{}}/,"}",t)       # reduce {\TeX{}} to {\TeX}
        gsub(/\\\\["]/,"\\\"",t)# awk's input doubled backslashes: reduce them
        gsub(/\\\\/,"\\",t)     # awk's input doubled backslashes: reduce them
        gsub(/[.;:,!?]$/,"",t)  # eliminate trailing punctuation
        gsub(/^l'/,"",t)        # change French l'usage to usage
        gsub(/^all'/,"",t)      # change Italian all'uso to uso
        gsub(/^[`]+/,"",t)      # remove leading quotes
        gsub(/[']+$/,"",t)      # remove trailing quotes
        # print "FOUR: s = [" s "]"
        # print " t = [" t "]"
        if (debug) print "TWO: simplify: " t >"/dev/tty"
        return t
    }
}

# Return s with the quote escapes next to braces removed.
function strip_escapes(s)
{
    gsub(/{\\"/,"{",s)          # reduce \"word\" to {word}
    gsub(/\\"}/,"}",s)
    return (s)
}

# Return s with its non-math segments cleaned by strip_word() and with
# the blanks inside $...$ segments replaced by MATH_SPACE, so that each
# math segment survives the later split into words as a single word.
# outword() turns MATH_SPACE back into a blank.
function strip_math(s, k,n,parts,t,u)
{
    u = s
    gsub(/\\\\\$/,HIDDEN_DOLLAR,u)      # hide literal dollar signs
    if (debug) print "DEBUGaa: [",s,"] -> [",u,"]" >"/dev/tty"
    n = split(u,parts,"$")
    for (k = 1; k <= n; k++)    # protect spaces in math mode sections
    {
        if (k % 2)              # non-math part
            t = (k > 1) ? t "$ " strip_word(parts[k]) : strip_word(parts[k])
        else                    # math mode: protect spaces
        {
            gsub(/ /,MATH_SPACE,parts[k])
            t = t " $" parts[k]
        }
    }
    gsub(HIDDEN_DOLLAR,"\\$",t) # restore literal dollar signs
    if (debug)                  # set debug=1 to check stripping
    {
        print "ONE: strip_math: " s >"/dev/tty"
        print "TWO: strip_math: " u >"/dev/tty"
        print "THREE: strip_math: " t >"/dev/tty"
    }
    return t
}

# Return non-math text s with dashes, ties, punctuation, possessives,
# and assorted TeX control sequences reduced to blanks or removed.
function strip_word(s, t)
{
    t = s
    gsub(/^ +/,"",t)            # eliminate leading space
    gsub(/ +$/,"",t)            # eliminate trailing space
    gsub(/\\c c/,"\\c_c",t)     # le{\c c}ons -> le{\c_c}ons temporarily
    gsub(/\\c{c}/,"\\c_c",t)    # le{\c{c}}ons -> le{\c_c}ons temporarily
    gsub(/\\emdash[{][}]/," ",t) # change --- separators to space
    gsub(/\\emdash/," ",t)      # change --- separators to space
    gsub(/\\kern/," ",t)        # remove \kern
    gsub(/\\tubissue/," ",t)    # remove \tubissue
    gsub(/---/," ",t)           # change emdash to space
    gsub(/--/," ",t)            # change endash to space

    # [08-Jul-2005] Add zero penalty after slash to allow line breaks and reduce overfull boxes.
    gsub(/\\\\slash */,"/\\penalty0{}",t) # change \slash separators to /

    gsub(/^["]\\["]/,"",t)      # eliminate surrounding quotation marks
    gsub(/\\["]["]$/,"",t)      # eliminate surrounding quotation marks
    gsub(/\\\\-/,"",t)          # eliminate discretionary hyphens

    # [26-Oct-2010] Do we want these two steps?  I think not, but there is still a problem with bogus TeX output
    # gsub(/\\\\['^`~=]/, "", t)                # eliminate common accents
    # gsub(/\\\\[cdeHktuvz][{]/, "{", t)        # eliminate less-common accents

    # NB: the next patterns must NOT reduce any standard TeX accents, so EXCLUDE ['^`=cdktuv]
    gsub(/\\\\[;?!,()<>|+@%\]\[]/, " ", t)      # expand control sequences to space
    gsub(/[;?!,()<>|+@%\]\[]/, " ", t)          # expand punctuation to space

    gsub(/\\~/,"\\tilde",t)     # expand bare ties to blanks
    gsub(/[~]/," ",t)
    gsub(/\\tilde/,"~",t)
    gsub(/'s /," ",t)           # eliminate possessives
    gsub(/'s$/," ",t)           # eliminate possessives
    gsub(/s' /," ",t)           # eliminate possessives
    gsub(/s'$/," ",t)           # eliminate possessives
    gsub(/``/," ",t)            # eliminate doubled apostrophes
    gsub(/''/," ",t)            # eliminate doubled apostrophes
    gsub(/\\\//,"",t)           # eliminate italic correction
    gsub(/\\c_c/,"\\c{c}",t)    # restore le{\c_c}ons -> le{\c{c}}ons
    gsub(/\\ /," ",t)           # eliminate \ 
    if (debug)                  # set debug=1 to check stripping
    {
        print "ONE: " s >"/dev/tty"
        print "TWO: " t >"/dev/tty"
    }
    return t
}