/***********************************************************************
 Find proper names at brace level 0 in titles in BibTeX files, assuming
 string values are delimited by quotation marks, rather than braces.

 To avoid false positives, it is ESSENTIAL that BibTeX key/value pairs
 be on single input lines, such as produced by

        bibclean --max-width 0 filename.bib

 Output is in a standard form that can be used by text editors to jump
 to the line reported in the message, and surrounding context is
 supplied to allow a decision to be made about the jump:

        jfishresboardcan.bib:03585:State        title = "The State of the {West} {Greenland} Current up to 1944",
        jfishresboardcan.bib:03629:Maritime     "Heat Stroke in {Canadian} Maritime Stream Fishes",
        jfishresboardcan.bib:03827:Sea          "Silver Electrodes for Sterilizing Sea-water",

 The word list is somewhat specialized for the fishery journal
 collection, with common nouns that often appear together with proper
 names, as "River" in "Fraser River".

 The word list of 610+ entries is searched linearly for every candidate
 word, so the cost grows as (words in input) x (list size), but in
 practice, execution time on any bibliography is still a second or
 less, so a fancier search algorithm (binary search, or hash table
 lookup) has not been implemented.

 The Proper_Names[] list includes alphabetic words from the list of all
 of the world countries found at

        https://www.worldometers.info/geography/alphabetical-list-of-countries/

 plus all of the proper nouns in the names of all 50 US states, all 10
 Canadian provinces, all 6 Australian states, and their territories.

 NB: It would be more flexible to have the word list supplied in an
 external file, but that complicates tool use, so for now, this
 specialized program is best supplied in a single file that is easily
 updated and recompiled as experience with it grows.

 Usage:
        cc -O3 [-DMAXCONTEXT=nn] [-DMAXLINE=nnnn] find-proper-names.c -o find-proper-names

        ./find-proper-names infile(s)
        ./find-proper-names < infile

 A useful variant is sorting by proper noun candidates, then by file
 and line number:

        ./find-proper-names infile(s) | sort -t: -k3,3 -k1,1 -k2,2n

 [26-Nov-2021]
***********************************************************************/

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Words that are reported when they occur outside braces.
 *
 * Notes for maintainers:
 *   - Matching is exact and case-sensitive (strcmp).
 *   - The scanner only ever produces purely alphabetic words, so entries
 *     containing '.' or '\'' ("Ft.", "St.", "Ste.", "Hawai'i",
 *     "Hawai'ian") can never match; they are retained as written.
 *   - A few historical misspellings ("Artic", "Ellesemere", "Europea",
 *     "Guadaloupe", "Israelk", "Lebanonese", "Myocene") are retained so
 *     that existing matches are unchanged; the correct spellings sit
 *     next to them.
 *   - Every entry MUST be followed by a comma: adjacent string literals
 *     are silently concatenated by the compiler.
 */
static const char *const Proper_Names[] =
{
    "Act", "Afghanistan", "Africa", "African", "Alabama", "Alaska",
    "Alaskan", "Albania", "Alberta", "Aleutian", "Algeria", "Algerian",
    "Amazon", "America", "Andorra", "Angola", "Antarctic",
    "Anthropocene", "Antigua", "Antilles", "Arab", "Arabia", "Arabian",
    "Arc", "Archipelago", "Arctic", "Argentina", "Argentinian",
    "Arizona", "Arkansas", "Arm", "Armenia", "Artic", "Ascension",
    "Asia", "Asian", "Association", "Atlantic", "Atol", "Atoll",
    "Australia", "Australian", "Austria", "Azerbaijan", "Azores",

    "Baffin", "Bahamas", "Bahia", "Bahrain", "Baie", "Baies", "Baja",
    "Bangladesh", "Bangladeshi", "Bank", "Banks", "Barbados", "Barbuda",
    "Barents", "Barrier", "Basin", "Basins", "Bay", "Bayou", "Bays",
    "Beach", "Bear", "Beaufort", "Belarus", "Belgium", "Belize", "Belt",
    "Benin", "Bering", "Bermuda", "Bhutan", "Big", "Bight", "Bissau",
    "Board", "Bolivia", "Borneo", "Bornholm", "Bosnia", "Bothnia",
    "Bothnian", "Botswana", "Branch", "Brazil", "Brazilian",
    "Brazzaville", "Breton", "British", "Brittany", "Brook", "Brunei",
    "Brunswick", "Bulgaria", "Burkina", "Burma", "Burundi",

    "Cabo", "Caicos", "Caldera", "Caledonia", "California", "Cambodia",
    "Cameroon", "Canada", "Canal", "Canary", "Canyon", "Cap", "Cape",
    "Capes", "Carolina", "Caspian", "Cayman", "Celebes", "Central",
    "Chad", "Channel", "Chesapeake", "Chile", "Chilean", "China",
    "Chinese", "Chukchi", "City", "Coast", "Colombia", "Colorado",
    "Columbia", "Commission", "Committee", "Comoros", "Conference",
    "Congo", "Congolese", "Connecticut", "Continent", "Coral",
    "Cornwallis", "Costa", "Cote", "Council", "County", "Cove", "Creek",
    "Croatia", "Croatian", "Cuba", "Cuban", "Current", "Cwm", "Cyclone",
    "Cyprus", "Czech", "Czechia",

    "Dakota", "Dale", "Dam", "Danish", "Delaware", "Delta",
    "Democratic", "Denmark", "Department", "District", "Division",
    "Djibouti", "Dogger", "Dominica", "Dominican", "Dutch",

    "Earthquake", "Easter", "Eastern", "Ecuador", "Edward", "Egypt",
    "Egyptian", "El", "Elephant", "Ellesemere", "Ellesmere", "Emirates",
    "England", "English", "Equatorial", "Erie", "Eritrea", "Estonia",
    "Estuary", "Eswatini", "Ethiopia", "Eurasian", "Europe", "Europea",
    "European", "Experimental", "Extension",

    "Falkland", "Falls", "Faroe", "Faroes", "Faso", "Ferry", "Fiji",
    "Finland", "Fiord", "Firth", "Fjord", "Florida", "Fork", "Formosa",
    "Formosan", "Fort", "Forts", "France", "Fraser", "French", "Ft.",
    "Fuca", "Furrow",

    "Gabon", "Galapagos", "Gambia", "Georgia", "Germany", "Ghana",
    "Glacier", "Golfe", "Gorge", "Gorges", "Grand", "Great", "Greater",
    "Greece", "Greek", "Greenlandic", "Grenada", "Grenadines",
    "Guadaloupe", "Guadeloupe", "Guam", "Guatemala", "Guiana", "Guinea",
    "Gulf", "Guyana", "Gyre",

    "Haiti", "Hampshire", "Harbor", "Harbour", "Hawai'i", "Hawai'ian",
    "Hawaii", "Hawaiian", "Hebrides", "Hemisphere", "Herzegovina",
    "High", "Hill", "Himalaya", "Himalayan", "Holocene", "Holy",
    "Honduras", "Hong", "Hudson", "Hungary", "Huron", "Hurricane",

    "Iceland", "Icelandic", "Idaho", "Illinois", "India", "Indian",
    "Indiana", "Indo", "Indonesia", "Indonesian", "Inlet", "Institute",
    "Inuit", "Inuk", "Iowa", "Iran", "Iranian", "Iraq", "Iraqi",
    "Ireland", "Irish", "Island", "Islands", "Isle", "Isles", "Islet",
    "Israel", "Israeli", "Israelk", "Italian", "Italy", "Ivoire",

    "Jamaica", "Japan", "Japanese", "Jersey", "Jordan", "Jutland",

    "Kansas", "Kazakhstan", "Keewatin", "Kentucky", "Kenya", "Key",
    "Keys", "Kingdom", "Kiribati", "Kitts", "Kong", "Korea", "Korean",
    "Kuwait", "Kuwaiti", "Kyrgyzstan",

    "Labrador", "Lac", "Lagoon", "Laguna", "Lake", "Lakes", "Lanka",
    "Laos", "Latvia", "Law", "Lawrence", "Lebanese", "Lebanon",
    "Lebanonese", "Leone", "Lesotho", "Lesser", "Leste", "Liberia",
    "Libya", "Libyan", "Liechtenstein", "Lithuania", "Little", "Loch",
    "Lock", "Lough", "Louisiana", "Low", "Lower", "Lucia", "Luxembourg",

    "Macedonia", "Mackenzie", "Madagascar", "Madeira", "Madeleine",
    "Magdalen", "Magellan", "Maine", "Malawi", "Malaysia", "Maldives",
    "Mali", "Malta", "Manitoba", "Manx", "Mariana", "Marino",
    "Maritimes", "Marsh", "Marshall", "Martinique", "Maryland",
    "Massachusetts", "Mauritania", "Mauritius", "Meeting", "Melanesia",
    "Mexico", "Michigan", "Micronesia", "Ministry", "Minnesota",
    "Miocene", "Mississippi", "Missouri", "Moldova", "Monaco",
    "Mongolia", "Montana", "Montenegro", "Montserrat", "Monument",
    "Moroccan", "Morocco", "Mount", "Mountain", "Mozambique", "Mt",
    "Myanmar", "Myocene",

    "Namibia", "National", "Nations", "Nauru", "Nebraska", "Nepal",
    "Netherlands", "Nevada", "Nevis", "New", "Newfoundland",
    "Nicaragua", "Niger", "Nigeria", "North", "Northern", "Norway",
    "Norwegian", "Nova", "Nunavut",

    "Occidental", "Ocean", "Oceans", "Ohio", "Oklahoma", "Oman",
    "Ontario", "Oregon", "Oriental", "Orkney", "Orkneys", "Orleans",

    "Pacific", "Pakistan", "Pakistani", "Palau", "Palestine", "Panama",
    "Papua", "Paraguay", "Park", "Parks", "Passage", "Peak",
    "Peninsula", "Peninsulas", "Pennsylvania", "Persian", "Peru",
    "Philippine", "Philippines", "Pitcairn", "Plains", "Plateau",
    "Pleistocene", "Pliocene", "Point", "Poland", "Polynesia",
    "Polynya", "Pond", "Pool", "Port", "Porto", "Ports", "Portugal",
    "Portuguese", "Precambrian", "Prince", "Principe", "Proceedings",
    "Province", "Puerto",

    "Qatar", "Quebec", "Queensland",

    "Range", "Reef", "Regency", "Republic", "Reserve", "Reservoir",
    "Reservoirs", "Reunion", "Rhode", "Rica", "Rico", "Ridge", "Riga",
    "Rise", "River", "Rivers", "Rockies", "Rocky", "Romania", "Russia",
    "Rwanda",

    "Saguenay", "Saint", "Salvador", "Samoa", "San", "Santa", "Santo",
    "Sao", "Saskatchewan", "Saudi", "Scotch", "Scotia", "Scotland",
    "Scots", "Scottish", "Sea", "Seabight", "Seamount", "Seas", "See",
    "Senegal", "Serbia", "Seychelles", "Shallows", "Shanghai", "Shelf",
    "Shetland", "Shetlands", "Shield", "Shoal", "Shore", "Sierra",
    "Singapore", "Site", "Slave", "Slope", "Slough", "Slovakia",
    "Slovenia", "Solomon", "Somali", "Somalia", "Sound", "South",
    "Southern", "Spain", "Spanish", "Spit", "Spring", "Springs", "Spur",
    "Sri", "St", "St.", "State", "States", "Station", "Ste", "Ste.",
    "Storm", "Strait", "Straits", "Stream", "Subcontinent", "Sudan",
    "Sudanese", "Superior", "Suriname", "Swamp", "Swaziland", "Sweden",
    "Switzerland", "Symposia", "Symposium", "Syria", "Syrian",

    "Taiwan", "Taiwanese", "Tajikistan", "Tanzania", "Tasmania",
    "Tennessee", "Terrace", "Territories", "Territory", "Texan",
    "Texas", "Thai", "Thailand", "Tidal", "Tide", "Timor", "Tobago",
    "Togo", "Tome", "Tonga", "Town", "Tract", "Treaty", "Trench",
    "Trinidad", "Trough", "Tunisia", "Tunisian", "Turkey", "Turkish",
    "Turkmenistan", "Turks", "Tuvalu", "Typhoon",

    "USSR", "Uganda", "Ukraine", "United", "Upper", "Uruguay", "Utah",
    "Uzbekistan",

    "Valley", "Vancouver", "Vanuatu", "Venezuela", "Verde", "Vermont",
    "Victoria", "Viet", "Vietnam", "Vincent", "Virginia", "Virginian",
    "Volcano",

    "Wales", "Washington", "Waterfall", "Waterfalls", "Weathership",
    "Weddell", "Welsh", "West", "Western", "Winnipeg", "Wisconsin",
    "Workshop", "Wyoming",

    "Yangtze", "Yellow", "Yellowstone", "Yemen", "York", "Yukon",

    "Zambia", "Zealand", "Zimbabwe",
};

/* Number of entries in Proper_Names[] (a size_t value). */
#define N (sizeof(Proper_Names) / sizeof(Proper_Names[0]))

/* Maximum number of context characters printed after each match. */
#if !defined(MAXCONTEXT)
#define MAXCONTEXT 120
#endif

/* Size of the input buffer; longer lines are read in several pieces. */
#if !defined(MAXLINE)
#define MAXLINE 10240
#endif

static int FNR;                 /* file line number (name used by awk) */

/*
 * Report `word` if it is in Proper_Names[].
 *
 *   filename  label printed at the start of the report line
 *   word      NUL-terminated candidate word
 *   context   text printed after the word; at most MAXCONTEXT
 *             characters of it are shown
 *
 * On a match, one line "file:line:word<TAB>context" is written to
 * stdout, using the current value of FNR as the line number.  Nothing
 * is printed when the word is not in the list.
 */
void
check_word(const char *filename, const char *word, const char *context)
{
    size_t k;

    for (k = 0; k < N; ++k)
    {
        if (strcmp(Proper_Names[k], word) == 0)
        {
            (void)printf("%s:%05d:%-15s\t%.*s\n",
                         filename, FNR, word, (int)MAXCONTEXT, context);
            return;             /* one report per word occurrence */
        }
    }
}

/*
 * Scan one NUL-terminated piece of input for alphabetic words and hand
 * every word that ends at brace level 0 to check_word().
 *
 *   label        file name to show in reports
 *   text         the text to scan; must be shorter than MAXLINE
 *   brace_level  brace nesting depth at the start of `text`
 *
 * Returns the brace nesting depth at the end of `text`.
 *
 * A '{' raises the depth but does not end the pending word, so a word
 * that continues straight into a braced part (e.g. Green{land}) is
 * judged at the depth in force where the word ends.  The terminating
 * NUL is treated like any other delimiter, so that a word ending
 * exactly at the end of the text is still checked.
 */
static int
scan_text(const char *label, const char *text, int brace_level)
{
    const size_t half = (size_t)(MAXCONTEXT / 2);
    char word[MAXLINE];
    size_t m = 0;               /* letters collected in word[] so far */
    size_t k;

    for (k = 0; ; ++k)
    {
        /* <ctype.h> functions require an unsigned char value */
        const unsigned char c = (unsigned char)text[k];

        if (c == '{')
            brace_level++;
        else if (isalpha(c))
        {
            if (m < (sizeof(word) - 1))
                word[m++] = (char)c;
        }
        else
        {
            if ((brace_level == 0) && (m > 0))
            {
                word[m] = '\0';
                /* show up to MAXCONTEXT/2 characters before the delimiter */
                check_word(label, word,
                           (k > half) ? &text[k - half] : &text[0]);
            }

            m = 0;

            if (c == '}')
                brace_level--;
        }

        if (c == '\0')
            break;
    }

    return (brace_level);
}

/*
 * Process one input file, or stdin when `filename` is NULL (reported
 * as "/dev/stdin").
 *
 * Lines whose first character is '%' or '@' are skipped.  Brace depth
 * restarts at 0 on every line.  FNR is reset to 0 and then counts
 * physical lines: a line longer than MAXLINE-1 characters is read in
 * several pieces but still counts once, and a word that straddles two
 * such pieces is checked as two fragments.
 *
 * If the file cannot be opened, a message is written to stderr and the
 * function returns without reading anything.
 */
void
do_file(const char *filename)
{
    const char *label = (filename == (const char *)NULL) ? "/dev/stdin"
                                                         : filename;
    FILE *fpin = stdin;
    char line[MAXLINE];
    int brace_level = 0;
    int continuation = 0;       /* previous piece did not end its line */
    int skipping = 0;           /* current line starts with '%' or '@' */

    if (filename != (const char *)NULL)
    {
        fpin = fopen(filename, "r");

        if (fpin == (FILE *)NULL)
        {
            (void)fprintf(stderr, "ERROR: cannot open %s\n", filename);
            return;
        }
    }

    FNR = 0;

    while (fgets(line, (int)sizeof(line), fpin) != (char *)NULL)
    {
        char *newline = strchr(line, '\n');

        if (!continuation)      /* first piece of a new physical line */
        {
            FNR++;
            brace_level = 0;
            skipping = (line[0] == '%') || (line[0] == '@');
        }

        continuation = (newline == (char *)NULL);

        if (newline != (char *)NULL)
            *newline = '\0';    /* prevent newline from appearing in output context */

        if (!skipping)
            brace_level = scan_text(label, line, brace_level);
    }

    if (filename != (const char *)NULL)
        (void)fclose(fpin);
}

/*
 * Process every file named on the command line, in order, or stdin
 * when no file is named.  Always returns EXIT_SUCCESS.
 */
int
main(int argc, char *argv[])
{
    int k;

    if (argc > 1)
    {
        for (k = 1; k < argc; ++k)
            do_file(argv[k]);
    }
    else
        do_file((const char *)NULL);

    return (EXIT_SUCCESS);
}