From 5e9358b331e99836d6e2173a43aa3aa00cb25653 Mon Sep 17 00:00:00 2001 From: PatR Date: Mon, 18 Dec 2023 17:16:33 -0800 Subject: [PATCH] makedefs: check input files for non-ASCII chars If MAKEDEFS_FILTER_NONASCII is defined (which config.h now does by default), it will check data.base, rumors.*, and {various}.txt for characters outside the range of ' ' through '~'. If it finds any, it will warn about them and change them to '#'. Tab handling is incomplete; the files that use tabs for indentation will allow tabs anywhere, even though that's not wanted. That could be fixed but doesn't seem particularly urgent. This is more about spotting and repairing the special 3-char punctuation characters that crept into data.base fairly recently. --- include/config.h | 31 +++++++++---- util/makedefs.c | 113 ++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 131 insertions(+), 13 deletions(-) diff --git a/include/config.h b/include/config.h index 193085c9f..6764c021e 100644 --- a/include/config.h +++ b/include/config.h @@ -1,4 +1,4 @@ -/* NetHack 3.7 config.h $NHDT-Date: 1693359531 2023/08/30 01:38:51 $ $NHDT-Branch: keni-crashweb2 $:$NHDT-Revision: 1.175 $ */ +/* NetHack 3.7 config.h $NHDT-Date: 1702948586 2023/12/19 01:16:26 $ $NHDT-Branch: NetHack-3.7 $:$NHDT-Revision: 1.179 $ */ /* Copyright (c) Stichting Mathematisch Centrum, Amsterdam, 1985. */ /*-Copyright (c) Robert Patrick Rankin, 2016. */ /* NetHack may be freely redistributed. See license for details. 
*/ @@ -24,7 +24,7 @@ /* #define TOS */ /* define for Atari ST/TT */ /* #define STUPID */ /* avoid some complicated expressions if - your C compiler chokes on them */ + * your C compiler chokes on them */ /* #define MINIMAL_TERM */ /* if a terminal handles highlighting or tabs poorly, try this define, used in pager.c and termcap.c */ @@ -96,8 +96,8 @@ #ifdef QT_GRAPHICS #ifndef DEFAULT_WC_TILED_MAP -#define DEFAULT_WC_TILED_MAP /* Default to tiles if users doesn't say \ - wc_ascii_map */ +#define DEFAULT_WC_TILED_MAP /* Default to tiles if user doesn't request + * wc_ascii_map */ #endif #ifndef USE_XPM #define USE_XPM /* Use XPM format for images (required) */ @@ -243,11 +243,11 @@ #ifndef CRASHREPORT # ifdef MACOS /* NB: This needs to be a full path unless it's in the playground. */ -//#define CRASHREPORT "NetHackCrashReport.JavaScript" +/*#define CRASHREPORT "NetHackCrashReport.JavaScript"*/ # endif # ifdef __linux__ /* NB: This expects to find the nhlua binary as "./nhlua" */ -//#define CRASHREPORT "nhcrashreport.lua" +/*#define CRASHREPORT "nhcrashreport.lua"*/ # endif #endif @@ -262,6 +262,21 @@ many players, as it saves the player name and the game start time */ /* #define PANICLOG_FMT2 */ +/* + * When building the program, whether the 'makedefs' utility + * checks for non-ASCII or non-printable (control) characters + * in various data files (data.base, rumors.tru, rumors.fal, + * {oracles,epitaphs,engravings,bogusmons}.txt) and warns about them. + * They also get changed to '#' instead of possibly remaining + * unprintable. + * + * If you modify the data files to intentionally add accented + * letters or something comparable, comment this out. (Such things + * won't necessarily work as intended within nethack but at least + * makedefs wouldn't reject them.) + */ +#define MAKEDEFS_FILTER_NONASCII + /* * PERSMAX, POINTSMIN, ENTRYMAX, PERS_IS_UID: * These control the contents of 'record', the high-scores file. 
@@ -599,13 +614,13 @@ typedef unsigned char uchar; #if defined(DEBUG) && !defined(DEBUG_MIGRATING_MONS) #define DEBUG_MIGRATING_MONS /* add a wizard-mode command to help debug - migrating monsters */ + * migrating monsters */ #endif /* SCORE_ON_BOTL is neither experimental nor inadequately tested, but doesn't seem to fit in any other section... */ /* #define SCORE_ON_BOTL */ /* enable the 'showscore' option to - show estimated score on status line */ + * show estimated score on status line */ /* FREE_ALL_MEMORY is neither experimental nor inadequately tested, but it isn't necessary for successful operation of the program */ diff --git a/util/makedefs.c b/util/makedefs.c index 5f93a3c31..8ca54a632 100644 --- a/util/makedefs.c +++ b/util/makedefs.c @@ -1,4 +1,4 @@ -/* NetHack 3.7 makedefs.c $NHDT-Date: 1693083328 2023/08/26 20:55:28 $ $NHDT-Branch: keni-crashweb2 $:$NHDT-Revision: 1.226 $ */ +/* NetHack 3.7 makedefs.c $NHDT-Date: 1702948590 2023/12/19 01:16:30 $ $NHDT-Branch: NetHack-3.7 $:$NHDT-Revision: 1.233 $ */ /* Copyright (c) Stichting Mathematisch Centrum, Amsterdam, 1985. */ /*-Copyright (c) Kenneth Lorber, Kensington, Maryland, 2015. */ /* Copyright (c) M. Stephenson, 1990, 1991. 
*/ @@ -174,7 +174,10 @@ static void do_rnd_access_file(const char *, const char *, unsigned); static boolean d_filter(char *); static boolean h_filter(char *); static void opt_out_words(char *, int *); -static char *fgetline(FILE*); +static char *fgetline(FILE *); +/* doesn't do much (counts lines) if MAKEDEFS_FILTER_NONASCII isn't enabled */ +static void filter_nonascii(char *); +static void set_fgetline_context(const char *, boolean, boolean); #if defined(OLD_MAKEDEFS_OPTIONS) static char *tmpdup(const char *); @@ -184,6 +187,15 @@ static boolean get_gitinfo(char *, char *); static boolean use_enum = TRUE; #endif +/* for MAKEDEFS_FILTER_NONASCII, but not conditionalized; + extra input for fgetline(); not-needed for files that don't use that */ +struct ascii_filter { + const char *filename; + int linenum, warncnt; + boolean dofilter, tabok; +}; +static struct ascii_filter ascii_ctx; + /* input, output, tmp */ static FILE *ifp, *ofp, *tfp; @@ -1036,6 +1048,7 @@ read_rumors_file( perror(infile); return 0L; } + set_fgetline_context(infile, TRUE, FALSE); /* copy the rumors */ while ((line = fgetline(ifp)) != 0) { @@ -1104,6 +1117,8 @@ do_rnd_access_file( #else ifp = getfp(DATA_TEMPLATE, "grep.tmp", RDTMODE, 0); #endif + set_fgetline_context(NULL, FALSE, FALSE); + while ((line = fgetline(ifp)) != 0) { if (line[0] != '#' && line[0] != '\n') { (void) padline(line, padlength); @@ -1162,9 +1177,10 @@ do_rumors(void) /* record the current position; true rumors will start here */ true_rumor_offset = (unsigned long) ftell(tfp); - false_rumor_offset = (unsigned long) read_rumors_file(".tru", &true_rumor_count, - &true_rumor_size, true_rumor_offset, - MD_PAD_RUMORS); + false_rumor_offset + = (unsigned long) read_rumors_file(".tru", &true_rumor_count, + &true_rumor_size, true_rumor_offset, + MD_PAD_RUMORS); if (!false_rumor_offset) goto rumors_failure; @@ -1188,6 +1204,8 @@ do_rumors(void) Fprintf(ofp, rumors_header, Dont_Edit_Data, true_rumor_count, true_rumor_size, 
true_rumor_offset, false_rumor_count, false_rumor_size, false_rumor_offset, eof_offset); + + set_fgetline_context(NULL, FALSE, FALSE); /* skip the temp file's dummy header */ if (!(line = fgetline(tfp))) { /* "Don't Edit" */ perror(tempfile); @@ -1265,6 +1283,7 @@ do_data(void) Fprintf(ofp, "%s%08lx\n", Dont_Edit_Data, 0L); entry_cnt = line_cnt = 0; + set_fgetline_context(infile, TRUE, TRUE); /* read through the input file and split it into two sections */ while ((line = fgetline(ifp)) != 0) { if (d_filter(line)) { @@ -1301,6 +1320,7 @@ do_data(void) if (rewind(tfp) != 0) goto dead_data; free((genericptr_t) line); + set_fgetline_context(NULL, FALSE, TRUE); /* copy all lines of text from the scratch file into the output file */ while ((line = fgetline(tfp)) != 0) { (void) fputs(line, ofp); @@ -1448,6 +1468,7 @@ do_oracles(void) Fprintf(ofp, "%05lx\n", offset); /* start pos of first oracle */ in_oracle = FALSE; + set_fgetline_context(infile, TRUE, FALSE); while ((line = fgetline(ifp)) != 0) { SpinCursor(3); @@ -1490,6 +1511,7 @@ do_oracles(void) goto dead_data; free((genericptr_t) line); /* copy all lines of text from the scratch file into the output file */ + set_fgetline_context(tempfile, FALSE, FALSE); while ((line = fgetline(tfp)) != 0) { (void) fputs(line, ofp); free((genericptr_t) line); @@ -1598,11 +1620,90 @@ fgetline(FILE *fd) (void) memcpy(cprime, c, len); free((genericptr_t) c); c = cprime; + *(c + len) = '\0'; len = newlen; } + + filter_nonascii(c); return c; } +static void +filter_nonascii(char *line) +{ +#ifdef MAKEDEFS_FILTER_NONASCII + char warnbuf[BUFSZ]; + unsigned char *p; + int warned = 0, prevreason = -1, reason; +#endif + + if (!line) /* end of file; uses 'line' for !MAKEDEFS_FILTER_NONASCII */ + return; + ascii_ctx.linenum += 1; + if (!ascii_ctx.dofilter) + return; + +#ifdef MAKEDEFS_FILTER_NONASCII + for (p = (unsigned char *) line; *p; ++p) { + if (*p == '\n') + break; + if (*p == '\t' && ascii_ctx.tabok) + continue; + reason = (*p > 126) ? 
3 : (*p == '\t') ? 2 : (*p < ' '); + if (reason != 0) { + if (!warned) + ascii_ctx.warncnt += 1; /* number of lines warned about */ + if (++warned <= 3) { /* show up to 3 warnings for this line */ + if (warned == 1) { + /*assert(ascii_ctx.filename != NULL);*/ + Sprintf(warnbuf, "? %s:", ascii_ctx.filename); + } else { + Strcpy(warnbuf, ","); + } + Sprintf(eos(warnbuf), " %d.%ld", ascii_ctx.linenum, + (long) ((char *) p - line)); /* column */ + if (reason != prevreason) { + Strcat(warnbuf, (reason == 1) ? " non-printable" + : (reason == 3) ? " non-ascii" + : " "); /* (reason == 2) */ + prevreason = reason; + } + Fprintf(stderr, "%s '%03o'", warnbuf, *p); + } else if (warned == 3 + 1) { /* when more than 3 */ + Fprintf(stderr, ", ..."); /* show an indicator */ + } + *p = '#'; + } + } + if (warned > 0) + Fprintf(stderr, "\n"); +#endif + return; +} + +static void +set_fgetline_context( + const char *current_filename, + boolean do_filtering, + boolean tabs_are_ok) /* moot for !do_filtering */ +{ + static const char dummyname[] = "[makedefs input]"; + +#ifndef MAKEDEFS_FILTER_NONASCII + do_filtering = FALSE; +#endif + if (!current_filename) + current_filename = dummyname; + /* change from relative-to-dat to be relative-to-top, iff that's easy */ + if (!strncmp(current_filename, "../", 3)) + current_filename += 3; + + ascii_ctx.filename = current_filename; + ascii_ctx.linenum = ascii_ctx.warncnt = 0; + ascii_ctx.tabok = tabs_are_ok; + ascii_ctx.dofilter = do_filtering; +} + #if defined(OLD_MAKEDEFS_OPTIONS) void do_date(void) @@ -1822,6 +1923,7 @@ get_gitinfo(char *githash, char *gitbranch) /* perror(infile); */ return FALSE; } + set_fgetline_context(infile, TRUE, TRUE); /* read the gitinfo file */ while ((line = fgetline(gifp)) != 0) { @@ -1983,6 +2085,7 @@ do_dungeon(void) #else ifp = getfp(DATA_TEMPLATE, "grep.tmp", RDTMODE, 0); #endif + set_fgetline_context(NULL, FALSE, TRUE); while ((line = fgetline(ifp)) != 0) { SpinCursor(3);