get_rnd_text() selection distribution

While testing 'the("Capitalized Hallucinatory Monster")' I noticed that
some hallucinatory monsters showed up more often than others.  When
the random engravings, epitaphs, and bogus monsters were converted from
hard-coded arrays to data files accessed by random seek (3.6.0), they
became subject to the same distribution irregularites that rumors suffer
from.  The chance that an entry will be chosen depends upon the chance
that a random seek will hit somewhere in the line which precedes it, so
entries that follow long lines are more likely to be chosen and entries
that follow short lines are less likely.  We improved that for rumors
by having makedefs pad the shortest lines.  Distribution still isn't
uniform but is much better than it was (and could be further improved
with a longer padding length at the cost of making data files bigger so
possibly slower to access; both overall size and access speed mattered
back when floppy discs were supported but are probably irrelevant now).

Start doing the same thing for the newer files:  pad the shortest lines
to increase the chance that seek will find them.  The tradeoff is that
the data files become bigger.  Rumors, engravings, and epitaphs lines
are all at least 60 characters now; bogus monsters are at least 20.
These are the data file sizes I see (in bytes:  old, new; padding for
rumors was already in use so its size hasn't changed):
  bogusmon    4449    7211
  engrave     1326    2894
  epitaph    14159   24075
  rumors     49173   49173

The only place that padding is noticeable in-game is #wizrumorcheck.
This commit is contained in:
PatR
2021-11-25 17:57:37 -08:00
parent 3f3d1ad85c
commit 59beebcbcc
3 changed files with 107 additions and 55 deletions

View File

@@ -695,6 +695,12 @@ assigning a fruit name that matches the name of an artifact which doesn't use
any "the" prefix could yield messages showing "the Artifact" when
dealing with the artifact rather than fruit: "You are blasted by _the_
Excalibur's power!"; didn't impact basic inventory formatting
selection of random engravings, epitaphs, and hallucinatory monster names had
the same problem that rumor selection used to have: entries which
follow longer than average lines are most likely to be chosen and
ones which follow shorter than average lines are least likely; use
same workaround as for rumors: pad the shortest lines; result isn't
uniforn distribution but is better (tradeoff vs size; see makedefs)
Fixes to 3.7.0-x Problems that Were Exposed Via git Repository

View File

@@ -40,6 +40,7 @@
* and placed there by 'makedefs'.
*/
static void unpadline(char *);
static void init_rumors(dlb *);
static void init_oracles(dlb *);
static void others_check(const char *ftype, const char *, winid *);
@@ -51,6 +52,24 @@ static void init_CapMons(void);
static unsigned CapMonSiz = 0;
static const char **CapMons = 0;
/* makedefs pads short rumors, epitaphs, engravings, and hallucinatory
monster names with trailing underscores; strip those off */
static void
unpadline(char *line)
{
char *p = eos(line);
/* remove newline if still present; caller should have stripped it */
if (p > line && p[-1] == '\n')
--p;
/* remove padding */
while (p > line && p[-1] == '_')
--p;
*p = '\0';
}
DISABLE_WARNING_FORMAT_NONLITERAL
static void
@@ -158,21 +177,7 @@ getrumor(
g.true_rumor_size = -1; /* don't try to open it again */
}
/* this is safe either way, so do it always since we can't get the
* definition out of makedefs.c
*/
#define PAD_RUMORS_TO
#ifdef PAD_RUMORS_TO
/* remove padding */
{
char *x = eos(rumor_buf) - 1;
while (x > rumor_buf && *x == '_')
x--;
*++x = '\n';
*x = '\0';
}
#endif
unpadline(rumor_buf);
return rumor_buf;
}
@@ -426,8 +431,9 @@ get_rnd_text(const char* fname, char* buf, int (*rng)(int))
(void) dlb_fgets(line, sizeof line, fh);
}
if ((endp = index(line, '\n')) != 0)
*endp = 0;
*endp = '\0';
Strcat(buf, xcrypt(line, xbuf));
unpadline(buf); /* strip padding that makedefs added to end of line */
(void) dlb_fclose(fh);
} else {
couldnt_open_file(fname);

View File

@@ -104,14 +104,29 @@ static const char SCCS_Id[] UNUSED = "@(#)makedefs.c\t3.7\t2020/01/18";
#endif /* else !MAC */
#endif /* else !AMIGA */
/*
* For files where entries are selected by seeking to a random position,
* skipping to newline, and then using the next line, lines that follow
* long ones are more likely to be selected than average and lines that
* follow short ones are less likely to be selected than average.
* We make selection be more evenly distributed by padding the shortest
* lines, at the cost of making the data files bigger. The larger these
* values are, the more uniform the selection will become but the more
* space will be needed for data used for something which is relatively
* inconsequential to actual game play. A value of 0 would suppress the
* padding because every line is already at least that long.
*/
#define PAD_RUMORS_TO 60u /* also used for epitaphs and engravings */
#define PAD_BOGONS_TO 20u /* hallucinatory monsters */
static const char
*Dont_Edit_Code =
"/* This source file is generated by 'makedefs'. Do not edit. */\n",
*Dont_Edit_Data =
"#\tThis data file is generated by 'makedefs'. Do not edit. \n",
*Reference_file =
"/* This file is generated by 'makedefs' for reference purposes only. */\n"
"/* It is no longer used in the NetHack build. */\n";
"/* This file is generated by 'makedefs' for reference purposes only. */\n"
"/* It is no longer used in the NetHack build. */\n";
static struct version_info version;
@@ -151,9 +166,10 @@ static char *name_file(const char *, const char *);
static FILE *getfp(const char *, const char *, const char *, int);
static void do_ext_makedefs(int, char **);
static char *xcrypt(const char *);
static char *padline(char *, unsigned);
static unsigned long read_rumors_file(const char *, int *,
long *, unsigned long);
static void do_rnd_access_file(const char *, const char *);
long *, unsigned long, unsigned);
static void do_rnd_access_file(const char *, const char *, unsigned);
static boolean d_filter(char *);
static boolean h_filter(char *);
static void opt_out_words(char *, int *);
@@ -312,15 +328,17 @@ do_makedefs(char *options)
*/
do_rnd_access_file(EPITAPHFILE,
/* default epitaph: parody of the default engraving */
"No matter where I went, here I am.");
"No matter where I went, here I am.",
PAD_RUMORS_TO);
do_rnd_access_file(ENGRAVEFILE,
/* default engraving: popularized by "The Adventures of
Buckaroo Bonzai Across the 8th Dimenstion" but predates
that 1984 movie; some attribute it to Confucius */
"No matter where you go, there you are.");
"No matter where you go, there you are.",
PAD_RUMORS_TO);
do_rnd_access_file(BOGUSMONFILE,
/* default bogusmon: iconic monster that isn't in nethack */
"grue");
"grue", PAD_BOGONS_TO);
break;
case 'h':
case 'H':
@@ -867,11 +885,45 @@ xcrypt(const char* str)
return buf;
}
#define PAD_RUMORS_TO 60
static char *
padline(char *line, unsigned padlength)
{
/*
* Rumor selection is accomplished by seeking to a random
* position in the file, advancing to newline, and taking
* the next line. Therefore, rumors which follow long-line
* rumors are most likely to be chosen and rumors which
* follow short-line rumors are least likely to be chosen.
* We ameliorate the latter by padding the shortest lines,
* increasing the chance of the random seek landing in them.
*
* Random epitaphs, engravings, and hallucinatory monster
* names are in the same boat.
*/
char *endp;
unsigned len = (unsigned) strlen(line); /* includes newline */
if (len <= padlength) {
endp = index(line, '\n');
/* this is only safe because fgetline() overallocates */
while (len++ < padlength) {
*endp++ = '_';
}
*endp++ = '\n';
*endp = '\0';
}
return line;
}
/* common code for do_rumors(). Return 0 on error. */
static unsigned long
read_rumors_file(const char* file_ext, int* rumor_count,
long* rumor_size, unsigned long old_rumor_offset)
read_rumors_file(
const char *file_ext,
int *rumor_count,
long *rumor_size,
unsigned long old_rumor_offset,
unsigned padlength)
{
char infile[MAXFNAMELEN];
char *line;
@@ -886,26 +938,8 @@ read_rumors_file(const char* file_ext, int* rumor_count,
/* copy the rumors */
while ((line = fgetline(ifp)) != 0) {
#ifdef PAD_RUMORS_TO
/* rumor selection is accomplished by seeking to a random
position in the file, advancing to newline, and taking
the next line; therefore, rumors which follow long-line
rumors are most likely to be chosen and rumors which
follow short-line rumors are least likely to be chosen;
we ameliorate the latter by padding the shortest lines,
increasing the chance of the random seek landing in them */
int len = (int) strlen(line);
(void) padline(line, padlength); /* make shortest lines be longer */
if (len <= PAD_RUMORS_TO) {
char *base = index(line, '\n');
/* this is only safe because fgetline() overallocates */
while (len++ < PAD_RUMORS_TO) {
*base++ = '_';
}
*base++ = '\n';
*base = '\0';
}
#endif
(*rumor_count)++;
#if 0
/*[if we forced binary output, this would be sufficient]*/
@@ -928,7 +962,10 @@ read_rumors_file(const char* file_ext, int* rumor_count,
}
static void
do_rnd_access_file(const char* fname, const char* deflt_content)
do_rnd_access_file(
const char *fname,
const char *deflt_content,
unsigned padlength)
{
char *line, buf[BUFSZ];
@@ -948,15 +985,14 @@ do_rnd_access_file(const char* fname, const char* deflt_content)
exit(EXIT_FAILURE);
}
Fprintf(ofp, "%s", Dont_Edit_Data);
/* lines from the file include trailing newline so make sure that the
default one does too */
if (!index(deflt_content, '\n'))
deflt_content = strcat(strcpy(buf, deflt_content), "\n");
/* write out the default content entry unconditionally instead of
waiting to see whether there are no regular output lines; if it
matches a regular entry (bogusmon "grue"), that entry will become
more likely to be picked than normal but it's nothing to worry about */
(void) fputs(xcrypt(deflt_content), ofp);
Strcpy(buf, deflt_content);
if (!index(buf, '\n')) /* lines from the file include trailing newline +*/
Strcat(buf, "\n"); /* so make sure that the default one does too */
(void) fputs(xcrypt(padline(buf, padlength)), ofp);
tfp = getfp(DATA_TEMPLATE, "grep.tmp", WRTMODE, FLG_TEMPFILE);
grep0(ifp, tfp, FLG_TEMPFILE);
@@ -966,8 +1002,10 @@ do_rnd_access_file(const char* fname, const char* deflt_content)
ifp = getfp(DATA_TEMPLATE, "grep.tmp", RDTMODE, 0);
#endif
while ((line = fgetline(ifp)) != 0) {
if (line[0] != '#' && line[0] != '\n')
if (line[0] != '#' && line[0] != '\n') {
(void) padline(line, padlength);
(void) fputs(xcrypt(line), ofp);
}
free((genericptr_t) line);
}
Fclose(ifp);
@@ -1019,13 +1057,15 @@ do_rumors(void)
/* record the current position; true rumors will start here */
true_rumor_offset = ftell(tfp);
false_rumor_offset = read_rumors_file(
".tru", &true_rumor_count, &true_rumor_size, true_rumor_offset);
false_rumor_offset = read_rumors_file(".tru", &true_rumor_count,
&true_rumor_size, true_rumor_offset,
PAD_RUMORS_TO);
if (!false_rumor_offset)
goto rumors_failure;
eof_offset = read_rumors_file(".fal", &false_rumor_count,
&false_rumor_size, false_rumor_offset);
&false_rumor_size, false_rumor_offset,
PAD_RUMORS_TO);
if (!eof_offset)
goto rumors_failure;