/*
	microsat.c version 1.5e

	Eric Minch
	Dept. of Genetics
	Stanford University Medical Center
	Stanford, CA 94305
	minch@lotka.stanford.edu

	Description of program:

Todo list:
	!*Add chosen distance to diversities(?)
	!*Bootstrap diversities
	*Estimate of Nm (Slatkin eq. 15a)
	*?Correlation matrix for diversities
	?Allele resolution
	?Allow nominal data
	?Histogram of alleles by locus by taxon
	?Handle haplotypes (e.g., Y-chromosome data)
	?Hierarchical levels
	?Compound genotypes (e.g., duplicate loci)
	@Goodness-of-fit for trees of individuals
Key for todo list:
	!!Last week
	! Immediately
	? Defer indefinitely
	* Move data description to statistics options
	@ Enable tree comparison analysis

History:
	August 1998, v1.5e
		fixed anomaly in kinship distance with 1-kf transformation; formerly
			generated MAXDIST for no shared alleles, now generates 1
	April 1997, v1.5d
		removed adjustment from average square, allowed adjusted delta mu square
	April 1997, v1.5c
		corrected flaw in adjustment to average square and kinship distances
	February 1997
		finally fixed check for bootstraps in runs with small number of loci
	January 1997, v1.5b
		prevented overflow in calculating number of possible bootstraps
	November 1996, v1.5a
		corrected substitution of median for maximum in frequency table averages
		removed special characters (^2 and mu) from help text
		truncated locus and taxon names on output
	October 1996, v1.5:
		Goodman's standardized Rst
		Help option
	September 1996:
		frequencies and mean repeat size by locus or taxon
		allow adjustment of D1 and Dks: subtract pairwise average within-values
		diversities by locus or taxon
		distance matrix by locus
		pairwise taxon-specific alleles; number by locus and average over loci
		mean, median, mode, minimum, maximum, and frequency distribution by locus
		added entropy to diversities
		Dps, Dks, Dfs with -ln(proportion) or 1-proportion
		check for zero==missing data
		added maximum repeat size to diversities
		reorganized menu structure to separate data transformation options and
			I/O options from distance and statistics options
	August 1996:
		added average and total maximum allele size to diversity indices
	July 1996:
		added check for exhaustive bootstrapping
	March 1996:
		added Fst, D1, Dsw distances
	February 1996:
		added frequency counts
		added mean allele size to diversity indices
		fixed field-checking routine
	January 1996, v1.4g:
		added Fst Het, Het Tot, Var Tot, # alleles, total # alleles, allele range, 
			total allele range, and their averages to the diversity indices
	December 1995, v1.4f:
		added absolute difference distance Dad
	September 1995, v1.4e:
		primer error defaults to 0 if alleles are repeats (or repeat length=1)
	September 1995, v1.4d:
		fixed pesky negative variances (integer overflow problem)
	June 1995, v1.4c:
		included checking for monomorphisms and trapping div by zero in Fst
	June 1995, v1.4b:
		included checking for correct number of fields
	May 1995, v1.4a:
		included trap for negative allele lengths
	May 1995, v1.4: 
		replaced MAXINT with locally-defined BIGINT
		replaced malloc() with error-trapping myalloc()
		replaced buggy < with <= in counting loop for PS-, KF-, and NI-distances
		
Suggested reference:
Minch, E., Ruiz-Linares, A., Goldstein, D., Feldman, M., and Cavalli-Sforza, L.L. (1995, 1996) Microsat (version 1.5b): a computer program for calculating various statistics on microsatellite allele data. WWW: http://lotka.stanford.edu/microsat.html.

Copyright statements:
------------------------------------------------------------------------------
Bag and Lexicon source code is (c) 1989 Eric Minch. All rights reserved.

Microsat source code is (c) 1995 The Board of Trustees of the Leland Stanford Junior University. All rights reserved.
------------------------------------------------------------------------------
*/

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <ctype.h>

/* Utility macros */
#define MAX(a,b) (((a)>(b))?(a):(b))
#define	MIN(a,b) (((a)<(b))?(a):(b))
#define PHI(r) ((r*r-1)/6)
#define TAU(r) MAX(0, PHI(r)/(2*mutationRate))
/* Utility constants */
#define COMPLEMENT (~0)
/* Statistics option flag constants */
#define MISSING 1
#define ANOMALY 2
#define COMPARE 4
#define SPECIFIC 8
#define OUTLIER 16
#define HINGE 32
#define LINEARITY 64
#define FREQUENCY 128
#define DIVERSITY 256
#define BYTAXON 512
#define PAIRWISE 1024
#define CROSSTAB 2048
/* Distance option flag constants */
#define NO 0
#define DM 1
#define PS 2
#define KF 3
#define FS 4
#define NI 5
#define AD 6
#define SW 7
#define RF 8
#define AS 9
#define SR 10
#define BYLOCUS 1
#define LOGARITHM 2
#define ADJUST 4
/* Help option constants */
#define MAIN 0
#define STATISTICS 1
#define DISTANCE 2
/* Bootstrap option flag constants */
#define ALL		1
#define MEAN	2
/* Default constants */
#define DEFAULTMUTATION 0.00056
#define DEFAULTPRIMER 0
#define DEFAULTREPEAT 2
#define DEFAULTSIGMA	3
#define DEFAULTHINGE	4
/* Other constants */
#define BUFFSIZE	1024
#define BIGINT 32767
#define MISSINGDATA -BIGINT
#define MAXDIST	10.0
#define MAXLOCI 500
#define ERROR	0
#define ALPHA 0.05

					/* Memory management */
void *myalloc(int n) {
	static long int total=0;
	void *memory=NULL;
	
	memory=malloc(n);
	total+=n;
	if (memory)
		return memory;
	printf("Error allocating %d bytes of memory; ", n);
	printf("total allocated so far was %ld\n", total-n);
	exit(1);
}

					/* Field checking */
int fieldCheck(char *string) {
/* Check the number of fields. No checking is done as to type or value. */
	int fields;
	char *token;
	char buffer[BUFFSIZE];
	
	strcpy(buffer, string);
	token=strtok(buffer, " \t\n");
	fields=0;
	while (token) {
		fields++;
		token=strtok(NULL, " \t\n");
	}
	if (fields==3||fields==4)
		return 1;
	if (fields)
		printf("Input error: %s", string);
	return 0;
}

					/* Random numbers */
float uran(void) {
/* This is provided only as a default uniform [0,1) pseudorandom number 
	generator. The rand() function in particular is not ideal, and should be 
	replaced by a call to your favorite pseudorandom number generator. */
	return 1.0*rand()/RAND_MAX;
}
	
int iran(int n) {
/* This is intended to return an integer in the range 0..n-1. */
	int m;
	
	while ((m=floor(n*uran()))==n);
	return m;
}

double binomial(int n, int m) {
/* Return n choose m */
	int i, k;
	double answer;

	if (m>(n/2))
		k=n-m;
	else
		k=m;
	if (k==0)
		return 1.0;
	if (k<0)
		return 0.0;
	if (k==1)
		return n;
	answer=1.0;
	for (i=1; i<=k; i++)
		answer*=1.0*(n-k+i)/i;
	return answer;
}

					/* Bags */
struct Bag {
	int itemCount;
	int *data;
};

struct Bag *initBag() {
	struct Bag *bag;

	bag=myalloc(sizeof(struct Bag));
	bag->itemCount=1;
	bag->data=myalloc(sizeof(int));
	bag->data[0]=0;
	return bag;
}

void freeBag(struct Bag *bag) {
	free(bag->data);
	free(bag);
}

void emptyBag(struct Bag *bag) {
	int i;
	
	for (i=0; i<bag->itemCount; i++)
		bag->data[i]=0;
}

void bump(struct Bag *bag, int index, int increment) {
	int i, count;
	int *newdata;
	
	count=bag->itemCount;
	if (index>=count) {
		newdata=myalloc((index+1)*sizeof(int));
		for (i=0; i<count; i++)
			newdata[i]=bag->data[i];
		for (i=count; i<=index; i++)
			newdata[i]=0;
		free(bag->data);
		bag->itemCount=index+1;
		bag->data=newdata;
	}
	bag->data[index]+=increment;
}

void empty(struct Bag *bag, int index) {
	if (index<bag->itemCount)
		bag->data[index]=0;
}

int frequency(struct Bag *bag, int index) {
	if (index<bag->itemCount)
		return bag->data[index];
	else
		return 0;
}

int bagFrequency(struct Bag *bag) {
	int i, result;
	
	result=0;
	for (i=0; i<bag->itemCount; i++)
		result+=bag->data[i];
	return result;
}

float bagMean(struct Bag *bag) {
	int i, frequency;
	float result, count;
	
	count=result=0.0;
	for (i=0; i<bag->itemCount; i++)
		if (frequency=bag->data[i]) {
			count+=frequency;
			result+=i*frequency;
		}
	if (count)
		return result/count;
	else
		return 0;
}

float bagVariance(struct Bag *bag) {
	int i;
	float frequency, sum, count, squares;
	
	count=sum=squares=0.0;
	for (i=0; i<bag->itemCount; i++)
		if (frequency=bag->data[i]) {
			count+=frequency;
			sum+=i*frequency;
			squares+=i*i*frequency;
		}
	if (count>1) {
		sum/=count;
		return (squares-count*sum*sum)/(count-1);
	}
	else
		return 0;
}

int bagCardinality(struct Bag *bag) {
	int i, result;
	
	result=0;
	for (i=0; i<bag->itemCount; i++)
		if (bag->data[i])
			result++;
	return result;
}

int bagMedian(struct Bag *bag) {
	int i, index;
	
	index=bagFrequency(bag)/2;
	for (i=0; index>0; i++)
		if (index>bag->data[i])
			index-=bag->data[i];
		else
			return i;
	return i;
}

void addBag(struct Bag *aBag, struct Bag *bBag) {
	int i, n;
	
	for (i=0; i<bBag->itemCount; i++)
		if (n=bBag->data[i])
			bump(aBag, i, n);
}

struct Bag *bagUnion(struct Bag *aBag, struct Bag *bBag) {
	int i, n;
	struct Bag *result;
	
	result=initBag();
	for (i=0; i<aBag->itemCount; i++)
		if (n=aBag->data[i])
			bump(result, i, n);
	for (i=0; i<bBag->itemCount; i++)
		if (n=bBag->data[i])
			bump(result, i, n);
	return result;
}

struct Bag *bagIntersection(struct Bag *aBag, struct Bag *bBag) {
	int i, n;
	struct Bag *result;
	
	result=initBag();
	n=MIN(aBag->itemCount, bBag->itemCount);
	for (i=0; i<n; i++)
		bump(result, i, MIN(frequency(aBag, i), frequency(bBag, i)));
	return result;
}

int bagMinimum(struct Bag *bag) {
	int i;

	for (i=0; i<bag->itemCount; i++)
		if (bag->data[i])
			return i;
	return -1;
}

int bagMaximum(struct Bag *bag) {
	int i;
	
	for (i=bag->itemCount-1; i>=0; i--)
		if (bag->data[i])
			return i;
	return -1;
}

int bagLoHinge(struct Bag *bag) {
	int i, index;
	
	index=bagFrequency(bag)/4;
	for (i=0; index>0; i++)
		if (index>bag->data[i])
			index-=bag->data[i];
		else
			return i;
	return i;
}

int bagHiHinge(struct Bag *bag) {
	int i, index;
	
	index=bagFrequency(bag)/4;
	for (i=bag->itemCount; index>0; i--)
		if (index>bag->data[i])
			index-=bag->data[i];
		else
			return i;
	return i;
}

int bagMode(struct Bag *bag) {
	int i, index, max;
	
	max=0;
	for (i=0; i<bag->itemCount; i++)
		if (bag->data[i]>max) {
			max=bag->data[i];
			index=i;
		}
	return index;
}

float bagEntropy(struct Bag *bag) {
	int i, m;
	float result, p;
	
	if (bagCardinality(bag)<=1)
		return 0.0;
	result=0.0;
	m=bagFrequency(bag);
	for (i=0; i<bag->itemCount; i++)
		if (p=1.0*bag->data[i]/m)
			result-=p*log(p);
	return result;
}

float continuousRelativeBagEntropy(struct Bag *bag) {
	float result;
	
	if (result=bagEntropy(bag))
		result/=log(bagMaximum(bag)-bagMinimum(bag)+1);
	return result;
}

float *relativeFrequencies(struct Bag *bag) {
	int i;
	float totalFrequency;
	float *result;
	
	result=myalloc(bag->itemCount*sizeof(float));
	totalFrequency=bagFrequency(bag);
	for (i=0; i<bag->itemCount; i++)
		result[i]=bag->data[i]/totalFrequency;
	return result;
}

					/* Lexica */
struct LexEntry {
	char *text;
	int index;
	struct LexEntry *left, *right;
};

struct Lexicon {
	int activeCount, totalCount;
	struct LexEntry *head;
};

struct Lexicon *initLexicon() {
	struct Lexicon *lexicon;
	
	lexicon=myalloc(sizeof(struct Lexicon));
	lexicon->activeCount=0;
	lexicon->totalCount=0;
	lexicon->head=NULL;
	return lexicon;
}

int findLexEntry(struct LexEntry *entry, char *token) {
/* This returns the index of the token if it's in the lexicon, zero if absent. */
	int position;
	
	if (entry==NULL)
		return 0;
	position=strcmp(token, entry->text);
	if (!position) {
		if (entry->index>0)
			return entry->index;
		else return 0;
	}
	else 
		if (position<0)
			return findLexEntry(entry->left, token);
	else return findLexEntry(entry->right, token);
}

int findLexWord(struct Lexicon *lexicon, char *token) {
	return findLexEntry(lexicon->head, token);
}

struct LexEntry *initLexEntry(char *token, int n) {
	struct LexEntry *entry;
	
	entry=myalloc(sizeof(struct LexEntry));
	entry->text=myalloc(strlen(token)+1);
	strcpy(entry->text, token);
	entry->index=n;
	entry->left=NULL;
	entry->right=NULL;
	return entry;
}

int getLexEntry(struct Lexicon *lexicon, struct LexEntry *entry, char *token) {
/* This works like findLexEntry, except if the token is absent from the 
	lexicon, it is added and the newly-generated entry's index is returned. */
	int position;
	
	if (lexicon->totalCount==0) {
		lexicon->totalCount=1;
		lexicon->activeCount=1;
		lexicon->head=initLexEntry(token, 1);
		return 1;
	}
	position=strcmp(token, entry->text);
	if (position==0) {
		if (entry->index<0) {
			entry->index= -entry->index;
			lexicon->activeCount++;
		}
		return entry->index;
	}
	if (position<0) {
		if (entry->left!=NULL) 
			return getLexEntry(lexicon, entry->left, token);
		else {
			lexicon->totalCount++;
			lexicon->activeCount++;
			entry->left=initLexEntry(token, lexicon->totalCount);
			return lexicon->totalCount;
		}
	}
	if (entry->right!=NULL)
		return getLexEntry(lexicon, entry->right, token);
	else {
		lexicon->totalCount++;
		lexicon->activeCount++;
		entry->right=initLexEntry(token, lexicon->totalCount);
		return lexicon->totalCount;
	}
}		

int getLexWord(struct Lexicon *lexicon, char *token) {
	return getLexEntry(lexicon, lexicon->head, token);
}

char *lexItemNode(struct LexEntry *entry, int n) {
	char *answer;
	
	if (entry==NULL)
		return NULL;
	if (n==entry->index)
		return entry->text;
	else if (-n==entry->index)
		return NULL;
	answer=lexItemNode(entry->left, n);
	if (answer==NULL)
		answer=lexItemNode(entry->right, n);
	return answer;
}

char *lexWord(struct Lexicon *lexicon, int n) {
	return lexItemNode(lexicon->head, n);
}

/* Distance functions for main program */

void DMDistance(int taxa, int loci, struct Bag ***s, float **d, int *boot, int featureFlag) {
/* Delta mu squared algorithm from Goldstein, D.B., Ruiz-Linares, A., 
	Cavalli-Sforza, L.L., and Feldman, M.W. (1995) Genetic absolute dating based 
	on microsatellites and the origin of modern humans, _Proc. National Academy 
	of Science, USA_ 92:6723-6727. */
	int i, j, k, ni, nj, locus;
	float num, diff, den;
	
	if (featureFlag&BYLOCUS)
		loci=1;
	for (i=1; i<taxa; i++)
		for (j=0; j<i; j++) {
			num=den=0;
			for (k=0; k<loci; k++) {
				locus=boot[k];
				if ((ni=bagFrequency(s[i][locus]))&&(nj=bagFrequency(s[j][locus]))) {
					den++;
					diff=bagMean(s[i][locus])-bagMean(s[j][locus]);
					num+=diff*diff;
					if (featureFlag&ADJUST)
						num-=bagVariance(s[i][locus])/ni+bagVariance(s[j][locus])/nj;
				}
			}
			if (den)
				d[i][j]=d[j][i]=num/den;
			else
				d[i][j]=d[j][i]=MISSINGDATA;
		}
}

void ASDistance(int taxa, int loci, struct Bag ***s, float **d, int *boot, int featureFlag) {
	int i, j, k, ni, nj, locus;
	float num, diff, den;
	
	if (featureFlag&BYLOCUS)
		loci=1;
	for (i=1; i<taxa; i++)
		for (j=0; j<i; j++) {
			num=den=0;
			for (k=0; k<loci; k++) {
				locus=boot[k];
				if ((ni=bagFrequency(s[i][locus]))&&(nj=bagFrequency(s[j][locus]))) {
					den++;
					diff=bagMean(s[i][locus])-bagMean(s[j][locus]);
					num+=diff*diff+
						(ni-1)*bagVariance(s[i][locus])/ni+
						(nj-1)*bagVariance(s[j][locus])/nj;
				}
			}
			if (den)
				d[i][j]=d[j][i]=num/den;
			else
				d[i][j]=d[j][i]=MISSINGDATA;
		}
}

void SRDistance(int taxa, int loci, struct Bag ***s, float **d, 
	float *variance, int *boot, int featureFlag) {
/* Rst from Slatkin, M. (1995) A measure of population subdivision
	based on microsatellite allele frequencies, _Genetics_, 139:457-462. The
	algorithm used is the unbiased standardized estimate from Goodman, S.J. 
	(1996) RstCALC: a collection of computer programs for calculating unbiased 
	estimates of genetic differentiation and gene flow from microsatellite data 
	and determining their significance, _Molecular Ecology_, in press. Goodman's 
	method calculates pairwise Rst as Sigma^2[B]/(Sigma^2[B]+Sigma^2[W], where
		Sigma^2[W] is (Sigma^2[1]+Sigma^2[2])/2, where
			Sigma^2[i] is the variance of the standardized alleles for taxon i
	and
		Sigma^2[B] is (MS[B]-Epsilon[MSw])/n[0], where
			n[0] is the average sample size, i.e., 
				n[1]+n[2]-(n[1]^2+n[2]^2)/(n[1]+n[2])
			MS[B] is n[1]*Mean[1]+n[2]*Mean[2], where
				n[i] is the number of alleles sampled in taxon i (i.e., 2*#individuals)
			Epsilon[MSw] is n[0]*(Sigma^2[1]/n[1]+Sigma^2[2]/n[2])/2
*/
	int i, j, k, ni, nj, locus;
	float num, den, n0, mi, mj, mij, si, sj, sigmaB, sigmaW, msB, emsW;
	
	if (featureFlag&BYLOCUS)
		loci=1;
	for (i=1; i<taxa; i++)
		for (j=0; j<i; j++) {
			num=den=0;
			for (k=0; k<loci; k++) 
				if (variance[k]) {
					locus=boot[k];
					if ((ni=bagFrequency(s[i][locus]))&&(nj=bagFrequency(s[j][locus]))) {
						mi=bagMean(s[i][locus]);
						mj=bagMean(s[j][locus]);
						mij=(mi+mj)/2;
						mi=(mi-mij)/sqrt(variance[locus]);
						mj=(mj-mij)/sqrt(variance[locus]);
						si=bagVariance(s[i][locus])/variance[locus];
						sj=bagVariance(s[j][locus])/variance[locus];
						sigmaW=(si+sj)/2;
						n0=ni+nj-1.0*(ni*ni+nj*nj)/(ni+nj);
						msB=ni*mi*mi+nj*mj*mj;
						emsW=n0*(si/ni+sj/nj)/2;
						sigmaB=(msB-emsW)/n0;
						num+=sigmaB;
						den+=sigmaW;
					}
				}
			if (den+num)
				d[i][j]=d[j][i]=num/(den+num);
			else
				d[i][j]=d[j][i]=MISSINGDATA;
		}

}

void PSDistance(int taxa, int loci, struct Bag ***s, float **d, int *boot, 
	int featureFlag) {
/* The general definition of the proportion of shared alleles at a given locus, 
	which holds true whether the taxa are individuals or population samples, is 
	the mean over loci of the sums of the minima of the relative frequencies of 
	all alleles in the taxonomic units being compared (individuals or 
	populations). Thus, for two individuals with genotypes at a single locus:
	 
	individual
	genotypes		shared proportions
		ab	cd		MIN(0.5,0.0) + MIN(0.5,0.0) + MIN(0.5,0.0) + MIN(0.5,0.0) = 0.0
		ab	cb		MIN(0.5,0.0) + MIN(0.5,0.5) + MIN(0.5,0.0) = 0.5
		ab	bb		MIN(0.5,0.0) + MIN(0.5,1.0) = 0.5
		ab	ab		MIN(0.5,0.5) + MIN(0.5,0.5) = 1.0
		aa	aa		MIN(1.0,1.0) = 1.0
		
	For two populations, X and Y, with allele frequencies at a single locus:
allele	X		Y
		a 0.2 0.2
		b 0.4 0.0
		c 0.3 0.7
		d 0.1 0.1
	the sum of the minima is 
		MIN(0.2, 0.2) + MIN(0.4, 0.0) + MIN(0.3, 0.7) + MIN(0.1, 0.1) = 0.6
	The distance can be taken as 1-proportion or -ln(proportion), as specified
	by the featureFlag. */
	int i, j, k, l, min, max, locus;
	float ni, nj, num, den;
	
	if (featureFlag&BYLOCUS)
		loci=1;
	for (i=1; i<taxa; i++)
		for (j=0; j<i; j++) {
			num=den=0;
			for (k=0; k<loci; k++) {
				locus=boot[k];
				if ((ni=bagFrequency(s[i][locus]))&&(nj=bagFrequency(s[j][locus]))) {
					den++;
					min=MIN(bagMinimum(s[i][locus]), bagMinimum(s[j][locus]));
					max=MAX(bagMaximum(s[i][locus]), bagMaximum(s[j][locus]));
					for (l=min; l<=max; l++)
						num+=MIN(frequency(s[i][locus], l)/ni, 
							frequency(s[j][locus], l)/nj);
				}
			}
			if (den==0)
				d[j][i]=d[i][j]=MISSINGDATA;
			else if (featureFlag&LOGARITHM)
				if (num)
					d[j][i]=d[i][j]= -log(num/den);
				else
					d[j][i]=d[i][j]=MAXDIST;
			else
					d[j][i]=d[i][j]=1-num/den;
		}
}

void KFDistance(int taxa, int loci, struct Bag ***s, float **d, int *boot, int featureFlag) {
/* The kinship coefficient is defined (Cavalli-Sforza, L.L., and Bodmer, W.F. 
	(1971) _The Genetics of Human Populations_, p. 399) as "the probability that 
	a gene taken at random from I, at a given locus, be identical by descent to a 
	gene taken at random from J at the same locus." Thus the individual genotypes
	below have kinship coefficients of
		ab	cd		0.5*0.0 + 0.5*0.0 + 0.5*0.0 + 0.5*0.0 = 0.0
		ab	cb		0.5*0.0 + 0.5*0.5 + 0.5*0.0 = 0.25
		ab	bb		0.5*0.0 + 0.5*1.0 = 0.5
		ab	ab		0.5*0.5 + 0.5*0.5 = 0.5
		aa	aa		1.0*1.0 = 1.0
	Generalizing, for two populations with allele frequencies
		a .1	.2
		b .2	0
		c .3	.7
		d .4	.1
	the kinship coefficient is 
		0.1*0.2 + 0.2*0.0 + 0.3*0.7 + 0.4*0.1 = 0.02 + 0.0 + 0.21 + 0.04 = 0.27
	The distance can be taken as the negative logarithm of the coefficient or as
	1-coefficient, as specified by the featureFlag. */
	int i, j, k, l, min, max, locus;
	float ni, nj, num, den, di, dj, fi, fj;
	
	if (featureFlag&BYLOCUS)
		loci=1;
/* Now the pairwise distances */
	for (i=1; i<taxa; i++) {
		for (j=0; j<i; j++) {
			num=den=di=dj=0;
			for (k=0; k<loci; k++) {
				locus=boot[k];
				if ((ni=bagFrequency(s[i][locus]))&&(nj=bagFrequency(s[j][locus]))) {
					den++;
					min=MIN(bagMinimum(s[i][locus]), bagMinimum(s[j][locus]));
					max=MAX(bagMaximum(s[i][locus]), bagMaximum(s[j][locus]));
					for (l=min; l<=max; l++) {
						fi=frequency(s[i][locus], l)/ni;
						fj=frequency(s[j][locus], l)/nj;
						num+=fi*fj;
						di+=fi*fi;
						dj+=fj*fj;
					}
				}
			}
			if (den==0)
				d[j][i]=d[i][j]=MISSINGDATA;
			else if (featureFlag&LOGARITHM)
				if (num)
					if (featureFlag&ADJUST)
						d[j][i]=d[i][j]=log((di+dj)/(2*den))-log(num/den);
					else
						d[j][i]=d[i][j]= -log(num/den);
				else
					d[j][i]=d[i][j]=MAXDIST;
			else
				if (featureFlag&ADJUST)
					d[j][i]=d[i][j]=((di+dj)/(2*den))-num/den;
				else
					d[j][i]=d[i][j]=1-num/den;
		}
	}
}

void FSDistance(int taxa, int loci, struct Bag ***s, float **d, int *boot, int featureFlag) {
/* The fuzzy set similarity for a pair of taxa is the ratio between the 
	cardinality of the intersection of their alleles and the cardinality of the
	union of their alleles, e.g., if two individuals have genotypes ab and ac,
	the intersection is {a}, the union is {a,b,c}, and the ratio is 1/3. Thus the 
	individual genotypes below have ratios of
		ab	cd		0/4 = 0.0
		ab	cb		1/3 = 0.3...
		ab	bb		1/2 = 0.5
		ab	ab		2/2 = 1.0
		aa	aa		1/1 = 1.0
	The distance can taken as the negative logarithm of the ratio or as 1-ratio, 
	as specified by the featureFlag. */
	int i, j, k, n, locus;
	float num, den;
	struct Bag *bag;
	
	if (featureFlag&BYLOCUS)
		loci=1;
	for (i=1; i<taxa; i++)
		for (j=0; j<i; j++) {
			num=den=0.0; n=0;
			for (k=0; k<loci; k++) {
				locus=boot[k];
				if (bagFrequency(s[i][locus])&&bagFrequency(s[j][locus])) {
					n++;
					bag=bagIntersection(s[i][locus], s[j][locus]);
					num+=bagCardinality(bag);
					freeBag(bag);
					bag=bagUnion(s[i][locus], s[j][locus]);
					den+=bagCardinality(bag);
					freeBag(bag);
				}
			}
			if (n)
				if (num)
					if (featureFlag&LOGARITHM)
						d[j][i]=d[i][j]= -log(num/den);
					else
						d[j][i]=d[i][j]=1-num/den;
				else
					d[j][i]=d[i][j]=MAXDIST;
			else
				d[j][i]=d[i][j]=MISSINGDATA;
		}
}

void NIDistance(int taxa, int loci, struct Bag ***s, float **d, int *boot, int featureFlag) {
/* Nei's identity for two taxa, from Genetics, 105, p.772 (Nov. 1983). The
	distance is taken as the negative logarithm. */
	int i, j, k, l, min, max, n, locus;
	float ni, nj, num, f1, f2, sum1, sum2, den1, den2;
	
	if (featureFlag&BYLOCUS)
		loci=1;
	for (i=1; i<taxa; i++)
		for (j=0; j<i; j++) {
			n=num=den1=den2=n=0;
			for (k=0; k<loci; k++) {
				locus=boot[k];
				if ((ni=bagFrequency(s[i][locus]))&&(nj=bagFrequency(s[j][locus]))) {
					n++;
					min=MIN(bagMinimum(s[i][locus]), bagMinimum(s[j][locus]));
					max=MAX(bagMaximum(s[i][locus]), bagMaximum(s[j][locus]));
					sum1=sum2=0;
					for (l=min; l<=max; l++) {
						f1=frequency(s[i][locus], l)/ni;
						f2=frequency(s[j][locus], l)/nj;
						num+=f1*f2; 
						sum1+=f1*f1;
						sum2+=f2*f2;
					}
					den1+=(ni*sum1-1)/(ni-1);
					den2+=(nj*sum2-1)/(nj-1);
				}
			}
			if (n)
				if (num)
					if (featureFlag&LOGARITHM)
						d[j][i]=d[i][j]= -log(num/(sqrt(den1*den2)));
					else
						d[j][i]=d[i][j]=1-num/(sqrt(den1*den2));
				else
					d[j][i]=d[i][j]=MAXDIST;
			else
				d[j][i]=d[i][j]=MISSINGDATA;
		}
}

void ADDistance(int taxa, int loci, struct Bag ***s, float **d, int *boot, int featureFlag) {
/* Absolute difference algorithm suggested by L.L. Cavalli-Sforza (1995)
	personal communication, and by D. Goldstein (1996) personal communication. */
	int i, j, k, locus;
	float num, den;
	
	if (featureFlag&BYLOCUS)
		loci=1;
	for (i=1; i<taxa; i++)
		for (j=0; j<i; j++) {
			num=den=0;
			for (k=0; k<loci; k++) {
				locus=boot[k];
				if (bagFrequency(s[i][locus])&&bagFrequency(s[j][locus])) {
					den++;
					num+=fabs(bagMean(s[i][locus])-bagMean(s[j][locus]));
				}
			}
			if (den)
				d[i][j]=d[j][i]=num/den;
			else
				d[i][j]=d[j][i]=MISSINGDATA;
		}
}

void SWDistance(int taxa, int loci, struct Bag ***s, float **d, int *boot, int featureFlag) {
/* Absolute product algorithm from Shriver, M.D., Jin, L., Boerwinkle, E., 
	Deka, R., Ferrell, R.E., and Chakraborty, R. (1995) A novel measure of 
	genetic distance for highly polymorphic tandem repeat loci, _Mol. Biol. 
	Evol._, 12(5):914-920. */
	int i, j, k, l1, l2, locus, min, max;
	float dxw, dyw, dxyw, den, diff, ni, nj;
	float *fi, *fj;
	
	if (featureFlag&BYLOCUS)
		loci=1;
	for (i=1; i<taxa; i++)
		for (j=0; j<i; j++) {
			den=dxw=dyw=dxyw=0;
			for (k=0; k<loci; k++) {
				locus=boot[k];
				if ((ni=bagFrequency(s[i][locus]))&&(nj=bagFrequency(s[j][locus]))) {
					den++;
					min=MIN(bagMinimum(s[i][locus]), bagMinimum(s[j][locus]));
					max=MAX(bagMaximum(s[i][locus]), bagMaximum(s[j][locus]));
					fi=relativeFrequencies(s[i][locus]);
					fj=relativeFrequencies(s[j][locus]);
					for (l1=min; l1<=max; l1++)
						for (l2=min; l2<=max; l2++) {
							diff=fabs(l1-l2);
							dxw+=ni*fi[l1]*fi[l2]*diff/(ni-1);
							dyw+=nj*fj[l1]*fj[l2]*diff/(nj-1);
							dxyw+=fi[l1]*fj[l2]*diff;
						}
					free(fi);
					free(fj);
				}
			}
			if (den) {
				diff=dxyw-(dxw+dyw)/2;
				if (diff<0)
					d[i][j]=d[j][i]=0;
				else
					d[i][j]=d[j][i]=diff/den;
			}
			else
				d[i][j]=d[j][i]=MISSINGDATA;
		}
}

void RFDistance(int taxa, int loci, struct Bag ***s, float **d, int *boot, int featureFlag) {
/* From Reynolds, J., Weir, B.S., and Cockerham, C.C. (1983) Estimation of the 
	coancestry coefficient: basis for a short-term genetic distance. _Genetics_, 
	105:767-779, p. 769. */
	int i, j, k, l, locus, min, max;
	float sum, num, den, ni, nj, ai, aj;
	float *fi, *fj;

	if (featureFlag&BYLOCUS)
		loci=1;
	for (i=1; i<taxa; i++)
		for (j=0; j<i; j++) {
			num=den=0;
			for (k=0; k<loci; k++) {
				locus=boot[k];
				if ((ni=bagFrequency(s[i][locus]))&&(nj=bagFrequency(s[j][locus]))) {
					min=MIN(bagMinimum(s[i][locus]), bagMinimum(s[j][locus]));
					max=MAX(bagMaximum(s[i][locus]), bagMaximum(s[j][locus]));
					sum=0;
					ai=aj=1;
					fi=relativeFrequencies(s[i][locus]);
					fj=relativeFrequencies(s[j][locus]);
					for (l=min; l<=max; l++) {
						sum+=(fi[l]-fj[l])*(fi[l]-fj[l]);
						ai-=fi[l]*fi[l];
						aj-=fj[l]*fj[l];
					}
					num+=sum/2-((ni+nj)*(ni*ai+nj*aj))/(4*ni*nj*(ni+nj-1));
					den+=sum/2+((4*ni*nj-ni-nj)*(ni*ai+nj*aj))/(4*ni*nj*(ni+nj-1));
					free(fi);
					free(fj);
				}
			}
			if (den)
				d[i][j]=d[j][i]= -log(1-num/den);
			else
				d[i][j]=d[j][i]=MISSINGDATA;
		}
}

/* Ancillary functions for main program */

float heterozygosity(struct Bag *bag) {
	int i;
	float f, total, result;
	
	result=1.0;
	total=bagFrequency(bag);
	for (i=0; i<=bagMaximum(bag); i++)
		if (f=(frequency(bag, i)/total))
			result-=f*f;
	return result;
}

float fst(int taxa, int locus, struct Bag ***s, struct Bag *total) {
/* Calculate Fst for one locus */
	int i, j, min, max;
	float a, b, deviation, sumSquare, totalInds, totalIndsSquare;
	float *inds;

	min=bagMinimum(total);
	max=bagMaximum(total);
	if (min==max)
		return 0.0;
	totalInds=totalIndsSquare=0.0;
	inds=myalloc(taxa*sizeof(float));
	for (i=0; i<taxa; i++) {
		inds[i]=bagFrequency(s[i][locus])/2.0;
		totalInds+=inds[i];
		totalIndsSquare+=inds[i]*inds[i];
	}
	a=b=0.0;
	for (i=0; i<taxa; i++) {
		b+=inds[i]*heterozygosity(s[i][locus]);
		sumSquare=0;
		if (inds[i])
			for (j=min; j<=max; j++) {
				deviation=frequency(s[i][locus], j)/(2*inds[i])
					-frequency(total, j)/(2*totalInds);
				sumSquare+=deviation*deviation;
			}
		a+=inds[i]*sumSquare*2;
	}
	b=2*b/(2*totalInds-taxa);
	a=(a-(taxa-1)*b)/(2*(totalInds-totalIndsSquare/totalInds));
	free(inds);
	return a/(a+b);
}

float rst(int taxa, int locus, struct Bag ***s, struct Bag *total) {
/* Calculate standardized Rst for one locus */
	int i, n, ni;
	float mi, vi, mt, vt, nt, nt2, MSb, MSwErr, SigmaW, SigmaB;
	
	mt=bagMean(total);
	vt=bagVariance(total);
	nt=nt2=MSb=MSwErr=SigmaW=n=0;
	for (i=0; i<taxa; i++) 
		if (ni=bagFrequency(s[i][locus])) {
			n++;
			mi=(bagMean(s[i][locus])-mt)/sqrt(vt);
			vi=bagVariance(s[i][locus]);
			nt+=ni;
			nt2+=ni*ni;
			MSb+=ni*mi*mi;
			MSwErr+=vi/(ni*vt);
			SigmaW+=vi/vt;
		}
	nt=(nt-(nt2/nt))/(n-1);
	MSwErr*=nt/n;
	MSb/=n-1;
	SigmaW/=n;
	SigmaB=(MSb-MSwErr)/nt;
	return SigmaB/(SigmaB+SigmaW);
}

int minimumIncrement(struct Bag *bag, int min, int max) {
	int i, next, increment;
	
	increment=BIGINT;
	for (i=min; i<max; i+=next) {
		next=1;
		while (!frequency(bag, i+next))
			next++;
		if (next<increment)
			increment=next;
	}
	return increment;
}

void showIncrement(struct Bag *bag, int min, int max, int increment) {
	int i, next, flag=1;
	
	printf("  Alleles");
	for (i=min; i<max; i+=next) {
		next=1;
		while (!frequency(bag, i+next))
			next++;
		if (next==increment) {
			printf("%s %d and %d", flag ? ":" : ",", i, i+next);
			flag=0;
		}
	}
	printf("\n");
}

void help(int menu, char option) {
	char buffer[BUFFSIZE];
	const char *helptext[]={
		"This specifies whether alleles are given as nucleotide sizes or repeat numbers.", 
		"Bootstrapping can be used either to generate many distance matrices or to estimate the standard errors of distances. If you request bootstraps, you'll be asked how many, and whether all of the matrices should be output or only their mean and standard error.",
		"",
		"Use this submenu to specify which distance measure you want.",
		"","","","",
		"This is the name of the file from which data should be read.",
		"","",
		"Distances can be calculated over all loci or separately for each locus.",
		"","",
		"This is the name of the file where results should be written.",
		"","",
		"This specifies the repeat length (e.g., dinucleotide=2, trinucleotide=3, etc.)",
		"Use this submenu to specify which statistics you want.",
		"","","","","","","",
		"The anomalous frequency option checks for odd frequency counts.",
		"","","",
		"The primer (flanking region) error is used in calculating the linearity estimate.",
		"The frequency distribution option prints the mean, median, mode, minimum, and maximum allele size, and the frequency of each allele, by locus or by taxon.",
		"","",
		"The incommensurable taxa option checks for pairs of taxa with no loci in common.",
		"","",
		"The linearity option prints the duration of linearity estimated according to each locus.",
		"The missing data option checks for taxon-locus combinations with no data.",
		"",
		"The outlier option checks for alleles which are either much shorter or much longer than expected; the criterion is user-specifiable by multiples of interquartile range above/below low or high quartiles, or by standard deviations above/below the mean.",
		"The pairwise-specific allele option checks each pair of taxa for alleles present in one taxon but not in the other.",
		"",
		"The mutation rate is used in calculating the linearity estimate.",
		"",
		"The taxon-specific allele option checks for alleles present in only one taxon.",
		"",
		"The variability option calculates fifteen diversity indices: Fst (by variance and by heterozygosity methods), standardized Rst, and average and total values for heterozygosity, variance, number of alleles, allele size range, maximum allele size, and entropy of allele size distribution.",
		"",
		"The crosstabulation option shows how many data values are found in each taxon-locus combination.",
		"","",
		"D1: The average square distance was derived employing the analytical theory developed by Moran for the distribution of alleles mutating under a strict stepwise mutation process in a population of finite constant size with non-overlapping generations. It and its family of related distances is superior to other distances for microsatellites in they have a linear expectation with time making them good for evolutionary studies. D1 = sum over alleles i and i' of ((i-i')^2 ni ni'), where ni is the number of alleles with i repeats (the prime indicating another population).",
		"","",
		"L.L. Cavalli-Sforza (1995) personal communication, and D. Goldstein (1996) personal communication.",
		"",
		"Fst: The coancestry coefficient Theta has been estimated in Reynolds et al. as Theta = a / (a+b), where a is the variance between taxa and b is the variance within taxa. The distance Fst = -ln(1-Theta).",
		"","","","",
		"Dkf: The kinship coefficient kf is defined as 'the probability that a gene taken at random from I, at a given locus, be identical by descent to a gene taken at random from J at the same locus.' In general, for two individuals (or populations) A and B, it is calculated as kf = sum over all alleles of P[A(i)] P[B(i)], where P[A(i)] is the proportion (relative frequency) of allele i in individual (or population) A. The distance Dkf = -ln(kf), or Dkf' = 1-kf.",
		"",
		"Ddm: The (delta mu)^2 measure of distance (Ddm) for microsatellites was derived in order to improve on D1. Ddm has a smaller variance than D1 and, in populations at mutation-drift equilibrium, Ddm is independent of population size. It is based not on the mean squared difference, as is D1, but on the squared mean difference between alleles of two populations. Thus Ddm = (mu(A)-mu(B))^2, where mu(A) is the mean allele size for population A.",
		"Gst: Nei's identity for two taxa is defined as id = (sum over all alleles of P[A(i)] P[B(i)])/sqrt(sum over alleles of P[A(i)]^2 ž sum over alleles of P[B(i)]^2). Nei's standard distance Gst = -ln(id), or Gst' = 1-id.",
		"",
		"Dps: The proportion of shared alleles is defined as the mean of the minima of the relative frequencies of all alleles in the taxonomic units being compared (individuals or populations), i.e.: ps = (sum over all alleles of MIN{P[A(i)], P[B(i)]})/n, where n is the total number of alleles for all loci. The distance Dps = -ln(ps), or Dps' = 1-ps.",
		"",
		"Rst: Slatkin's Rst is an analogue of Wright's Fst, adapted to microsatellite loci by assuming a high-rate stepwise mutation model instead of a low-rate K- or infinite-allele mutation model. It's defined as Rst = (Sbar - Sw)/Sbar, where Sw is the sum over all loci of twice the weighted mean of the within-population variances V(A) and V(B), and Sbar is the sum over all loci of twice the variance V(A+B) of the combined population. Goodman's method, used here, calculates pairwise Rst as Vb/(Vb+Vw) using standardized repeat numbers, where Vw, within-group variance, is (V(A)+V(B))/2; Vb, the between-group variance is (MSb-MSwErr)/Nbar; Nbar, the average sample size, is nA+nB-(nA^2+nB^2)/(nA+nB); MSb, the mean square between groups, is nA*Xbar[A]+nB*Xbar[B]; and MSwErr, the within-group error, is Nbar*(V(A)/nA+V(B)/nB)/2.",
		"Dfs: The fuzzy set similarity measure is calculated by finding the set of alleles in population A (call it set A), the set in population B (call it set B), and dividing the cardinality of their intersection by the cardinality of their union, i.e., fs = |A^B| Ÿ |AvB|. The distance Dfs = -ln(fs), or Dfs' = 1-fs.",
		"","","",
		"Dsw: The stepwise weighted genetic distance measure is 'an extension of Nei's minimum genetic distance', and is based on frequency-weighted means of the absolute value of the difference in number of repeats over pairs of alleles i and j, both within and between populations. In general, for two individuals (or populations) X and Y, it is calculated as Dsw = dxyw - (dxw + dyw)/2, where dxw is the weighted mean within X, dyw is the weighted mean within Y, and dxyw is the weighted mean between X and Y, i.e., where i is in X and j is in Y.",
	"","",""};

	if (option) switch (option) {
		case '-':
			printf("For help on an option, type ? followed by the option.\n");
			return;
		case '?':
			printf("The help option prints a short explanation of a menu item. To return to the menu, hit <Return>.\n");
			break;
		case '!':
			if (menu==MAIN)
				printf("This begins the analysis.\n");
			else if (menu==STATISTICS)
				printf("This returns to the main menu.\n");
			else
				help(MAIN, '-');
			break;
		case '0':
			if (menu==DISTANCE)
				printf("This returns to the main menu with no distance selected.\n");
			else
				help(MAIN, '-');
			break;
		default:
			strcpy(buffer, helptext[option-'A'+menu*('Z'-'A'+1)]);
			if (strlen(buffer))
				printf("%s\n", buffer);
			else
				help(MAIN, '-');
			break;
	}
	else help(MAIN, '-');
	gets(buffer);
}

void getDistance(int *distanceFlag, int *featureFlag) {
	int n;
	char buffer[BUFFSIZE];
	
	printf("\nSelect the type of distance measure you want:\n");
	printf("  (A) Average square (D1)\n");
	printf("  (D) Absolute difference (Dad)\n");
	printf("  (F) Fst\n");
	printf("  (K) Kinship coefficient (Dkf)\n");
	printf("  (M) Delta mu squared (Ddm)\n");
	printf("  (N) Nei's standard (Gst)\n");
	printf("  (P) Proportion shared alleles (Dps)\n");
	printf("  (R) Standardized Rst\n");
	printf("  (S) Fuzzy set similarity (Dfs)\n");
	printf("  (W) Absolute product (Dsw)\n");
	printf("  (0) None\n");
	printf("  (?X) Help on option X\n");
	gets(buffer);
	n=toupper(buffer[0]);
	switch (n) {
		case '0': *distanceFlag=NO; break;
		case 'A': *distanceFlag=AS; break;
		case 'M': *distanceFlag=DM; break;
		case 'D': *distanceFlag=AD; break;
		case 'W': *distanceFlag=SW; break;
		case 'N': *distanceFlag=NI; break;
		case 'F': *distanceFlag=RF; break;
		case 'R': *distanceFlag=SR; break;
		case 'P': *distanceFlag=PS; break;
		case 'K': *distanceFlag=KF; break;
		case 'S': *distanceFlag=FS; break;
		case '?':
			help(DISTANCE, toupper(buffer[1]));
			getDistance(distanceFlag, featureFlag);
			break;
		default: 
			help(MAIN, '-');
			getDistance(distanceFlag, featureFlag);
	}
	if (*distanceFlag==PS||*distanceFlag==KF||
		*distanceFlag==FS||*distanceFlag==NI) {
		do {
			switch (*distanceFlag) {
				case PS: sprintf(buffer, "ps"); break;
				case KF: sprintf(buffer, "kf"); break;
				case FS: sprintf(buffer, "fs"); break;
				case NI: sprintf(buffer, "ni"); break;
			}
			printf("Transform by (L) -ln(%s) or by (M) 1-(%s)? ", buffer, buffer);
			gets(buffer);
			n=toupper(buffer[0]);
		} while (n!='L'&&n!='M');
		if (n=='L')
			*featureFlag|=LOGARITHM;
		else
			*featureFlag&=(LOGARITHM^COMPLEMENT);
	}
	if (*distanceFlag==KF) {
		printf("Adjust distance by subtracting pairwise average within? ");
		gets(buffer);
		if (toupper(buffer[0])=='Y')
			*featureFlag|=ADJUST;
		else
			*featureFlag&=(ADJUST^COMPLEMENT);
	}
	else if (*distanceFlag==DM) {
		printf("Adjust distance for small sample size? ");
		gets(buffer);
		if (toupper(buffer[0])=='Y')
			*featureFlag|=ADJUST;
		else
			*featureFlag&=(ADJUST^COMPLEMENT);
	}
}

void getStatistics(int *statisticsFlag, int *alleleFlag,
	int *outlierCriterion, float *mutationRate, int *primerError) {
	int n;
	char buffer[BUFFSIZE];
	
	printf("\nSelect the statistics you want:\n");
	printf("  %-30s [%s]\n", "(M)issing data", 
		*statisticsFlag&MISSING ? "Yes" : "No");
	printf("  %-30s [%s]\n", "(A)nomalous frequencies",
		*statisticsFlag&ANOMALY ? "Yes" : "No");
	printf("  %-30s [%s]\n", "(I)ncommensurable taxa", 
		*statisticsFlag&COMPARE ? "Yes" : "No");
	printf("  %-30s [%s]\n", "(T)axon-specific alleles", 
		*statisticsFlag&SPECIFIC ? "Yes" : "No");
	printf("  %-30s [%s]\n", "(P)airwise-specific alleles", 
		*statisticsFlag&PAIRWISE ? "Yes" : "No");
	printf("  %-30s [%s]\n", "(O)utliers", 
		*statisticsFlag&OUTLIER ? "Yes" : "No");
	if (*statisticsFlag&OUTLIER)
		printf("  %-30s [%d %s]\n", "(C)riterion for outliers", 
			*outlierCriterion, *statisticsFlag&HINGE ? "hinges" : "sigmas");
	printf("  %-30s [%s]\n", "(X)tabulation",
		*statisticsFlag&CROSSTAB ? "Yes" : "No");
	printf("  %-30s [%s]\n", "(L)inearity", 
		*statisticsFlag&LINEARITY ? "Yes" : "No");
	if (*statisticsFlag&LINEARITY) {
		printf("  %-30s [%f]\n", "(R)ate of mutation", *mutationRate);
		if (!*alleleFlag)
			printf("  %-30s [%d]\n", "(E)rror from primer", *primerError);
	}
	printf("  %-30s [%s]\n", "(F)requency distribution", 
		*statisticsFlag&FREQUENCY ? "Yes" : "No");
	printf("  %-30s [%s]\n", "(V)ariability", 
		*statisticsFlag&DIVERSITY ? "Yes" : "No");
	printf("  (?X) Help on option X\n");
	printf("  %-30s\n", " ! Back to main menu");
	gets(buffer);
	n=toupper(buffer[0]);
	switch (n) {
		case '?': help(STATISTICS, toupper(buffer[1])); break;
		case '!': return;
		case 'A': *statisticsFlag^=ANOMALY; break;
		case 'C': 
			if (*statisticsFlag&HINGE)
				printf("Enter number of interhinge ranges (+/-): ");
			else
				printf("Enter number of standard deviations (+/-): ");
			gets(buffer);
			*outlierCriterion=atoi(buffer);
			break;
		case 'E': if (!*alleleFlag) {
			printf("Enter length of primer error: ");
			gets(buffer);
			*primerError=atoi(buffer);
			}
			break;
		case 'F': *statisticsFlag^=FREQUENCY;
			if (*statisticsFlag&FREQUENCY) {
				do {
					printf("%s by (L)ocus or by (T)axon? ", *statisticsFlag&DIVERSITY ? 
						"Frequency and variability" : "Frequency");
					gets(buffer);
					n=toupper(buffer[0]);
				} while (n!='L'&&n!='T');
				if (n=='T')
					*statisticsFlag|=BYTAXON;
				else
					*statisticsFlag&=(BYTAXON^COMPLEMENT);
			}
			break;
		case 'I': *statisticsFlag^=COMPARE; break;
		case 'L': *statisticsFlag^=LINEARITY; break;
		case 'M': *statisticsFlag^=MISSING; break;
		case 'O': n=0;
			do {
				printf("(H)inge checking, (S)igma checking, or (N)one? ");
				gets(buffer);
				n=toupper(buffer[0]);
			} while (n!='H'&&n!='S'&&n!='N');
			switch (n) {
				case 'H':
					*statisticsFlag|=OUTLIER;
					*statisticsFlag|=HINGE;
					*outlierCriterion=DEFAULTHINGE;
					break;
				case 'S': 
					*statisticsFlag|=OUTLIER;
					*statisticsFlag&=(HINGE^COMPLEMENT);
					*outlierCriterion=DEFAULTSIGMA;
					break;
				case 'N':
					*statisticsFlag&=(OUTLIER^COMPLEMENT);
					break;
			}
			break;
		case 'P': *statisticsFlag^=PAIRWISE; break;
		case 'R':
			printf("Enter mutation rate: ");
			gets(buffer);
			*mutationRate=atof(buffer);
			break;
		case 'T': *statisticsFlag^=SPECIFIC; break;
		case 'V': *statisticsFlag^=DIVERSITY;
			if (*statisticsFlag&DIVERSITY) {
				do {
					printf("%s by (L)ocus or by (T)axon? ", *statisticsFlag&FREQUENCY ? 
						"Frequency and variability" : "Variability");
					gets(buffer);
					n=toupper(buffer[0]);
				} while (n!='L'&&n!='T');
				if (n=='T')
					*statisticsFlag|=BYTAXON;
				else
					*statisticsFlag&=(BYTAXON^COMPLEMENT);
			}
			break;
		case 'X': *statisticsFlag^=CROSSTAB; break;
	}
	getStatistics(statisticsFlag, alleleFlag, 
		outlierCriterion, mutationRate, primerError);
}

void getOptions(int *distanceFlag, int *statisticsFlag, int *featureFlag,
	int *outlierCriterion, int *alleleFlag, int *repeatLength, 
	int *primerError, float *mutationRate, int *bootstraps, 
	int *printFlag, char *infileName, char *outfileName) {
	int n;
	char buffer[BUFFSIZE];
	
	printf("\nSelect the option you wish to change:\n");
/* Display data transformation options */
	printf("  %-30s [%s]\n", "(A)llele sizes", 
		*alleleFlag ? "# Repeats" : "# Nucleotides");
	if (!*alleleFlag)
		printf("  %-30s [%d]\n", "(R)epeat length", *repeatLength);
/* Display distance option */
	printf("  %-30s", "(D)istances");
	switch (*distanceFlag) {
		case AD: printf(" [Dad: Absolute difference]\n"); break;
		case AS: printf(" [D1: Average square]\n"); break;
		case DM: 
			if (*featureFlag&ADJUST)
				printf(" [Ddm*: Adjusted delta mu squared]\n");
			else
				printf(" [Ddm: Delta mu squared]\n");
			break;
		case FS:
			if (*featureFlag&LOGARITHM)
				printf(" [Dfs: -ln(Fuzzy similarity)]\n");
			else
				printf(" [Dfs': 1-(Fuzzy similarity)]\n");
			break;
		case KF:
			if (*featureFlag&LOGARITHM)
				if (*featureFlag&ADJUST)
					printf(" [Dkf*: Adjusted -ln(Kinship coefficient)]\n");
				else
					printf(" [Dkf: -ln(Kinship coefficient)]\n");
			else
				if (*featureFlag&ADJUST)
					printf(" [Dkf*': Adjusted 1-(Kinship coefficient)]\n");
				else
					printf(" [Dkf': 1-(Kinship coefficient)]\n");
			break;
		case NI:
			if (*featureFlag&LOGARITHM)
				printf(" [Gst: -ln(Nei's identity)]\n");
			else
				printf(" [Gst': 1-(Nei's identity)]\n");
			break;
		case PS: 
			if (*featureFlag&LOGARITHM)
				printf(" [Dps: -ln(Proportion shared alleles)]\n");
			else
				printf(" [Dps': 1-(Proportion shared alleles)]\n");
			break;
		case RF: printf(" [Fst]\n"); break;
		case SW: printf(" [Dsw: Absolute product]\n"); break;
		case SR: printf(" [Standardized Rst]\n"); break;
		default: *distanceFlag=NO; printf(" [No]\n"); break;
	}
	if (*distanceFlag&&(*bootstraps<=1))
		printf("  %-30s [%s]\n", "(L)ocus-specific distances", 
			*featureFlag&BYLOCUS ? "Yes" : "No");
/* Display statistics */
	printf("  %-30s [%s]\n", "(S)tatistics", *statisticsFlag ? "Yes" : "No");
	if (*statisticsFlag) {
		if (*statisticsFlag&MISSING)
			printf("  %-30s\n", "   Missing data");
		if (*statisticsFlag&ANOMALY)
			printf("  %-30s\n", "   Anomalous frequencies");
		if (*statisticsFlag&COMPARE)
			printf("  %-30s\n", "   Incommensurable taxa");
		if (*statisticsFlag&SPECIFIC)
			printf("  %-30s\n", "   Taxon-specific alleles");
		if (*statisticsFlag&PAIRWISE)
			printf("  %-30s\n", "   Pairwise-specific alleles");
		if (*statisticsFlag&CROSSTAB)
			printf("  %-30s\n", "   Crosstabulation");
		if (*statisticsFlag&OUTLIER)
			printf("  %-30s [%d %s]\n", "   Outliers",
				*outlierCriterion, *statisticsFlag&HINGE ? "hinges" : "sigmas");
		if (*statisticsFlag&LINEARITY)
			printf("  %-30s\n", "   Linearity");
		if (*statisticsFlag&FREQUENCY)
			printf("  %-30s [by %s]\n", "   Frequency distribution",
				*statisticsFlag&BYTAXON ? "taxon" : "locus");
		if (*statisticsFlag&DIVERSITY)
			printf("  %-30s [by %s]\n", "   Variability",
				*statisticsFlag&BYTAXON ? "taxon" : "locus");
	}
/* Display bootstrap options here */
	if (*bootstraps>1)
		if (*distanceFlag)
			printf("  %-30s [%s %d]\n", "(B)ootstraps", 
				*printFlag==ALL ? "All of" : "Mean of", *bootstraps);
		else
			printf("  %-30s [%d]\n", "(B)ootstraps", *bootstraps);
	else 
		if ((*distanceFlag&&!(*featureFlag&BYLOCUS))||*statisticsFlag&DIVERSITY)
			printf("  %-30s [0]\n", "(B)ootstraps");
/* Display I/O options */
	printf("  %-30s [%s]\n", "(I)nput data from file name", infileName);
	printf("  %-30s [%s]\n", "(O)utput results to file name", outfileName);
	printf("  (?X) Help on option X\n");
	printf("  %-30s\n", " ! Ready");
/* Pick up the choice */
	gets(buffer);
	n=toupper(buffer[0]);
	switch (n) {
		case '?': help(MAIN, toupper(buffer[1])); break;
		case '!': 
			if ((*distanceFlag)&&(*bootstraps<=1))
				*printFlag=MEAN;
			if (*alleleFlag) {
				*repeatLength=1;
				*primerError=0;
			}
			return;
		case 'A': *alleleFlag=1- *alleleFlag; break;
		case 'B':
			printf("Enter number of bootstraps: ");
			gets(buffer);
			*bootstraps=atoi(buffer);
			*printFlag=ERROR;
			if (*distanceFlag&&*bootstraps) do {
				printf("Show (A)ll or (M)ean of bootstrapped distances? "); 
				gets(buffer);
				switch (toupper(buffer[0])) {
					case 'A': *printFlag=ALL; break;
					case 'M': *printFlag=MEAN; break;
					default: *printFlag=ERROR;
				}
			} while (!*printFlag);
			break;
		case 'D': getDistance(distanceFlag, featureFlag); break;
		case 'I': printf("Enter name of input file: ");
			gets(buffer);
			strcpy(infileName, buffer);
			break;
		case 'L': *featureFlag^=BYLOCUS; break;
		case 'O': printf("Enter name of output file: ");
			gets(buffer);
			strcpy(outfileName, buffer);
			break;
		case 'R': if (!*alleleFlag) {
			printf("Enter length of repeat: ");
			gets(buffer);
			*repeatLength=atoi(buffer);
			}
			break;
		case 'S': 
			getStatistics(statisticsFlag, alleleFlag, 
				outlierCriterion, mutationRate, primerError); 
			break;
		default:
			help(MAIN, '-');
			break;
	}
	getOptions(distanceFlag, statisticsFlag, featureFlag, outlierCriterion,
		alleleFlag, repeatLength, primerError, mutationRate, bootstraps, printFlag, 
		infileName, outfileName);
}

void main(int argc, char *argv[]) {
	FILE *infile, *outfile;
/* Strings */
	char buffer[BUFFSIZE];
	char *token, *infileName, *outfileName;
/* Flags */
	int distanceFlag, statisticsFlag, alleleFlag, printFlag, frequencyFlag, generateFlag, featureFlag;
/* Scalars */
	int i, j, k, ii, jj, bootstrap;
	int locus, loci, taxon, taxa;
	int character, freq;
	int repeatLength, primerError, bootstraps;
	int min, max, med, mod, repeat, poly;
	int loHinge, hiHinge, outlierCriterion;
	float diff1, tau1tot, tau1avg, tau1, diff2, tau2tot, tau2avg, tau2;
	float Mr, Mravg, Mrtot, Mf, Mfavg, Mftot;
	float mutationRate, minCrit, maxCrit, meanSize, sdSize;
	float varFstVar, varFstHet, varHet , varHetTot, varVar, varVarTot;
	float varAll, varAllTot, varRan, varRanTot, varMax, varMaxTot;
	float totFstVar, totFstHet, totHet, totHetTot, totVar, totVarTot;
	float totAll, totAllTot, totRan, totRanTot, totMax, totMaxTot;
	float varEnt, varEntTot, totEnt, totEntTot, varRstStd, totRstStd;
	float avg, totAvg, totMed, totMod, totMin;
/* Vectors */				
	int *bootLocus;
	float *variance;
/* Matrices */
	float **distance, **meanDistance, **sdDistance;
/* Structures */
	struct Lexicon *taxonNames, *locusNames, *iNames;
	struct Bag ***data, ***rawData;
	struct Bag **locusData, **taxonData, **iData;
	struct Bag *total;

/* Initialize */	
	infile=stdin;
	outfile=stdout;
	mutationRate=DEFAULTMUTATION;
	repeatLength=DEFAULTREPEAT;
	primerError=DEFAULTPRIMER;
	outlierCriterion=ERROR;
	distanceFlag=statisticsFlag=featureFlag=alleleFlag=printFlag=0;
	bootstraps=1;
	infileName=myalloc(BUFFSIZE);
	outfileName=myalloc(BUFFSIZE);
	strcpy(infileName, "stdin");
	strcpy(outfileName, "stdout");
	do {
		getOptions(&distanceFlag, &statisticsFlag, &featureFlag, &outlierCriterion, 
			&alleleFlag, &repeatLength, &primerError, &mutationRate, &bootstraps, 
			&printFlag, infileName, outfileName);
		if (strcmp(infileName, "stdin")) {
			infile=fopen(infileName, "r");
			if (infile==NULL)
				printf("Can't find data file named %s\n", infileName);
		}
		if (strcmp(outfileName, "stdout")) {
			outfile=fopen(outfileName, "w");
			if (outfile==NULL)
				printf("Can't create output file named %s\n", outfileName);
		}
	} while (infile==NULL||outfile==NULL);
/* Allocate some structs */
	taxonNames=initLexicon();
	locusNames=initLexicon();
/* Count the taxa and loci, and build the lexica */
	taxa=loci=0;
	while (fgets(buffer, BUFFSIZE, infile)) 
	if ((buffer[0]!='%')&&fieldCheck(buffer)) {
		token=strtok(buffer, " \t\n");
		taxon=getLexWord(taxonNames, token);
		taxa=MAX(taxon, taxa);
		token=strtok(NULL, " \t\n");
		if (!(locus=findLexWord(locusNames, token))) {
			locus=getLexWord(locusNames, token);
			loci=MAX(locus, loci);
		}
	}
/* Initialize the arrays */
	distance=myalloc(taxa*sizeof(float*));
	meanDistance=myalloc(taxa*sizeof(float*));
	sdDistance=myalloc(taxa*sizeof(float*));
	for (i=0; i<taxa; i++) {
		distance[i]=myalloc(taxa*sizeof(float));
		meanDistance[i]=myalloc(taxa*sizeof(float));
		sdDistance[i]=myalloc(taxa*sizeof(float));
		for (j=0; j<taxa; j++)
			meanDistance[i][j]=sdDistance[i][j]=0;
	}
	locusData=myalloc(loci*sizeof(struct Bag*));
	taxonData=myalloc(taxa*sizeof(struct Bag*));
	for (locus=0; locus<loci; locus++)
		locusData[locus]=initBag();
	for (taxon=0; taxon<taxa; taxon++)
		taxonData[taxon]=initBag();
	data=myalloc(taxa*sizeof(struct Bag**));
	rawData=myalloc(taxa*sizeof(struct Bag**));
	for (taxon=0; taxon<taxa; taxon++) {
		data[taxon]=myalloc(loci*sizeof(struct Bag*));
		rawData[taxon]=myalloc(loci*sizeof(struct Bag*));
		for (locus=0; locus<loci; locus++) {
			data[taxon][locus]=initBag();
			rawData[taxon][locus]=initBag();
		}
	}
	bootLocus=myalloc(loci*sizeof(int));
/* Read in the data */
	rewind(infile);
	while (fgets(buffer, BUFFSIZE, infile)) 
	if ((buffer[0]!='%')&&fieldCheck(buffer)) {
		token=strtok(buffer, " \t");
		taxon=findLexWord(taxonNames, token)-1;
		token=strtok(NULL, " \t");
		locus=findLexWord(locusNames, token)-1;
		character=atoi(strtok(NULL, " \t\n"));
		token=strtok(NULL, " \t\n");
		if (token)
			freq=atoi(token);
		else
			freq=1;
		bump(rawData[taxon][locus], character, freq);
		bump(locusData[locus], character, freq);
		if (freq==0) {
			printf("Warning: taxon %s locus %s allele %d has been assigned a frequency of zero.\nThis will be treated as missing data.\n",
				lexWord(taxonNames, taxon+1), lexWord(locusNames, locus+1), character);
		}
	}
	fclose(infile);

/* Check for bootstrap count */
	if (bootstraps>1) {
		bootstrap=MIN(BIGINT, binomial(2*loci-1, loci));
		generateFlag=0;
/* Query if requested bootstraps are not likely to sample completely */
		k=bootstrap*pow(1.0-1.0/bootstrap, bootstraps);
		if (bootstrap<bootstraps||bootstrap<BIGINT&&k>0) {
			printf("There are only %d possible bootstrap samplings for %d loci.\n",
				bootstrap, loci);
			if (k)
				printf("%d bootstraps will probably include only ~%d of them.\n", 
					bootstraps, bootstrap-k);
			printf("Do you want them generated exhaustively instead? ");
			gets(buffer);
			if (toupper(buffer[0])=='Y') {
				generateFlag=1;
				bootstraps=bootstrap;
			}
		}
	}
/* Check for conforming repeat lengths */
	for (locus=0; locus<loci; locus++) {
		min=bagMinimum(locusData[locus]);
		max=bagMaximum(locusData[locus]);
		repeat=repeatLength;
		if (min==max)
			printf("Warning: locus %s is not polymorphic.\n", 
				lexWord(locusNames, locus+1));
		else if ((repeatLength>1)&&
			(repeat=minimumIncrement(locusData[locus], min, max))!=repeatLength) {
			printf("Warning: locus %s appears to have a repeat length of %d\n",
				lexWord(locusNames, locus+1), repeat);
			showIncrement(locusData[locus], min, max, repeat);
			printf("Treat it as repeat length %d? ", repeat);
			gets(buffer);
			if (toupper(buffer[0])!='Y')
				repeat=repeatLength;
		}
/* Check for short alleles */
		if ((min-primerError)<0) {
			printf("Error: locus %s has alleles shorter than the primer error of %d.\nPrimer error will be decreased to %d.\n",
				lexWord(locusNames, locus+1), primerError, min);
			primerError=min;
		}
/* Convert data to repeat score form */
		for (taxon=0; taxon<taxa; taxon++)
			for (j=min; j<=max; j++)
				bump(data[taxon][locus], 
					(j-primerError)/repeat, frequency(rawData[taxon][locus], j));
/* Erase the raw data and accumulate the totals */
		emptyBag(locusData[locus]);
		for (taxon=0; taxon<taxa; taxon++) {
			addBag(locusData[locus], data[taxon][locus]);
			freeBag(rawData[taxon][locus]);
		}
		
	}
	for (locus=0; locus<loci; locus++)
		for (taxon=0; taxon<taxa; taxon++)
			addBag(taxonData[taxon], data[taxon][locus]);
/* Checking for outliers? */
	if (statisticsFlag&OUTLIER) {
		total=initBag();
		for (locus=0; locus<loci; locus++) {
			emptyBag(total);
			addBag(total, locusData[locus]);
			if (statisticsFlag&HINGE) {
				loHinge=bagLoHinge(total);
				hiHinge=bagHiHinge(total);
				minCrit=loHinge-outlierCriterion*(hiHinge-loHinge);
				maxCrit=hiHinge+outlierCriterion*(hiHinge-loHinge);
			}
			else {
				meanSize=bagMean(total);
				sdSize=sqrt(bagVariance(total));
				minCrit=meanSize-outlierCriterion*sdSize;
				maxCrit=meanSize+outlierCriterion*sdSize;
			}
			min=bagMinimum(total);
			max=bagMaximum(total);
			for (j=min; j<=max; j++)
				if (j>minCrit&&j<maxCrit)
					empty(total, j);
			if (bagCardinality(total)) {
				fprintf(outfile, "Locus %s has outlier alleles of size", 
					lexWord(locusNames, locus+1));
				min=bagMinimum(total);
				max=bagMaximum(total);
				for (j=min; j<=max; j++)
					if (frequency(total, j)) {
						fprintf(outfile, " %d", j);
						for (taxon=0; taxon<taxa; taxon++)
							empty(data[taxon][locus], j);
						empty(locusData[locus], j);
					}
				fprintf(outfile, "\n");
			}
		}
		freeBag(total);
	}
/* Checking for missing data (empty cells)? */
	if (statisticsFlag&MISSING) {
		frequencyFlag=0;
		fprintf(outfile, "\nMissing data:\n");
		for (locus=0; locus<loci; locus++)
			for (taxon=0; taxon<taxa; taxon++)
				if ((k=bagFrequency(data[taxon][locus]))==0) {
					if (!frequencyFlag)
						fprintf(outfile, "%-15s%-15s\n", "Locus", "Taxon");
					frequencyFlag=1;
					fprintf(outfile, "%-15s%-15s\n",
						lexWord(locusNames, locus+1), lexWord(taxonNames, taxon+1));
				}
		if (!frequencyFlag)
			fprintf(outfile, "None\n");
	}
/* Checking for odd frequencies? */
	if (statisticsFlag&ANOMALY) {
		frequencyFlag=0;
		fprintf(outfile, "\nOdd frequencies:\n");
		for (locus=0; locus<loci; locus++)
			for (taxon=0; taxon<taxa; taxon++)
				if (((k=bagFrequency(data[taxon][locus]))%2)==1) {
					if (!frequencyFlag)
						fprintf(outfile, "%s %-15s%-15s\n", "Number", "Locus", "Taxon");
					frequencyFlag=1;
					fprintf(outfile, "%6d %-15s%-15s\n", k,
						lexWord(locusNames, locus+1), lexWord(taxonNames, taxon+1));
				}
		if (!frequencyFlag)
			fprintf(outfile, "None\n");
	}
/* Checking for incommensurable taxon pairs? */
	if (statisticsFlag&COMPARE) {
		frequencyFlag=0;
		fprintf(outfile, "\nPairs with no loci in common:\n");
		for (i=1; i<taxa; i++)
			for (j=0; j<i; j++) {
				k=0;
				for (locus=0; locus<loci; locus++)
					k+=bagCardinality(data[i][locus])*bagCardinality(data[j][locus]);
				if (!k) {
					frequencyFlag=1;
					fprintf(outfile, "%-15s%-15s\n",
						lexWord(taxonNames, i+1), lexWord(taxonNames, j+1));
				}
			}
		if (!frequencyFlag)
			fprintf(outfile, "None\n");
	}
/* Checking for taxon-specific alleles? */
	if (statisticsFlag&SPECIFIC) {
		frequencyFlag=0;
		fprintf(outfile, "\nTaxon-specific alleles:\n");
		for (locus=0; locus<loci; locus++) {
			min=bagMinimum(locusData[locus]);
			max=bagMaximum(locusData[locus]);
			for (i=0; i<taxa; i++)
				for (j=min; j<=max; j++)
					if (k=frequency(data[i][locus], j))
						if (k==frequency(locusData[locus], j)) {
							if (!frequencyFlag)
								fprintf(outfile, "%10s%6s%14s\n", 
									"Locus", "Allele", "Taxon");
							frequencyFlag=1;
							fprintf(outfile, "%10.10s%6d%14s\n",
								lexWord(locusNames, locus+1), j, lexWord(taxonNames, i+1));
						}
		}
		if (!frequencyFlag)
			fprintf(outfile, "None\n");
	}
/* Frequency of pairwise taxon-specific alleles? */
	if (statisticsFlag&PAIRWISE) {
		fprintf(outfile, "\nFrequency of pairwise taxon-specific alleles:\n");
		fprintf(outfile, "%10s%10s%10s\n", "Locus", "Frequency", "Percent");
		for (locus=0; locus<loci; locus++) {
			min=bagMinimum(locusData[locus]);
			max=bagMaximum(locusData[locus]);
			freq=0;
			for (i=1; i<taxa; i++)
				for (j=0; j<i; j++)
					for (k=min; k<=max; k++)
						freq+=((frequency(data[i][locus], k)>0)^
							(frequency(data[j][locus], k)>0));
			fprintf(outfile, "%10.10s%10d%10.3f\n", lexWord(locusNames, locus+1), 
				freq, 50.0*freq/((taxa*taxa-taxa)*bagCardinality(locusData[locus])));
		}
	}
/* Want distances? */
	if (distanceFlag) {
/* If distances are wanted by locus, hack some parameters */
		if (featureFlag&BYLOCUS) {
			bootstraps=loci;
			printFlag=ALL;
		}
/* Rst is a special case: we have to pass the variance for each locus  */
		if (distanceFlag==SR) {
			variance=myalloc(loci*sizeof(float));
			for (locus=0; locus<loci; locus++) {
				variance[locus]=bagVariance(locusData[locus]);
			}
		}
/* Loop through the bootstraps */
		for (bootstrap=0; bootstrap<bootstraps; bootstrap++) {
/* Sample loci with replacement */
			if (bootstraps>1)
				if (generateFlag)
					if (bootstrap) {
						i=0;
						do {
							bootLocus[i]=(bootLocus[i]+1)%loci;
						} while (!bootLocus[i++]);
						locus=bootLocus[i-1];
						do {
							bootLocus[--i]=locus;
						} while (i);
					}
					else
						for (locus=0; locus<loci; locus++)
							bootLocus[locus]=0;
				else 
					if (featureFlag&BYLOCUS)
						bootLocus[0]=bootstrap;
				else for (locus=0; locus<loci; locus++)
					bootLocus[locus]=iran(loci);
			else
				for (locus=0; locus<loci; locus++)
					bootLocus[locus]=locus;
/* Initialize and build the distance matrix */
			for (i=0; i<taxa; i++)
				for (j=0; j<taxa; j++)
					distance[i][j]=0;
			switch (distanceFlag) {
				case DM: 
					DMDistance(taxa, loci, data, distance, bootLocus, featureFlag); 
					break;
				case PS: 
					PSDistance(taxa, loci, data, distance, bootLocus, featureFlag); 
					break;
				case KF: 
					KFDistance(taxa, loci, data, distance, bootLocus, featureFlag); 
					break;
				case FS: 
					FSDistance(taxa, loci, data, distance, bootLocus, featureFlag); 
					break;
				case NI: 
					NIDistance(taxa, loci, data, distance, bootLocus, featureFlag); 
					break;
				case AD: 
					ADDistance(taxa, loci, data, distance, bootLocus, featureFlag); 
					break;
				case SW: 
					SWDistance(taxa, loci, data, distance, bootLocus, featureFlag); 
					break;
				case RF: 
					RFDistance(taxa, loci, data, distance, bootLocus, featureFlag); 
					break;
				case AS: 
					ASDistance(taxa, loci, data, distance, bootLocus, featureFlag); 
					break;
				case SR: 
					SRDistance(taxa, loci, data, distance, variance, 
						bootLocus, featureFlag); 
					break;
			}
/* Either print the matrix... */
			if (printFlag==ALL) {
				if (featureFlag&BYLOCUS)
					fprintf(outfile, "%d locus=%s\n", 
						taxa, lexWord(locusNames, bootstrap+1));
				else
					fprintf(outfile, "%d\n", taxa);
				for (i=0; i<taxa; i++) {
					fprintf(outfile, "%10.10s", lexWord(taxonNames, i+1));
					for (j=0; j<i; j++)
						fprintf(outfile, " %.3f", distance[i][j]);
					fprintf(outfile, "\n");
				}
			}
/* ...or accumulate distances */
			else for (i=0; i<taxa; i++)
				for (j=0; j<taxa; j++) {
					meanDistance[i][j]+=distance[i][j];
					sdDistance[i][j]+=distance[i][j]*distance[i][j];
				}
		}
/* Get means and maybe standard deviations */
		for (i=0; i<taxa; i++)
			for (j=0; j<taxa; j++) {
				meanDistance[i][j]/=bootstraps;
				if (bootstraps>1)
					sdDistance[i][j]=sqrt((sdDistance[i][j]-
						bootstraps*meanDistance[i][j]*meanDistance[i][j])/(bootstraps-1));
			}
/* Print distance matrix in PHYLIP format */
		if (printFlag==MEAN) {
			fprintf(outfile, "%d\n", taxa);
			for (i=0; i<taxa; i++) {
				fprintf(outfile, "%10.10s", lexWord(taxonNames, i+1));
				for (j=0; j<i; j++)
					fprintf(outfile, " %.3f", meanDistance[i][j]);
				fprintf(outfile, "\n");
			}
/* Maybe print standard error matrix */
			if (bootstraps>1) {
				fprintf(outfile, "\nStandard errors of distances:\n");
				for (i=0; i<taxa; i++) {
					fprintf(outfile, "%10.10s", lexWord(taxonNames, i+1));
					for (j=0; j<i; j++)
						fprintf(outfile, " %.3f", sdDistance[i][j]);
					fprintf(outfile, "\n");
				}
			}
		}
	}
/* Want estimates for linearity? */
	if (statisticsFlag&LINEARITY) {
		tau1avg=tau2avg=tau1tot=tau2tot=Mravg=Mfavg=Mrtot=Mftot=ii=0;
		fprintf(outfile, "\nLinearity estimates:\n");
		fprintf(outfile, "%15s%13s%15s\n", "Locus", "tau(range)", "tau(maximum)");
		for (locus=0; locus<loci; locus++) 
			if (bagCardinality(locusData[locus])>1) {
			ii++;
			diff1=bagMaximum(locusData[locus])-bagMinimum(locusData[locus]);
			diff2=bagMaximum(locusData[locus]);
			tau1avg+=diff1;
			tau2avg+=diff2;
			tau1=TAU(diff1);
			tau2=TAU(diff2);
			tau1tot+=tau1;
			tau2tot+=tau2;
			jj=0;
			for (taxon=0; taxon<taxa; taxon++)
				if (bagFrequency(data[taxon][locus]))
					jj++;
			Mr=rst(taxa, locus, data, locusData[locus]);
			Mf=fst(taxa, locus, data, locusData[locus]);
			Mravg+=Mr;
			Mfavg+=Mf;
			Mr=(jj-1)*(1/Mr-1)/(4*jj);
			Mf=(1/Mf-1)/4;
			Mrtot+=Mr;
			Mftot+=Mf;
			fprintf(outfile, "%15s%13.0f%15.0f\n", 
				lexWord(locusNames, locus+1), tau1, tau2);
		}
		fprintf(outfile, "%-15s%13.0f%15.0f\n", 
			"By average", TAU(tau1avg/ii), TAU(tau2avg/ii));
		fprintf(outfile, "%-15s%13.0f%15.0f\n", 
			"Average", tau1tot/ii, tau2tot/ii);
	}
/* Want a crosstabulation of locus by taxon frequencies? */
	if (statisticsFlag&CROSSTAB) {
		fprintf(outfile, "\nCrosstabulation:\n");
/* Find the longest taxon name */
		max=0;
		for (taxon=0; taxon<taxa; taxon++)
			if ((min=strlen(lexWord(taxonNames, taxon+1)))>max)
				max=min;
/* Print the taxon names vertically */
		for (i=0; i<max; i++) {
			fprintf(outfile, "%10s", " ");
			for (taxon=0; taxon<taxa; taxon++) {
				token=lexWord(taxonNames, taxon+1);
				if (i<strlen(token))
					fprintf(outfile, "%3c", token[i]);
				else
					fprintf(outfile, "   ");
			}
			fprintf(outfile, "\n");
		}
		for (locus=0; locus<loci; locus++) {
			fprintf(outfile, "%10.10s", lexWord(locusNames, locus+1));
			for (taxon=0; taxon<taxa; taxon++)
				fprintf(outfile, "%3d", bagFrequency(data[taxon][locus]));
			fprintf(outfile, "%6d\n", bagFrequency(locusData[locus]));
		}
		fprintf(outfile, "%10s", " ");
		for (taxon=0; taxon<taxa; taxon++)
			fprintf(outfile, "%3d", bagFrequency(taxonData[taxon]));
		fprintf(outfile, "\n");
	}
/* Want the measures of central tendency and the frequency distribution? */
	if (statisticsFlag&FREQUENCY) {
		fprintf(outfile, "\nFrequency distribution:\n");
		if (statisticsFlag&BYTAXON) {
			ii=taxa;
			iNames=taxonNames;
			iData=taxonData;
		}
		else {
			ii=loci;
			iNames=locusNames;
			iData=locusData;
		}
		fprintf(outfile, "%10s%9s%9s%9s%9s%9s  %s\n", 
			statisticsFlag&BYTAXON ? "Taxon" : "Locus",
			"Mean", "Median", "Mode", "Minimum", "Maximum", "Frequencies");
		totAvg=totMed=totMod=totMin=totMax=0;
		for (i=0; i<ii; i++) {
			avg=bagMean(iData[i]);
			med=bagMedian(iData[i]);
			mod=bagMode(iData[i]);
			min=bagMinimum(iData[i]);
			max=bagMaximum(iData[i]);
			fprintf(outfile, "%10.10s%9.3f%9d%9d%9d%9d", 
				lexWord(iNames, i+1), avg, med, mod, min, max);
			for (j=min; j<=max; j++)
				fprintf(outfile, "%6d", frequency(iData[i], j));
			fprintf(outfile, "\n");
			totAvg+=avg;
			totMed+=med;
			totMod+=mod;
			totMin+=min;
			totMax+=max;
		}
		totAvg/=ii; totMed/=ii; totMod/=ii; totMin/=ii; totMax/=ii; 
		fprintf(outfile, "%-10s%9.3f%9.3f%9.3f%9.3f%9.3f\n", 
			"Average", totAvg, totMed, totMod, totMin, totMax);
	}
/* Want indices of variability? */
	if (statisticsFlag&DIVERSITY) {
		fprintf(outfile, "\nDiversity indices:\n");
		if (statisticsFlag&BYTAXON) {
			ii=taxa;
			jj=loci;
			iNames=taxonNames;
			iData=taxonData;
		}
		else {
			ii=loci;
			jj=taxa;
			iNames=locusNames;
			iData=locusData;
		}
		fprintf(outfile, 
			"%10s%9s%9s%9s%9s%9s%9s%9s%9s%9s%9s%9s%9s%9s%9s%9s\n",
			statisticsFlag&BYTAXON ? "Taxon" : "Locus",
			"Fst Var", "Fst Het", "Rst Std", "Avg Het", "Tot Het", "Avg Var", 
			"Tot Var", "Avg All", "Tot All", "Avg Ran", "Tot Ran", "Avg Max", 
			"Tot Max", "Avg Ent", "Tot Ent");
		totFstVar=totFstHet=totRstStd=totHet=totHetTot=totVar=totVarTot=0;
		totAll=totAllTot=totRan=totRanTot=totMax=totMaxTot=poly=0;
		totEnt=totEntTot=0;
		for (i=0; i<ii; i++) 
			if (bagCardinality(iData[i])>1) {
				poly++;
				varHet=varVar=varAll=varRan=varMax=varEnt=k=0;
				for (j=0; j<jj; j++) {
					if (statisticsFlag&BYTAXON) {
						taxon=i;
						locus=j;
					}
					else {
						taxon=j;
						locus=i;
					}
					if (bagFrequency(data[taxon][locus])) {
						varVar+=bagVariance(data[taxon][locus]);
						varHet+=heterozygosity(data[taxon][locus]);
						varAll+=bagCardinality(data[taxon][locus]);
						varRan+=bagMaximum(data[taxon][locus])-
							bagMinimum(data[taxon][locus]);
						varMax+=bagMaximum(data[taxon][locus]);
						varEnt+=continuousRelativeBagEntropy(data[taxon][locus]);
						k++;
					}
				}
				if (statisticsFlag&BYTAXON) {
					varFstVar=0;
					varRstStd=0;
				}
				else {
					varFstVar=fst(jj, i, data, iData[i]);
					varRstStd=rst(jj, i, data, iData[i]);
				}
				varHet/=k;
				varVar/=k;
				varAll/=k;
				varRan/=k;
				varMax/=k;
				varEnt/=k;
				varHetTot=heterozygosity(iData[i]);
				varVarTot=bagVariance(iData[i]);
				varFstHet=1-varHet/varHetTot;
				varAllTot=bagCardinality(iData[i]);
				varRanTot=bagMaximum(iData[i])-bagMinimum(iData[i]);
				varMaxTot=bagMaximum(iData[i]);
				varEntTot=continuousRelativeBagEntropy(iData[i]);
				fprintf(outfile, 
					"%10.10s%9.3f%9.3f%9.3f%9.3f%9.3f%9.3f%9.3f%9.3f%9.0f%9.3f%9.0f%9.3f%9.0f%9.3f%9.3f\n", 
					lexWord(iNames, i+1), varFstVar, varFstHet, varRstStd, varHet, 
					varHetTot, varVar, varVarTot, varAll, varAllTot, varRan, 
					varRanTot, varMax, varMaxTot, varEnt, varEntTot);
				totFstVar+=varFstVar;
				totFstHet+=varFstHet;
				totRstStd+=varRstStd;
				totHet+=varHet;
				totHetTot+=varHetTot;
				totVar+=varVar;
				totVarTot+=varVarTot;
				totAll+=varAll;
				totAllTot+=varAllTot;
				totRan+=varRan;
				totRanTot+=varRanTot;
				totMax+=varMax;
				totMaxTot+=varMaxTot;
				totEnt+=varEnt;
				totEntTot+=varEntTot;
			}
		totFstVar/=poly; totFstHet/=poly; totRstStd/=poly;
		totHet/=poly; totHetTot/=poly; totVar/=poly; 
		totVarTot/=poly; totAll/=poly; totAllTot/=poly; 
		totRan/=poly; totRanTot/=poly; totMax/=poly; totMaxTot/=poly;
		totEnt/=poly; totEntTot/=poly;
		fprintf(outfile, 
			"%-10s%9.3f%9.3f%9.3f%9.3f%9.3f%9.3f%9.3f%9.3f%9.3f%9.3f%9.3f%9.3f%9.3f%9.3f%9.3f\n", 
			"Average", totFstVar, totFstHet, totRstStd, totHet, totHetTot, totVar, 
			totVarTot, totAll, totAllTot, totRan, totRanTot, totMax, totMaxTot, totEnt, totEntTot);
	}
	fclose(outfile);
}
