///////////////////////////////////////////////////////////////////
//
//		Functions used by e-PCR to read FASTA files. 
//
///////////////////////////////////////////////////////////////////
// $Id: fasta-io.cpp,v 1.1 2003/03/31 21:08:27 rotmistr Exp $
///////////////////////////////////////////////////////////////////

#include "fasta-io.h"
#include "gdsutil.h"


char chrValidAa[] = "ABCDEFGHIKLMNPQRSTUVWXZ-*";
char chrValidNt[] = "TCAGNBDHKMRSVWY-";  
char stdCode[] = "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG";

static char no_name[] = "????????";
static char no_title[] = "NO TITLE";
static char no_defline[] = "???????? NO TITLE";


/*

  want to add the ability to write mrna, CDS, and protein FASTA files

    name "Standard" ,
  name "SGC0" ,
  id 1 ,
  ncbieaa  "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
  sncbieaa "---M---------------M---------------M----------------------------"
  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG


*/


///////////////////////////////////////////////////////////////
//
//
//		class FastaSeq
//
//

FastaSeq::FastaSeq ()
{
	m_tag = NULL;
	m_acc = NULL;
	m_def = NULL;
	m_seq = NULL; 
	m_len = 0;
}


FastaSeq::FastaSeq (char *def, char *seq)
{
	m_tag = NULL;
	m_acc = NULL;
	m_def = NULL;
	m_seq = NULL; 
	SetDefline(def);
	SetSequence(seq);
}


FastaSeq::~FastaSeq ()
{
	Clear(); 
}


const char * FastaSeq::Label() const
{
	return (m_tag==NULL) ? no_name : m_tag; 
}


const char * FastaSeq::Accession() const
{
	return (m_acc==NULL) ? no_name : m_acc; 
}


const char * FastaSeq::Defline () const
{
	return (m_def==NULL) ? no_defline : m_def; 
}


const char * FastaSeq::Title () const
{
	if (m_def==NULL) return no_title;
	char *p;
	if (p = strchr(m_def, ' '))
	{
		while (*p && isspace(*p))  p++;
		if (*p) return p;
	}
	return m_def; 
}


const char * FastaSeq::Sequence () const
{
	return m_seq; 
}


int FastaSeq::Length() const
{
	return m_len;
}


void FastaSeq::SetDefline (char *string)
{
	MemDealloc(m_tag);
	MemDealloc(m_acc);
	MemDealloc(m_def);
	m_def = string;
	if (string)
	{
		char *p = string;
		if (*p == ' ') p++;
		int n = strcspn(p," ");
		if (MemAlloc(m_tag,1+n))
		{
			memcpy((void*)m_tag,p,n);
			m_tag[n] = 0;
			if (strncmp(m_tag,"gi|",3) ==0)
			{
				p = m_tag+3;
				while (*p && *p != '|')  p++;
				p++;
				while (*p && *p != '|')  p++;
				p++;
				n = strlen(p);
				MemAlloc(m_acc,1+n);
				memcpy((void*)m_acc,p,n);
				m_acc[n] = 0;
				p = m_acc;
				while (*p)
				{
					if (*p == '|') { *p=0; break; }
					p++;
				}
			}
			else
			{
				MemAlloc(m_acc,1+n);
				strcpy(m_acc,m_tag);
			}
		}
	}
}


void FastaSeq::SetSequence (char *string)
{
	MemDealloc(m_seq); 
	m_seq = string; 
	m_len = string ? strlen(string) : 0;
}


void FastaSeq::Clear()
{
	SetDefline(NULL); 
	SetSequence(NULL); 
}


bool FastaSeq::ParseText (const char *text, int upper, const char *alphabet)
{
	if (text==NULL || *text==0)
		return FALSE;

	
	const char *p2 = text;
	char *defline = NULL;
	char *sequence = NULL;
	if (*p2 == '>')
	{
		int n = strcspn((char*)text,"\r\n");
		p2 = text + n;
		n--;
		const char *p = text+1;
		while (p<p2 && isspace(*p))
		{
			p++;
			n--;
		}
		if (n>0 && MemAlloc(defline,1+n))
		{
			memcpy((void*)defline,(void*)p,n);
			defline[n] =0;
		}
	}
	
	if (MemAlloc(sequence,1+strlen(p2)))
	{
		char *p1 = sequence;
		char chr, chr2;
		while (*p2)
		{
			chr = *p2;
			if (isalpha(chr))
			{
				chr2 = toupper(chr);
				if (upper) 
					chr = chr2;
				if (strchr(alphabet,chr2))
					*p1++ = chr;
			}
			p2++;
		}
		*p1 = 0;		
		SetDefline(defline);
		SetSequence(sequence);
	}
	//return (m_len==0) ? FALSE : TRUE;
	return TRUE;
}


bool FastaSeq::ParseText (const char *text, int upper, int seqtype)
{
	if (seqtype == SEQTYPE_AA)
		return ParseText(text,upper,chrValidAa);

	if (seqtype == SEQTYPE_NT)
		return ParseText(text,upper,chrValidNt);

	PrintError("ParseText(text,code);   ERROR: Invalid code");
	return FALSE;
}



///////////////////////////////////////////////////////////////
//
//
//		class FastaFile
//
//

struct sidx
{
	char *acc;
	fpos_t fpos;
};


FastaFile::FastaFile ()
{
	m_file_name = NULL;
	m_file = NULL;
	m_seqtype = SEQTYPE_NT;
	m_index = NULL;
	m_count = 0;
	m_toupper = 0;
}


FastaFile::FastaFile (int seqtype)
{
	m_file_name = NULL;
	m_file = NULL;
	m_seqtype = seqtype;
	m_index = NULL;
	m_count = 0;
	m_toupper = 0;
}


FastaFile::~FastaFile ()
{
	if (m_file)  Close();
}


bool FastaFile::Open (const char *fname, const char *fmode)
{
	if (!fmode) return FALSE;

	if (m_file != NULL)
	{
		PrintError("FastaFile::Open();  WARNING: file already open");
		return FALSE;
	}

	if (fname)
	{
		m_file = fopen(fname,fmode);
	}
	else
	{
		m_file = (*fmode == 'w') ? stdout : stdin;
	}
	
	if (m_file == NULL)
	{
		PrintError("FastaFile::Open();  ERROR: unable to open file");
		return FALSE;
	}

	m_file_name = newstring(fname);
	return TRUE;
}


bool FastaFile::Close ()
{
	if (m_file == NULL)
	{
		PrintError("FastaFile::Close();  WARNING: file already closed");
		return FALSE;
	}

	if (m_file != stdin && m_file != stdout)  fclose(m_file);

	m_file = NULL;
	delete m_file_name;
	m_file_name =NULL;
	return TRUE;
}



int FastaFile::Attach (FILE *file)
{
	m_file = file;
	return 1;
}


FILE* FastaFile::Detach ()
{
	FILE *f = m_file;
	m_file = NULL;
	return f;
}


/****

long FastaFile::Tell ()
{
	return ftell(m_file);
}


int  FastaFile::Seek (long pos, int whence)
{
	if (fseek(m_file,pos,whence) !=0)
	{
		fprintf(stderr,"FastaFile::Seek() ERROR\n");
		return 0;
	}
	return 1;
}

****/


int  FastaFile::SetFilePos (fpos_t &fpos)
{
	if (fsetpos(m_file,&fpos) !=0)
	{
		fprintf(stderr,"FastaFile::SetFilePos() ERROR\n");
		return 0;
	}
	return 1;
}


int  FastaFile::GetFilePos (fpos_t &fpos)
{
	if (fgetpos(m_file,&fpos) !=0)
	{
		fprintf(stderr,"FastaFile::GetFilePos() ERROR\n");
		return 0;
	}
	return 1;
}


extern "C" int sidx_cmp (const void *ptr1, const void *ptr2)
{
	struct sidx *s1 = (struct sidx*) ptr1;
	struct sidx *s2 = (struct sidx*) ptr2;
	return strcmp(s1->acc,s2->acc);
}


int  FastaFile::BuildSeqIndex ()
{
	int slots = 1024;
	int cnt = 0;
	sidx *arr;

	MemAlloc(arr,slots*sizeof(struct sidx));

	char tmp1[48];
	char tmp2[16];
	FastaSeq faseq;
	fpos_t fpos;
	if (!GetFilePos(fpos)) fprintf(stderr,"screwed\n");

	while (Read(faseq))
	{
		if (cnt >= slots-1)
		{
			slots *= 2;
			if (!MemResize(arr,slots*sizeof(struct sidx)))
			{
				fprintf(stderr,"Out of memory\n");
				exit(0);
			}
		}

		strcpy(tmp1,faseq.SeqIdent());
		ExtractSeqAcc(tmp1,tmp2);

		arr[cnt].acc = newstring(tmp1);
		arr[cnt].fpos = fpos;
		cnt++;

		if (strcmp(tmp1,tmp2)!=0)
		{
		arr[cnt].acc = newstring(tmp2);
		arr[cnt].fpos = fpos;
		cnt++;
		}

		if (!GetFilePos(fpos)) 
			fprintf(stderr,"FastaFile::BuildSeqIndex() -- can't get file offset\n");
	}

	m_count = cnt;
	m_index = (void*)arr;

	qsort(m_index,m_count,sizeof(struct sidx),sidx_cmp);

	return m_count;
}



int FastaFile::GetSeqOffset (const char *acc, fpos_t &fpos)
{
	if (!acc) return 0;

	if (!m_index)
	{
		fprintf(stderr,"FastaFile::GetSeqOffset() -- no index (programmer error)\n");
		return 0;
	}

	sidx srec;
	srec.acc = (char*)acc;
	sidx *s = (struct sidx*) bsearch((void*)&srec,m_index,m_count,sizeof(struct sidx),sidx_cmp);
	if (s)
	{
		fpos = s->fpos;
		return 1;
	}
	return 0;
}



#define CHUNK 10000


bool FastaFile::Read (FastaSeq *&faseq)
{
	faseq = new FastaSeq();
	return Read(*faseq);
}


bool FastaFile::Read (FastaSeq &faseq)
{
	faseq.Clear();

	if (!IsOpen())
		return FALSE;

	char line[2000];

	if (!fgets(line, sizeof line, m_file))
		return FALSE;  

	if (line[0] != '>')
	{
		PrintError("FastaFile::Read();  ERROR: was expecting '>'");
		return FALSE;
	}

	int len = strlen(line);
	int buf_total = 1 + len + CHUNK;
	char *buf;

	if (!MemAlloc(buf,buf_total))
		return FALSE;
	
	strcpy(buf,line);
	int buf_used = len;
	char *p = buf+len;

	//long offset = ftell(m_file);
	fpos_t fpos;
	if (!GetFilePos(fpos))  fprintf(stderr,"screwed\n");

	while (fgets(line, sizeof line, m_file))
	{
		if (line[0] == '>')
		{
			if (!SetFilePos(fpos)) { fprintf(stderr,"screwed\n"); }
			//fseek(m_file,offset,SEEK_SET);
			break;
		}
		len = strlen(line);
		if (buf_used +len +1 > buf_total)
		{
			buf_total += CHUNK;
			if (!MemResize(buf,buf_total))
			{
				PrintError("Out of memory");
				MemDealloc(buf);
				return FALSE;
			}
			p = buf + buf_used;
		}
		strcpy(p,line);
		buf_used += len;
		p += len;
		//offset = ftell(m_file);
		if (!GetFilePos(fpos))  fprintf(stderr,"screwed\n");
	}

	bool result = faseq.ParseText(buf,m_toupper,m_seqtype);

	MemDealloc(buf);
	return result;
}


int FastaFile::Write (FastaSeq &seq)
{
	if (!IsOpen())
		return 0;

	fprintf(m_file,">%s\n", seq.Defline());
	WriteSeqLines(m_file,seq.Sequence(),seq.Length());
	return 1;
}


int WriteSeqLines (FILE *fd, const char *seq, int len, int linelen)
{
	char line[1+SEQLINE_LEN_MAX], *p1, ch;
	const char *p2 =seq;
	int i;

	///// force linelen to valid range
	if (linelen > SEQLINE_LEN_MAX)
		linelen = SEQLINE_LEN_MAX;
	else if (linelen < SEQLINE_LEN_MIN)
		linelen = SEQLINE_LEN_MIN;


	while (len >0)
	{
		p1 = line;
		for (i=0; i<len && i<linelen; ++i)
		{
			if ((ch = *p2++) ==0)
				break;
			*p1++ = ch;
		}
		*p1 = 0;
		if (fprintf(fd,"%s\n",line) <0)
			return 0;;
		len -= i;
	}
	return 1;
}



///////////////////////////////////////////////////////////////////////////////

char * ExtractSeqAcc (const char *seqid, char *acc)
{
	*acc = 0;

	const char *p;

	if (	(p=strstr(seqid,"lcl|")) ||
			(p=strstr(seqid,"ref|")) ||
			(p=strstr(seqid,"emb|")) ||
			(p=strstr(seqid,"dbj|")) )
		p += 4;
	else if ( p=strstr(seqid,"gb|") )
		p += 3;
	else 
		return NULL;

	/*****
	const char *p = seqid;
	if (strncmp(p,"gi|",3)==0)  
	{
		if (p=strchr(p+3,'|')) p++;
		else return NULL;
	}
	else if (strncmp(p,"lcl|",4)==0 || strncmp(p,"ref|",4)==0)
	{
		p += 4;
	}
	*****/

	char *p2 = acc;
	while (*p && *p !='|')  *p2++ = *p++;
	*p2 =0;
	return acc;
}


char * ExtractSeqName (const char *seqid, char *name)
{
	*name = 0;

	const char *p = seqid;

	if (	(p=strstr(seqid,"lcl|")) ||
			(p=strstr(seqid,"ref|")) ||
			(p=strstr(seqid,"emb|")) ||
			(p=strstr(seqid,"dbj|")) )
		p += 4;
	else if ( p=strstr(seqid,"gb|") )
		p += 3;
	else
	{
		strcpy(name,seqid);
		return name;
	}

	if (! (p = strchr(p,'|')) ) return NULL;
	p++;

	char *p2 = name;
	while (*p && *p != '|')  *p2++ = *p++;
	*p2 = 0;
	return name;
}


///////////////////////////////////////////////////////////////////////////////


/* initialize for computation of reverse complements */
static char compbase[128];

void init_revcomp()
{
	compbase['A'] = 'T';
	compbase['C'] = 'G';
	compbase['G'] = 'C';
	compbase['T'] = 'A';

	/* ambiguity code */
	compbase['B'] = 'V';	/* B = C, G or T */
	compbase['D'] = 'H';	/* D = A, G or T */
	compbase['H'] = 'D';	/* H = A, C or T */
	compbase['K'] = 'M';	/* K = G or T */
	compbase['M'] = 'K';	/* M = A or C */
	compbase['N'] = 'N';	/* N = A, C, G or T */
	compbase['R'] = 'Y';	/* R = A or G (purines) */
	compbase['S'] = 'S';	/* S = C or G */
	compbase['V'] = 'B';	/* V = A, C or G */
	compbase['W'] = 'W';	/* W = A or T */
	compbase['X'] = 'X';	/* X = A, C, G or T */
	compbase['Y'] = 'R';	/* Y = C or T (pyrimidines) */
}

/* form the reverse complement of a sequence */
void revcomp(char *s, int length)
{
	char *t, save;

	for (t = s + length - 1; s <= t; ++s, --t) {
		save = *s;
		*s = compbase[*t];
		*t = compbase[save];
	}
}




int seq_hash (long &hash, const char *seq, int len)
{
	if (seq==NULL) return 0;
	if (len <= 0) len = strlen(seq);

	const char *p = seq;

	char ch;
	unsigned long b, h = 0;

	int i;

	for (i=0; i<len; i++)
	{
		h <<= 2;                     // left-shift by 2 bits

		ch = *p++;                   // get the next base, convet to 2 bit code
		if (ch=='C')      b = 1;     // C -> 1  (binary 01)
		else if (ch=='A') b = 2;     // A -> 2  (binary 10)
		else if (ch=='G') b = 3;     // G -> 3  (binary 11)
		else ch = 0;                 // T -> 0  (binary 00) and any other character
	
		h |= b;                      // OR the 2-bit base code at positions 1 and 2

		b = (h & 0xC0000000) >> 30;  // extract bits 31 and 32
		h ^= b;                      // XOR those bits into positions 1 and 2
		h &= 0x3FFFFFFF;             // clear bits 31 and 32
	}

	
	hash |= 0x40000000;              // set bit 31 (so valid hash can never be zero)
	hash = (long)h;

	return 1;
}



/*
 * $Log: fasta-io.cpp,v $
 * Revision 1.1  2003/03/31 21:08:27  rotmistr
 * Imported to CVS
 * Compilable with gcc
 *
 */
