/* This is a lex(1) file, see http://dinosaur.compilertools.net/
	* or http://en.wikipedia.org/wiki/Lex_programming_tool .
	*
	* Compilation on UNIX systems is done by
	*     make risDateAdj
	* On other systems one may need to call lex or flex and cc explicitly:
	*     lex -8 -o risDateAdj.c risDateAdj.l
	*     cc [-std=gnu99] [-s] [-O] -o risDateAdj risDateAdj.c -ll
	*
	* The executable works as a filter and patches the Y1, Y2 and PY fields of RIS
	* files (the standard input) from a
	* Apr., YYYY
	* type produced for example by JSTOR or a
	* YYYY-MM-DD
	* or a
	* YYYY
	* or a
	* YYYY/MM
	* type to the standard defined in http://www.refman.com/support/risformat_intro.asp .
	*
	* TODO:
	* ingentaconnect produces lines of the form PY  -- ///January 2009
	* which also fall into the category of faulty but recoverable inputs.
	* These ought to be handled as well.
	* Richard J. Mathar, 2009-02-11
	*/
%{
#include <stdio.h>
#include <stdlib.h>
#include <strings.h>
#include <sys/types.h>
#include <regex.h>

/* Number of regmatch_t slots handed to regexec(): one for the complete match
* plus one per parenthesized group. The patterns compiled in main() have at
* most four groups, so 7 leaves some head room.
*/
#define Y1PATCH_PMAT_SIZ 7

/* The compiled date patterns. They are set up once in main() and consulted
* by y1patch():
* preg1 - month name, separators, 4-digit year
* preg2 - ISO date YYYY-MM-DD
* preg3 - month name, 1- or 2-digit day, 4-digit year
* preg4 - 4-digit year, optionally followed by a slash and a 1- or 2-digit number
*/
regex_t preg1,
	preg2 ,
	preg3 ,
	preg4 ;
/* Match offsets of the most recent regexec() call in y1patch(). */
regmatch_t pmat[Y1PATCH_PMAT_SIZ] ;

/* Translate a month name into its number in the range 1 to 12.
* Only the first three characters of monam are looked at, and they are
* compared without regard to case against the English abbreviations
* "jan" to "dec". Anything that follows these three characters is ignored.
* @param monam the text that starts with the month name
* @return Return a number from 1 to 12 if the month is recognized, 0 if not.
*/
int name2Month(const char *monam)
{
	static const char *const abbrev[12] = {
		"jan","feb","mar","apr","may","jun",
		"jul","aug","sep","oct","nov","dec"
	} ;
	int idx = 0 ;

	/* walk the table until an abbreviation fits or the table is exhausted */
	while ( idx < 12 && strncasecmp(monam,abbrev[idx],3) != 0 )
		idx++ ;

	return ( idx < 12 ) ? idx+1 : 0 ;
}

/* Generate a YYYY/MM/DD/otherinfo line on output.
* The 6-character tag is copied to yyout verbatim. The remainder of the line
* (the date field) is tried against preg2 (ISO date), preg1 (month name and
* year), preg3 (month name, day and year) and preg4 (year or year/month) in
* this order, and the first pattern that applies decides the output. If none
* applies the field is copied through unchanged.
* @param yytext the original line, including the tag and the <CR><LF>
*/
void y1patch(const char *yytext)
{
	/* the date field, i.e. everything behind the tag and its white space
	*/
	const char *field = yytext+6 ;

	/* the original tag including the mandatory white space copied thru
	*/
	fprintf(yyout,"%.6s",yytext) ;

	/* The patterns are not anchored, so regexec() may report a hit that
	* starts anywhere in the field. The ISO and the year-only branch read
	* the field at fixed offsets from its start and are therefore only
	* taken if the hit starts at offset 0.
	*/
	if( regexec(&preg2,field,Y1PATCH_PMAT_SIZ,pmat,0) == 0 && pmat[0].rm_so == 0 )
	{
		/* The ISO case. Replace dashes by slashes.
		* field is YYYY-MM-DD..., so the month starts at offset 5, the
		* day at offset 8 and the remainder at offset 10.
		*/
		const char *rest = field+10 ;

		/* Drop the single separator behind the day (a blank, a 'T', ...),
		* but never a line terminator or the end of the string, so that
		* the <CR><NL> survive and no byte behind the NUL is read.
		*/
		if ( *rest != '\0' && *rest != '\r' && *rest != '\n' )
			rest++ ;

		fprintf(yyout,"%.4s/%.2s/%.2s/ %s",field,field+5,field+8,rest) ;
	}
	else if( regexec(&preg1,field,Y1PATCH_PMAT_SIZ,pmat,0) == 0 )
	{
		/* pmat[0] all, pmat[1] the month, pmat[2] any intermediate dots and blanks, pmat[3] the year.
		* The month is read from the start of the field. The year consists
		* of four digits by construction of the pattern, so atoi() suffices.
		*/
		const int mon = name2Month(field) ;
		const int yr = atoi(field+pmat[3].rm_so) ;
		if ( mon )
			fprintf(yyout,"%d/%02d// ",yr,mon) ;
		else
			fprintf(yyout,"%d/// ",yr) ;

		/* the original pattern plus any <CR><NL>
		*/
		fprintf(yyout,"%s",field) ;
	}
	else if( regexec(&preg3,field,Y1PATCH_PMAT_SIZ,pmat,0) == 0 )
	{
		/* pmat[0] all, pmat[1] the month, pmat[2] the date, pmat[3] the year.
		*/
		const int mon = name2Month(field) ;
		const int dat = atoi(field+pmat[2].rm_so) ;
		const int yr = atoi(field+pmat[3].rm_so) ;
		if ( mon )
			fprintf(yyout,"%d/%02d/%02d/ ",yr,mon,dat) ;
		else
			fprintf(yyout,"%d//%02d/ ",yr,dat) ;

		/* the original pattern plus any <CR><NL>
		*/
		fprintf(yyout,"%s",field) ;
	}
	else if( regexec(&preg4,field,Y1PATCH_PMAT_SIZ,pmat,0) == 0 && pmat[0].rm_so == 0 )
	{
		/* pmat[0] the year, the optional group and trailing blanks,
		* pmat[1] the optional slash with the month behind it.
		*/
		const int yr = atoi(field) ;
		if ( pmat[1].rm_so >=0 )
		{
			/* the +1 skips the slash in front of the month
			*/
			const int mon = atoi(field+pmat[1].rm_so+1) ;
			fprintf(yyout,"%d/%02d//",yr,mon) ;
		}
		else
			fprintf(yyout,"%d///",yr) ;

		/* the original <CR><NL>
		*/
		fprintf(yyout,"%s",field+pmat[0].rm_eo) ;
	}
	else
	{
		/* give up and copy through
		*/
		fprintf(yyout,"%s",field) ;
	}
}
#undef Y1PATCH_PMAT_SIZ

%}

RISTAG	("Y1  - "|"PY  - "|"Y2  - ")
	/* RISTAG and MONTH are parenthesized explicitly: flex encloses a
	* definition in parentheses when it expands {NAME}, traditional lex
	* substitutes the bare text, which would bind the alternation wrongly.
	*/
DIGIT	[0-9]

	/* the scanner ought to be compiled with the "-i" flag to
	* trigger also on the various upper/lowercase variants of these patterns
	*/
MONTH	("Jan"|"Feb"|"Mar"|"Apr"|"May"|"Jun"|"Jul"|"Aug"|"Sep"|"Oct"|"Nov"|"Dec")

%%

	/* Lines which look correct are copied through as they are.
	* This means they contain no 3-letter month acronyms as substrings.
	*/

	/* If the tag is immediately followed by one of the TLA (three-letter
	* acronyms above), or by a sole 4-digit year, or by a 4-digit year with
	* a slash and one or two digits, or looks like an ISO date we try
	* conversion. All four patterns share the one action below.
	* NOTE(review): because of the ".+" the ISO pattern needs at least one
	* character between the date and the newline, so an ISO date directly
	* followed by a bare newline is not picked up here - confirm that this
	* is intended.
	*/
{RISTAG}{MONTH}.+\n |
{RISTAG}{DIGIT}{4}[[:blank:]]*\r?\n |
{RISTAG}{DIGIT}{4}\/{DIGIT}{1,2}[[:blank:]]*\r?\n |
{RISTAG}{DIGIT}{4}-{DIGIT}{2}-{DIGIT}{2}.+\n {

		/* debugging
		* printf("%d\n",__LINE__) ;
		*/
		/* yytext holds the complete matched line, tag and newline included
		*/
		y1patch(yytext) ;
	}


%%
/* Compile one date pattern and report a failure on stderr.
* @param preg the pattern buffer to be filled by regcomp()
* @param pattern the regular expression
* @param cflags the compilation flags passed on to regcomp()
* @param line the source line of the caller, quoted in the error message
* @return 0 on success, else the non-zero error code of regcomp()
*/
static int compilePattern(regex_t *preg, const char *pattern, int cflags, int line)
{
	const int stat = regcomp(preg,pattern,cflags) ;
	if ( stat )
		fprintf(stderr,"line %d - Internal error %d\n",line,stat) ;
	return stat ;
}

/* Compile the four date patterns, run the scanner from the standard input
* to the standard output and release the patterns again.
* @param argc unused
* @param argv unused
* @return EXIT_SUCCESS, or EXIT_FAILURE if one of the patterns could not be
*   compiled. In the latter case the scanner is not run, because y1patch()
*   must not call regexec() with a pattern buffer that was never compiled.
*/
int main(int argc, char *argv[])
{
	int status = EXIT_FAILURE ;

	(void)argc ;
	(void)argv ;

	/* preg1: at least three characters (matching months supposedly)
	* followed by any comma, dot and white space, and the year
	*/
	if ( compilePattern(&preg1,
		"\\([[:alpha:]]\\{3,\\}\\)\\([[:space:],\\./]*\\)\\([[:digit:]]\\{4\\}\\)\\(.*\\)", REG_ICASE, __LINE__) )
		goto done ;

	/* preg2: the ISO YYYY-MM-DD format
	*/
	if ( compilePattern(&preg2, "[[:digit:]]\\{4\\}-[[:digit:]]\\{2\\}-[[:digit:]]\\{2\\}.*", REG_ICASE, __LINE__) )
		goto free1 ;

	/* preg3: the full month name, 1-or-2 digit date, 4-digit year
	*/
	if ( compilePattern(&preg3,
		"\\([[:alpha:]]\\{3,\\}\\) \\([[:digit:]]\\{1,2\\}\\)[ ,]*\\([[:digit:]]\\{4\\}\\).*", REG_ICASE, __LINE__) )
		goto free2 ;

	/* preg4: a sole 4-digit year, optionally with a slash and 2-digit month
	*/
	if ( compilePattern(&preg4, "[[:digit:]]{4}(/[[:digit:]]{1,2})?[[:blank:]]*", REG_ICASE|REG_EXTENDED, __LINE__) )
		goto free3 ;

	yylex() ;
	status = EXIT_SUCCESS ;

	/* release in reverse order; each label frees only what was compiled
	*/
	regfree(& preg4) ;
free3:
	regfree(& preg3) ;
free2:
	regfree(& preg2) ;
free1:
	regfree(& preg1) ;
done:
	return status ;
}