/*
 * file:  utf8.l
 *
 * required flex option:
 *
 *  -8 (generate 8bit scanner)
 */

%{

#ifdef __MSDOS__
#  include <dir.h>
#  include <fcntl.h>
#  include <io.h>
#else
#  include <unistd.h>
#endif
#include <errno.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int
    printcode = 0;

char
    *programname;

void
    get_programname (char const *argv0),
    syntax (void),
    errit (char const *format, ...),
    bytes2 (void),
    bytes3 (void),
    bytes4 (void),
    bytes5 (void),
    bytes6 (void),
    outchar (long unsigned);

/*
 * Scanner configuration macros, defined before the generated scanner
 * code is emitted.  NOTE(review): their exact effect depends on the
 * flex skeleton in use -- confirm against the flex version you build
 * with.
 */
#define YY_SKIP_YYWRAP
#define YY_NO_UNPUT

/* Drop any macro named yywrap so the function below is the one used. */
#undef yywrap

/*
 * End-of-input hook called by the scanner.  Always returns 1; per the
 * flex manual a non-zero result means there is no further input.
 */
int yywrap()
{
    return 1;
}

%}

%%

[\300-\337][\200-\277]          { /* lead byte 110xxxxx + 1 continuation byte (10xxxxxx); anything not matched by these rules falls through to the scanner's default rule */ bytes2 (); }
[\340-\357][\200-\277]{2}       { /* lead byte 1110xxxx + 2 continuation bytes */ bytes3 (); }
[\360-\367][\200-\277]{3}       { /* lead byte 11110xxx + 3 continuation bytes */ bytes4 (); }
[\370-\373][\200-\277]{4}       { /* lead byte 111110xx + 4 continuation bytes */ bytes5 (); }
[\374-\375][\200-\277]{5}       { /* lead byte 1111110x + 5 continuation bytes */ bytes6 (); }

%%

/*
 * Decode the two bytes at the start of yytext as one character and
 * pass the resulting code to outchar ().
 */
void bytes2 ()
{
    unsigned
        lead,
        tail;

    lead = (unsigned char) yytext [0];
    tail = (unsigned char) yytext [1];

    /* 5 payload bits from the first byte, 6 from the second */
    outchar (((lead & 0x1F) << 6) | (tail & 0x3F));
}


/*
 * Decode the three bytes at the start of yytext as one character and
 * pass the resulting code to outchar ().
 */
void bytes3 ()
{
    unsigned
        code;
    int
        k;

    /* 4 payload bits from the first byte ... */
    code = (unsigned char) yytext [0] & 0x0F;

    /* ... then 6 more from each of the following bytes */
    for (k = 1; k < 3; k++)
        code = (code << 6) | ((unsigned char) yytext [k] & 0x3F);

    outchar (code);
}

/*
 * Decode the four bytes at the start of yytext as one character and
 * pass the resulting code to outchar ().
 */
void bytes4 ()
{
    long unsigned
        code;
    int
        k;

    /* 3 payload bits from the first byte ... */
    code = (unsigned char) yytext [0] & 0x07;

    /* ... then 6 more from each of the following bytes */
    for (k = 1; k < 4; k++)
        code = (code << 6) | ((unsigned char) yytext [k] & 0x3F);

    outchar (code);
}

/*
 * Decode the five bytes at the start of yytext as one character and
 * pass the resulting code to outchar ().
 */
void bytes5 ()
{
    long unsigned
        code;
    int
        k;

    /* 2 payload bits from the first byte ... */
    code = (unsigned char) yytext [0] & 0x03;

    /* ... then 6 more from each of the following bytes */
    for (k = 1; k < 5; k++)
        code = (code << 6) | ((unsigned char) yytext [k] & 0x3F);

    outchar (code);
}

/*
 * Decode the six bytes at the start of yytext as one character and
 * pass the resulting code to outchar ().
 */
void bytes6 ()
{
    long unsigned
        code;
    int
        k;

    /* 1 payload bit from the first byte ... */
    code = (unsigned char) yytext [0] & 0x01;

    /* ... then 6 more from each of the following bytes */
    for (k = 1; k < 6; k++)
        code = (code << 6) | ((unsigned char) yytext [k] & 0x3F);

    outchar (code);
}

/*
 * Write one decoded character to stdout.
 *
 *   - codes below 256 are written as that single byte (iso-8859-1)
 *   - eight further codes are written as their iso-8859-15 byte
 *   - the IJ/ij ligatures are written as two plain letters
 *   - anything else is written as "U+code" when printcode is set,
 *     otherwise as byte 191
 */
void outchar (long unsigned c)
{
    /* iso-8859-1 */
    if (c <= 0xFF) {
        fputc ((int) c, stdout);
        return;
    }

    switch (c) {
        /* iso-8859-15 */
        case 0x20AC: fputc (0xA4, stdout); return;    /* euro */
        case 0x0160: fputc (0xA6, stdout); return;    /* S caron */
        case 0x0161: fputc (0xA8, stdout); return;    /* s caron */
        case 0x017D: fputc (0xB4, stdout); return;    /* Z caron */
        case 0x017E: fputc (0xB8, stdout); return;    /* z caron */
        case 0x0152: fputc (0xBC, stdout); return;    /* OE ligature */
        case 0x0153: fputc (0xBD, stdout); return;    /* oe ligature */
        case 0x0178: fputc (0xBE, stdout); return;    /* Y diaeresis */

        /* substitutions */
        case 0x0132: fputs ("IJ", stdout); return;
        case 0x0133: fputs ("ij", stdout); return;

        default:
            break;
    }

    /* no representation available */
    if (! printcode) {
        fputc (191, stdout);
        return;
    }

    if (c < 0x10000)
        printf ("U+%04X", (unsigned) c);
    else
        printf ("U+%08lX", c);
}

/*
 * Program entry point.
 *
 * Accepts any number of leading "-c" options followed by at most one
 * file name.  With no file name the scanner reads stdin, unless stdin
 * is a terminal, in which case the usage text is shown.  Returns 0
 * after the scanner has finished.
 */
int main (int argc, char *argv [])
{
    get_programname (argv [0]);

    /* consume every leading "-c" */
    while (argc > 1 && strcmp (argv [1], "-c") == 0) {
        printcode = 1;
        argv++;
        argc--;
    }

    if (argc == 1) {
        /* no file argument: read stdin, but not interactively */
        if (isatty (fileno (stdin)))
            syntax ();
        yyin = stdin;
    } else if (argc == 2) {
        yyin = fopen (argv [1], "r");
        if (yyin == NULL)
            errit ("Opening file \"%s\": %s", argv [1], strerror (errno));
    } else {
        syntax ();
    }

    yylex ();

    if (yyin != stdin)
        fclose (yyin);

    return 0;
}

/*
 * Store the base name of the executable in the global programname.
 *
 * argv0 is the program path as received in argv [0].  Under MSDOS the
 * file-name component is extracted with fnsplit (); elsewhere everything
 * after the last '/' is used, or the whole string if there is no '/'.
 *
 * The name is kept in memory obtained from strdup ().  If that
 * allocation fails the program exits with status 1, because the other
 * diagnostics in this file print programname with "%s".
 */
void get_programname (char const *argv0)
{
    char const
        *base;
#ifdef __MSDOS__
    char
        name [MAXFILE];
#endif

    /*
     * argv [0] may be a null pointer when the program is started with
     * an empty argument vector; fall back to the scanner's own name
     * (taken from this file's name, utf8.l).
     */
    if (argv0 == NULL)
        argv0 = "utf8";

#ifdef __MSDOS__
    fnsplit (argv0, NULL, NULL, name, NULL);
    base = name;
#else   /* unix */
    base = strrchr (argv0, '/');
    base = base ? base + 1 : argv0;
#endif

    programname = strdup (base);
    if (programname == NULL) {
        fprintf (stderr, "\nError %s: Out of memory\n\n", base);
        exit (1);
    }
}

/*
 * Print an error message to stderr and terminate with exit status 1.
 *
 * format and the arguments after it are interpreted as by printf ().
 * The message is prefixed with "Error <programname>: " and surrounded
 * by blank lines.  This function does not return.
 */
void errit (char const *format, ...)
{
    va_list
        list;

    fprintf (stderr, "\nError %s: ", programname);

    va_start (list, format);
    vfprintf (stderr, format, list);
    /* every va_start must be paired with va_end in the same function */
    va_end (list);

    fprintf (stderr, "\n\n");

    exit (1);
}

/*
 * Print the usage text to stderr and terminate with exit status 1.
 * This function does not return.
 */
void syntax ()
{
    /* synopsis line, carrying the program name */
    fprintf (stderr,
             "\n"
             "Syntax: %s [-c] [utf-8 encoded file]\n",
             programname);

    /* description and option list */
    fputs ("\n"
           "The file will be translated to iso-8859-1 *and* iso-8859-15\n"
           "\n"
           "  -c : print U+code for characters not in iso-8859-1/15\n"
           "\n",
           stderr);

    exit (1);
}
