#!/bin/sh
#
# The following code is based on the work of Clark Cooper
# and Larry Wall in Expat.xs. The original C function
# "XML_LoadEncoding" was reimplemented in GAWK.
#
# This code assumes
#   sizeof(char)  = 1
#   sizeof(short) = 2
#   sizeof(int)   = 4
# sizes of signed and unsigned types are identical
#
# Stefan Tramm / March 2005

# AWK must be defined in the environment: it holds the command used to run
# the AWK program further down in this script.
echo "$0: AWK=$AWK"
# $AWK is deliberately left unquoted so that a value carrying options is
# split into command and arguments.  An empty value is rejected explicitly
# instead of relying on the shell failing to find a command named after the
# AWK program text.  The diagnostic goes to stderr.
if [ -z "${AWK:-}" ] || ! $AWK 'BEGIN {exit 0}'; then
  echo "Error: AWK executable must be defined in the environment." >&2
  exit 1
fi

# Create (or truncate) xml_enc_tables.inc with the banner line.
printf '%s\n' '/* generated code -- do not modify manually */' > xml_enc_tables.inc

# Create (or truncate) xml_enc_registry.inc with the banner line followed by
# the opening of the encs[] array.
printf '%s\n' \
  '/* generated code -- do not modify manually */' \
  'struct encoding_registry_t { const char *name; Encinfo *enc; } encs[]={' \
  > xml_enc_registry.inc

# The encodings are stored in the source directory, next to this script.
# Each *.enc file is dumped by od as octal bytes and decoded by the AWK
# program below, which prints C tables on stdout and appends the matching
# registry entries to xml_enc_registry.inc.
for i in "$(dirname "$0")"/encodings/*.enc
do
  # An unmatched glob is left as the literal pattern; report it instead of
  # feeding a non-existent file into the pipeline.
  if [ ! -f "$i" ]; then
    echo "$0: no encoding file found: $i" >&2
    continue
  fi
  # $AWK stays unquoted so that a value such as "gawk --posix" is still split
  # into command and options.
  od -vb "$i" | $AWK -v FILE="$i" '
# Input is the output of od -vb: each line is an offset followed by bytes
# written as octal numbers.  All bytes are collected in m[0..max-1]; the END
# rule decodes them and prints the C tables.

# Return the byte at offset offs.  A read at or past the end of the
# collected data aborts the program with exit status 9.
function rdb(offs) {
  if (offs >= max) { print "INTERNAL ERROR; exit 9" > "/dev/stderr"; exit 9 }
  return m[offs]
}
# Return the 16-bit value stored at offs, most significant byte first.
function rdw(offs) {
  return rdb(offs) * 256 + rdb(offs+1)
}
# Return the 32-bit value stored at offs, most significant byte first.
# The result is never negative.
function rdl(offs) {
  return rdw(offs) * 65536 + rdw(offs+2)
}
# Convert the string str to a number.  Recognized forms are octal (leading
# 0), hexadecimal (leading 0x or 0X) and decimal with optional fraction and
# exponent.  Any other string yields the string "NOT-A-NUMBER".
# The names after str are local variables; i in particular must stay local
# because the calling rule loops on a global i.
function mystrtonum(str,        ret, n, i, k, c) {
  if (str ~ /^0[0-7]*$/) {
    # octal
    n = length(str)
    ret = 0
    for (i = 1; i <= n; i++) {
      c = substr(str, i, 1)
      if ((k = index("01234567", c)) > 0)
        k-- # adjust for 1-basing in awk

      ret = ret * 8 + k
    }
  } else if (str ~ /^0[xX][0-9a-fA-F]+$/) {
    # hexadecimal; the pattern is anchored at both ends so that a string
    # with trailing non-hex characters is rejected instead of having those
    # characters counted as zero digits
    str = substr(str, 3)    # lop off leading 0x
    n = length(str)
    ret = 0
    for (i = 1; i <= n; i++) {
      c = tolower(substr(str, i, 1))
      if ((k = index("0123456789", c)) > 0)
        k-- # adjust for 1-basing in awk
      else if ((k = index("abcdef", c)) > 0)
        k += 9

      ret = ret * 16 + k
    }
  } else if (str ~ /^[-+]?([0-9]+([.][0-9]*([Ee][0-9]+)?)?|([.][0-9]+([Ee][-+]?[0-9]+)?))$/) {
    # decimal number, possibly floating point
    ret = str + 0
  } else
    ret = "NOT-A-NUMBER"

  return ret
}
# NOTE(review): OFMT only affects print of non-integral numbers; presumably
# set as a safeguard against exponent notation -- confirm.
BEGIN { max = 0; OFMT="%11d" }
# Field 1 is the offset column; every following field is one byte in octal.
# The prepended "0" makes mystrtonum() take its octal branch.
      { for (i = 2; i <= NF; i++) { m[max++] = mystrtonum("0" $i) } }
END   {
  # check the magic
  offs=0
  #                0xfeeb                    0xface
  if ((rdw(offs) != 65259) || (rdw(offs+2) != 64206)) {
    print FILE, "is not a proper encoding file" > "/dev/stderr"
    exit 1
  }
  offs+=4
  # fetch encoding name: a 40 byte field, read up to the first zero byte
  name=""
  for (i = 0; i < 40; i++) {
    if (rdb(offs+i) == 0) break
    name = name sprintf("%c", rdb(offs+i))
  }
  offs+=40
  # fetch the numbers: count of prefix records, then count of bytemap entries
  pfsize = rdw(offs)
  offs+=2
  bmsize = rdw(offs)
  offs+=2
  # fetch the firstmap: 256 values of 32 bits each
  for (i = 0; i < 256; i++) firstmap[i] = rdl(offs+i*4)
  offs+=256*4
  # fetch prefixes; one record is min(1) len(1) bmstart(2) ispfx(32) ischar(32)
  sizeof_pf = 4 + 32 + 32
  bmoffs = offs + pfsize * sizeof_pf
  for (p = 0; p < pfsize; p++) {
     pmap[p, "min"] = rdb(offs)
     pmap[p, "len"] = rdb(offs+1)
     pmap[p, "bmstart"] = rdw(offs+2)
     for (i = 0; i < 32; i++) pmap[p, "ispfx", i]  = rdb(offs+4+i)
     for (i = 0; i < 32; i++) pmap[p, "ischar", i] = rdb(offs+4+32+i)
     offs += sizeof_pf
  }
  # sanity check: the prefix records must end where the bytemap begins
  if (offs != bmoffs) { print "INTERNAL ERROR; exit 9" > "/dev/stderr"; exit 9 }
  # fetch bytemap: bmsize values of 16 bits each
  for (i = 0; i < bmsize; i++) bm[i] = rdw(offs + i * 2)
  #############################################################
  # generate output
  print "/* encoding for", name, "found in file", FILE, "*/"
  # C identifier prefix: lower-cased name with every "-" replaced by "_"
  cname=tolower(name)
  gsub(/-/, "_", cname)
  # print the bytemap
  print "unsigned short " cname "_bm[" bmsize "]={"
  for (i = 0; i < bmsize; i++) printf("  0x%04x%s\n", bm[i], i<bmsize-1 ? "," : "")
  print "};"
  # print the prefixes
  print "PrefixMap " cname "_pm[" pfsize "]={"
  for (p = 0; p < pfsize; p++) {
    printf("  { 0x%02x, 0x%02x, 0x%04x, {", pmap[p, "min"], pmap[p, "len"], pmap[p, "bmstart"])
    for (i = 0; i < 32; i++) { printf(" 0x%02x%s", pmap[p, "ispfx", i], i<31 ? "," : "") }
    printf(" }, {")
    for (i = 0; i < 32; i++) { printf(" 0x%02x%s", pmap[p, "ischar", i], i<31 ? "," : "") }
    print " } }" (p<pfsize-1 ? "," : "")
  }
  print "};"
  # and now print the Encinfo; each firstmap value is written as two 16-bit
  # halves, with int() making the truncation of the upper half explicit
  print "Encinfo " cname "_enc={"
  printf("  %s, %s, {\n", pfsize, bmsize)
  for (i = 0; i < 256; i++) printf("    0x%04x%04x%s\n", int(firstmap[i]/65536), (firstmap[i] % 65536), i<255 ? "," : "")
  printf("  }, %s, %s", cname "_pm", cname "_bm")
  print "};"
  # register the encoding
  printf("  {\"%s\", &%s},\n", toupper(name), cname "_enc") >> "xml_enc_registry.inc"
  # register some often used aliases
  if (toupper(name) == "X-EUC-JP-UNICODE")
    printf("  {\"%s\", &%s},\n", "EUC-JP", cname "_enc") >> "xml_enc_registry.inc"
  if (toupper(name) == "X-SJIS-CP932")
    printf("  {\"%s\", &%s},\n", "SHIFT_JIS", cname "_enc") >> "xml_enc_registry.inc"

}
'
# Append: xml_enc_tables.inc already holds the banner line written earlier in
# this script, and a plain ">" here would truncate the file and lose it.
done >> xml_enc_tables.inc
# Finish xml_enc_registry.inc: append the {0, 0} entry and the closing brace
# of the array.
printf '%s\n' '  {0, 0}' '};' >> xml_enc_registry.inc

# vim: set ft=awk :
