User:WillerZ

From XMLTV
Revision as of 16:56, 2 September 2010 by Dekarl (Talk | contribs) (utf8fix: prepare for grabber articles)

(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)
Jump to: navigation, search

Welcome to my user page!

utf8fix

This is a program that replaces the bytes of invalid UTF-8 sequences with ? characters. Sorry it's in C and not Perl. I use it because tv_grab_uk_rt often produces invalid UTF-8 characters in the descriptions of shows (probably a problem with the source data). Save the code below as utf8fix.c, then type 'make utf8fix'. Once built, 'utf8fix <filename>' will replace invalid characters in <filename> in-place and print per-length character statistics to stderr. It only deals with some classes of invalid character.

#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <stdlib.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>
#include <inttypes.h>

/*
 * Size of the statistics table: slot 0 counts invalid sequences, slots 1..6
 * count well-formed sequences of that many bytes.  It must have 7 entries
 * because six-byte sequences are tallied in slot 6.
 */
enum { UTF8FIX_SLOTS = 7 };

/*
 * Return the total sequence length announced by a non-continuation byte:
 * 1 for ASCII, 2..6 for the corresponding lead-byte patterns, and 0 for a
 * byte that matches none of them (0xFE / 0xFF).  A return of 0 can never
 * equal the actual byte count, so such bytes always end up replaced.
 */
static size_t utf8fix_lead_length(unsigned char byte)
{
  if ((byte & 0x80) == 0x00)
    return 1;
  if ((byte & 0xE0) == 0xC0)
    return 2;
  if ((byte & 0xF0) == 0xE0)
    return 3;
  if ((byte & 0xF8) == 0xF0)
    return 4;
  if ((byte & 0xFC) == 0xF8)
    return 5;
  if ((byte & 0xFE) == 0xFC)
    return 6;
  return 0;
}

/*
 * Close off the sequence of `charbytes` bytes that ends just before
 * text[end].  If its length differs from the length its lead byte announced
 * (`expectbytes`), every byte of it is overwritten with '?' and it is
 * counted as invalid; otherwise it is counted under its length.
 * Nothing happens when no bytes are pending (start of input / empty input).
 */
static void utf8fix_finish_sequence(unsigned char *text, size_t end,
                                    size_t charbytes, size_t expectbytes,
                                    size_t *lengths)
{
  if (charbytes == 0)
    return;

  if (charbytes != expectbytes)
  {
    memset(&text[end - charbytes], '?', charbytes);
    ++lengths[0];
  }
  else
    ++lengths[expectbytes];
}

/*
 * Walk `size` bytes of `text`, replacing malformed sequences in place and
 * accumulating per-length counts into `lengths` (UTF8FIX_SLOTS entries).
 * Only the sequence length is validated: a lead byte must be followed by
 * exactly the number of continuation bytes it announces.
 */
static void utf8fix_scan(unsigned char *text, size_t size, size_t *lengths)
{
  size_t charbytes = 0;   /* bytes seen so far in the current sequence   */
  size_t expectbytes = 0; /* length announced by the current lead byte   */

  for (size_t idx = 0; idx < size; ++idx)
  {
    /* Anything that is not a continuation byte starts a new sequence. */
    if ((text[idx] & 0xC0) != 0x80)
    {
      utf8fix_finish_sequence(text, idx, charbytes, expectbytes, lengths);
      charbytes = 0;
      expectbytes = utf8fix_lead_length(text[idx]);
    }
    ++charbytes;
  }
  /* The last sequence is terminated by end-of-input. */
  utf8fix_finish_sequence(text, size, charbytes, expectbytes, lengths);
}

/*
 * utf8fix <filename>
 *
 * Maps <filename> read/write, replaces each byte of every malformed UTF-8
 * sequence with '?' in place, and prints per-length statistics to stderr.
 * Returns 0 on success; exits with EXIT_FAILURE (after printing a
 * diagnostic) if the argument is missing or the file cannot be opened,
 * stat'ed or mapped.  An empty file is valid input and is left untouched.
 */
int main(int argc, char **argv)
{
  int fd;
  struct stat stats;
  size_t size;
  size_t lengths[UTF8FIX_SLOTS] = {0};

  if (argc < 2)
  {
    fprintf(stderr, "usage: %s <filename>\n", argc > 0 ? argv[0] : "utf8fix");
    exit(EXIT_FAILURE);
  }

  fd = open(argv[1], O_RDWR | O_NOCTTY);
  if (fd < 0)
  {
    perror(argv[1]);
    exit(EXIT_FAILURE);
  }

  if (fstat(fd, &stats))
  {
    perror("fstat");
    close(fd);
    exit(EXIT_FAILURE);
  }

  size = (size_t)stats.st_size;

  /* mmap() rejects a zero length, so an empty file skips the mapping. */
  if (size > 0)
  {
    void *map = mmap( NULL
                    , size
                    , PROT_READ | PROT_WRITE
                    , MAP_SHARED
                    , fd
                    , 0
                    );
    if (map == MAP_FAILED)
    {
      perror("mmap");
      close(fd);
      exit(EXIT_FAILURE);
    }

    /* The mapping stays valid after the descriptor is closed. */
    close(fd);

    utf8fix_scan(map, size, lengths);

    if (munmap(map, size))
      perror("munmap");
  }
  else
    close(fd);

  fprintf( stderr
         , "Statistics:\n"
           "  Single-byte characters: %8zu\n"
           "  Two-byte characters:    %8zu\n"
           "  Three-byte characters:  %8zu\n"
           "  Four-byte characters:   %8zu\n"
           "  Five-byte characters:   %8zu\n"
           "  Six-byte characters:    %8zu\n"
           "  Invalid characters:     %8zu\n"
         , lengths[1]
         , lengths[2]
         , lengths[3]
         , lengths[4]
         , lengths[5]
         , lengths[6]
         , lengths[0]
         );
  return 0;
}