WillerZ
From XMLTV
Welcome to my user page!
[edit]
utf8fix
This is a program that replaces invalid UTF-8 characters with '?' characters. Sorry it's in C and not Perl. I use it because tv_grab_uk_rt often produces invalid UTF-8 characters in the descriptions of shows (probably a problem with the source data). Save it as utf8fix.c, then type 'make utf8fix'. Once built, 'utf8fix <filename>' will replace invalid characters in <filename> in place (the file itself is modified). It only deals with some classes of invalid character.
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <stdlib.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>
#include <inttypes.h>
/*
 * Statistics slots: index 0 counts invalid sequences, indices 1..6 count
 * well-formed sequences of that many bytes.  Seven slots are required because
 * six-byte sequences are tallied in slot 6.
 */
enum { UTF8FIX_SLOTS = 7 };

/*
 * Return the total sequence length announced by a lead byte (1..6, following
 * the original up-to-six-byte UTF-8 layout), or 0 for a byte that cannot
 * start a sequence (0xFE and 0xFF).  Continuation bytes (10xxxxxx) would also
 * yield 0, but the caller filters them out before calling this.
 */
static unsigned utf8fix_lead_length(unsigned char byte)
{
    if ((byte & 0x80) == 0x00)
        return 1;
    if ((byte & 0xE0) == 0xC0)
        return 2;
    if ((byte & 0xF0) == 0xE0)
        return 3;
    if ((byte & 0xF8) == 0xF0)
        return 4;
    if ((byte & 0xFC) == 0xF8)
        return 5;
    if ((byte & 0xFE) == 0xFC)
        return 6;
    return 0;
}

/*
 * Close out the sequence that occupies the charbytes bytes ending just before
 * text[end].  If its actual length differs from the length its lead byte
 * announced, every byte of it is overwritten with '?' and lengths[0] is
 * bumped; otherwise the slot for its length is bumped.
 *
 * charbytes == 0 means no sequence has been accumulated yet (only possible
 * before the first byte of the buffer), so nothing is counted.
 */
static void utf8fix_finish_sequence(char *text, size_t end,
                                    unsigned charbytes, unsigned expectbytes,
                                    size_t *lengths)
{
    if (charbytes == 0)
        return;

    if (charbytes != expectbytes) {
        memset(&text[end - charbytes], '?', charbytes);
        ++lengths[0];
    } else {
        ++lengths[expectbytes];
    }
}

/*
 * utf8fix <filename>
 *
 * Maps <filename> read/write and rewrites it in place: every byte sequence
 * whose length does not match what its lead byte announces (including stray
 * continuation bytes and the bytes 0xFE/0xFF) is replaced by '?' bytes, one
 * per original byte, so the file size never changes.  Only sequence length is
 * validated; overlong encodings, surrogates and out-of-range code points are
 * not detected.
 *
 * A per-length tally is written to stderr.  Returns 0 on success; exits with
 * EXIT_FAILURE (after a diagnostic on stderr) if the argument is missing or
 * the file cannot be opened, inspected or mapped.
 */
int main(int argc, char **argv)
{
    int fd;
    char *text;
    struct stat stats;
    size_t len;
    size_t idx;
    unsigned charbytes = 0;   /* bytes seen so far in the current sequence */
    unsigned expectbytes = 0; /* length announced by the current lead byte */
    size_t lengths[UTF8FIX_SLOTS] = {0};

    if (argc < 2) {
        fprintf(stderr, "usage: %s <filename>\n",
                (argc > 0 && argv[0] != NULL) ? argv[0] : "utf8fix");
        exit(EXIT_FAILURE);
    }

    fd = open(argv[1], O_RDWR | O_NOCTTY);
    if (fd < 0) {
        perror(argv[1]);
        exit(EXIT_FAILURE);
    }
    if (fstat(fd, &stats)) {
        perror("fstat");
        close(fd);
        exit(EXIT_FAILURE);
    }
    /* mmap takes a size_t; refuse sizes that would be silently truncated. */
    if (stats.st_size < 0 || (uintmax_t)stats.st_size > SIZE_MAX) {
        fprintf(stderr, "%s: file too large to map\n", argv[1]);
        close(fd);
        exit(EXIT_FAILURE);
    }
    len = (size_t)stats.st_size;

    if (len == 0) {
        /* mmap rejects a zero length; an empty file has nothing to fix. */
        close(fd);
    } else {
        text = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (text == MAP_FAILED) {
            perror("mmap");
            close(fd);
            exit(EXIT_FAILURE);
        }
        /* The mapping stays valid after the descriptor is closed. */
        close(fd);

        for (idx = 0; idx < len; ++idx) {
            unsigned char byte = (unsigned char)text[idx];

            if ((byte & 0xC0) != 0x80) {
                /* Any non-continuation byte ends the previous sequence and
                 * starts a new one. */
                utf8fix_finish_sequence(text, idx, charbytes, expectbytes,
                                        lengths);
                charbytes = 0;
                expectbytes = utf8fix_lead_length(byte);
            }
            /* Continuation bytes simply extend the current sequence. */
            ++charbytes;
        }
        /* Account for the sequence that runs up to end of file. */
        utf8fix_finish_sequence(text, len, charbytes, expectbytes, lengths);

        if (munmap(text, len) != 0) {
            perror("munmap");
            exit(EXIT_FAILURE);
        }
    }

    fprintf( stderr
           , "Statistics:\n"
             " Single-byte characters: %8zu\n"
             " Two-byte characters: %8zu\n"
             " Three-byte characters: %8zu\n"
             " Four-byte characters: %8zu\n"
             " Five-byte characters: %8zu\n"
             " Six-byte characters: %8zu\n"
             " Invalid characters: %8zu\n"
           , lengths[1]
           , lengths[2]
           , lengths[3]
           , lengths[4]
           , lengths[5]
           , lengths[6]
           , lengths[0]
           );
    return 0;
}
