User:WillerZ
From XMLTV
(Redirected from WillerZ)
Welcome to my user page!
utf8fix
This is a program to replace invalid UTF-8 characters with '?' characters. Sorry it's in C and not Perl. I use it because tv_grab_uk_rt often produces invalid UTF-8 characters in the descriptions of shows (probably a problem with the source data). Save the code below as utf8fix.c, then type 'make utf8fix'. Once built, 'utf8fix <filename>' will replace invalid characters in <filename> in place. It only deals with some classes of invalid character: it checks that each lead byte is followed by the right number of continuation bytes, but not that the encoded value itself is valid.
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <stdlib.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>
#include <inttypes.h>

/*
 * utf8fix: overwrite structurally invalid UTF-8 sequences in a file with '?'.
 *
 * Only the lead-byte / continuation-byte structure is checked. Overlong
 * encodings, surrogates and out-of-range code points are NOT detected.
 * The historical 5- and 6-byte lead forms are still accepted and counted so
 * the statistics report keeps its shape (RFC 3629 limits UTF-8 to 4 bytes).
 */

/* Longest sequence length a lead byte can announce. */
enum { UTF8_MAX_SEQ = 6 };

/*
 * Return the sequence length announced by lead byte `b` (1..6).
 * Returns 0 for bytes that cannot start a sequence: continuation bytes
 * (10xxxxxx) match none of the patterns below, and neither do 0xFE / 0xFF.
 */
static unsigned utf8_lead_length(unsigned char b)
{
    if ((b & 0x80) == 0x00)
        return 1;
    if ((b & 0xE0) == 0xC0)
        return 2;
    if ((b & 0xF0) == 0xE0)
        return 3;
    if ((b & 0xF8) == 0xF0)
        return 4;
    if ((b & 0xFC) == 0xF8)
        return 5;
    if ((b & 0xFE) == 0xFC)
        return 6;
    return 0;
}

/*
 * Account for one finished sequence of `have` bytes starting at `seq`, whose
 * lead byte announced `want` bytes. A mismatch overwrites the bytes with '?'
 * and bumps counts[0]; a match bumps counts[want]. Does nothing when no
 * sequence has been started yet (have == 0).
 */
static void utf8_close_sequence(unsigned char *seq, unsigned have,
                                unsigned want, size_t *counts)
{
    if (have == 0)
        return;

    if (have != want) {
        memset(seq, '?', have);
        ++counts[0];
    } else {
        ++counts[want];
    }
}

/*
 * Scan `len` bytes at `text`, replacing malformed sequences in place.
 * `counts` must have UTF8_MAX_SEQ + 1 elements: counts[0] receives the number
 * of invalid sequences, counts[k] the number of well-formed k-byte characters.
 */
static void utf8_scrub(unsigned char *text, size_t len, size_t *counts)
{
    unsigned have = 0; /* bytes collected for the current sequence */
    unsigned want = 0; /* bytes its lead byte announced */
    size_t i;

    for (i = 0; i < len; ++i) {
        unsigned char b = text[i];

        /* A continuation byte extends the current sequence only while that
         * sequence is still incomplete. Otherwise it is a stray byte and is
         * handled below as its own one-byte invalid sequence, so the valid
         * character in front of it is left untouched. */
        if ((b & 0xC0) == 0x80 && have < want) {
            ++have;
            continue;
        }

        utf8_close_sequence(&text[i - have], have, want, counts);
        want = utf8_lead_length(b); /* 0 for stray/impossible bytes */
        have = 1;
    }

    /* Close whatever was in progress at end of input (possibly truncated). */
    utf8_close_sequence(&text[len - have], have, want, counts);
}

/*
 * Usage: utf8fix <filename>
 *
 * Maps the file read/write, scrubs it in place and prints per-length
 * statistics to stderr. Returns 0 on success, EXIT_FAILURE if the argument is
 * missing or the file cannot be opened, stat'ed or mapped.
 */
int main(int argc, char **argv)
{
    /* One slot per sequence length plus slot 0 for invalid sequences. */
    size_t counts[UTF8_MAX_SEQ + 1] = {0};
    struct stat stats;
    int fd;

    if (argc < 2) {
        fprintf(stderr, "usage: %s <filename>\n",
                argc > 0 ? argv[0] : "utf8fix");
        return EXIT_FAILURE;
    }

    fd = open(argv[1], O_RDWR | O_NOCTTY);
    if (fd < 0) {
        perror(argv[1]);
        return EXIT_FAILURE;
    }

    if (fstat(fd, &stats)) {
        perror("fstat");
        close(fd);
        return EXIT_FAILURE;
    }

    if (stats.st_size > 0) {
        unsigned char *text;
        size_t len;

        /* Refuse files whose size cannot be expressed as a mapping length. */
        if ((uintmax_t)stats.st_size > (uintmax_t)SIZE_MAX) {
            fprintf(stderr, "%s: file too large to map\n", argv[1]);
            close(fd);
            return EXIT_FAILURE;
        }
        len = (size_t)stats.st_size;

        text = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (text == MAP_FAILED) {
            perror("mmap");
            close(fd);
            return EXIT_FAILURE;
        }
        /* The mapping stays valid after the descriptor is closed. */
        close(fd);

        utf8_scrub(text, len, counts);

        if (munmap(text, len)) {
            perror("munmap");
            return EXIT_FAILURE;
        }
    } else {
        /* mmap rejects zero-length mappings; an empty file has nothing to fix. */
        close(fd);
    }

    fprintf(stderr,
            "Statistics:\n"
            " Single-byte characters: %8zu\n"
            " Two-byte characters: %8zu\n"
            " Three-byte characters: %8zu\n"
            " Four-byte characters: %8zu\n"
            " Five-byte characters: %8zu\n"
            " Six-byte characters: %8zu\n"
            " Invalid characters: %8zu\n",
            counts[1], counts[2], counts[3], counts[4], counts[5], counts[6],
            counts[0]);

    return 0;
}