# utf8_fixups
#
# $Id: utf8_fixups,v 1.69 2010/08/01 01:51:11 knowledgejunkie Exp $
#
# This file contains fixups for mis-encoded UTF-8 characters that are 
# frequently seen in the source data from the Radio Times.
#
# When the grabber is run with --debug, a summary of listings files processed
# containing unhandled mis-encoded characters is created. In order to create a
# fixup, download the source data file from the Radio Times and examine the
# raw bytes to determine i) what the regex should search for, and ii) what the
# replacement characters should be.
#
# The file is split into three sections:
#
# 1) Mis-encoded single characters represented with [EF][BF][BD] bytes
# 2) Mis-encoded single characters represented with [C3][AF][C2][BF][C2][BD] bytes
# 3) Mis-encoded single characters in range [C2][80-9F]
#
# Each entry comprises two pipe-separated fields:
#
#  i) the hex chars of the mis-encoded character(s), and
# ii) the hex chars of the replacement character(s) to substitute
#
# A useful reference for the correct UTF-8 encodings of all Unicode characters
# is at http://www.eki.ee/letter/
#
################################################################################
#
#
# 1) Characters represented with [EF][BF][BD] bytes - these fixups require
# more context bytes to be given in order to know the intended replacement
# character as the [EF][BF][BD] bytes can represent *any* character.
#
# These fixups are mostly required for "The Community Channel" listings which
# have mis-encoded apostrophe/quotation characters.
#
# "1970's"
\x31\x39\x37\x30\xEF\xBF\xBD\x73|\x31\x39\x37\x30\x27\x73
# "4'11"
\x34\xEF\xBF\xBD\x31\x31|\x34\x27\x31\x31
# "Alzheimer's"
\x41\x6C\x7A\x68\x65\x69\x6D\x65\x72\xEF\xBF\xBD\x73|\x41\x6C\x7A\x68\x65\x69\x6D\x65\x72\x27\x73
# "Bolivia's"
\x42\x6F\x6C\x69\x76\x69\x61\xEF\xBF\xBD\x73|\x42\x6F\x6C\x69\x76\x69\x61\x27\x73
# "Brazil's"
\x42\x72\x61\x7A\x69\x6C\xEF\xBF\xBD\x73|\x42\x72\x61\x7A\x69\x6C\x27\x73
# "California's"
\x43\x61\x6C\x69\x66\x6F\x72\x6E\x69\x61\xEF\xBF\xBD\x73|\x43\x61\x6C\x69\x66\x6F\x72\x6E\x69\x61\x27\x73
# "Calloo's"
\x43\x61\x6C\x6C\x6F\x6F\xEF\xBF\xBD\x73|\x43\x61\x6C\x6C\x6F\x6F\x27\x73
# "Europe's"
\x45\x75\x72\x6F\x70\x65\xEF\xBF\xBD\x73|\x45\x75\x72\x6F\x70\x65\x27\x73
# "Let's"
\x4C\x65\x74\xEF\xBF\xBD\x73|\x4C\x65\x74\x27\x73
# "Luke's"
\x4C\x75\x6B\x65\xEF\xBF\xBD\x73|\x4C\x75\x6B\x65\x27\x73
# "Naomi's"
\x4E\x61\x6F\x6D\x69\xEF\xBF\xBD\x73|\x4E\x61\x6F\x6D\x69\x27\x73
# "Paris'"
\x50\x61\x72\x69\x73\xEF\xBF\xBD|\x50\x61\x72\x69\x73\x27
# "Parkinson's"
\x50\x61\x72\x6B\x69\x6E\x73\x6F\x6E\xEF\xBF\xBD\x73|\x50\x61\x72\x6B\x69\x6E\x73\x6F\x6E\x27\x73
# "Regent's"
\x52\x65\x67\x65\x6E\x74\xEF\xBF\xBD\x73|\x52\x65\x67\x65\x6E\x74\x27\x73
# "Sam's"
\x53\x61\x6D\xEF\xBF\xBD\x73|\x53\x61\x6D\x27\x73
# "Schwarzenegger's"
\x53\x63\x68\x77\x61\x72\x7A\x65\x6E\x65\x67\x67\x65\x72\xEF\xBF\xBD\x73|\x53\x63\x68\x77\x61\x72\x7A\x65\x6E\x65\x67\x67\x65\x72\x27\x73
# "UK's"
\x55\x4B\xEF\xBF\xBD\x73|\x55\x4B\x27\x73
# "Women's"
\x57\x6F\x6D\x65\x6E\xEF\xBF\xBD\x73|\x57\x6F\x6D\x65\x6E\x27\x73
# "can't"
\x63\x61\x6E\xEF\xBF\xBD\x74|\x63\x61\x6E\x27\x74
# "community's"
\x63\x6F\x6D\x6D\x75\x6E\x69\x74\x79\xEF\xBF\xBD\x73|\x63\x6F\x6D\x6D\x75\x6E\x69\x74\x79\x27\x73
# "countryside's"
\x63\x6F\x75\x6E\x74\x72\x79\x73\x69\x64\x65\xEF\xBF\xBD\x73|\x63\x6F\x75\x6E\x74\x72\x79\x73\x69\x64\x65\x27\x73
# "country's"
\x63\x6F\x75\x6E\x74\x72\x79\xEF\xBF\xBD\x73|\x63\x6F\x75\x6E\x74\x72\x79\x27\x73
# "doesn't"
\x64\x6F\x65\x73\x6E\xEF\xBF\xBD\x74|\x64\x6F\x65\x73\x6E\x27\x74
# "he's"
\x68\x65\xEF\xBF\xBD\x73|\x68\x65\x27\x73
# "industry's"
\x69\x6E\x64\x75\x73\x74\x72\x79\xEF\xBF\xBD\x73|\x69\x6E\x64\x75\x73\x74\x72\x79\x27\x73
# "isn't"
\x69\x73\x6E\xEF\xBF\xBD\x74|\x69\x73\x6E\x27\x74
# "surfer's"
\x73\x75\x72\x66\x65\x72\xEF\xBF\xBD\x73|\x73\x75\x72\x66\x65\x72\x27\x73
# "they're"
\x74\x68\x65\x79\xEF\xBF\xBD\x72\x65|\x74\x68\x65\x79\x27\x72\x65
# "week's"
\x77\x65\x65\x6B\xEF\xBF\xBD\x73|\x77\x65\x65\x6B\x27\x73
# "woman's"
\x77\x6F\x6D\x61\x6E\xEF\xBF\xBD\x73|\x77\x6F\x6D\x61\x6E\x27\x73
# "women's"
\x77\x6F\x6D\x65\x6E\xEF\xBF\xBD\x73|\x77\x6F\x6D\x65\x6E\x27\x73
# "world's"
\x77\x6F\x72\x6C\x64\xEF\xBF\xBD\x73|\x77\x6F\x72\x6C\x64\x27\x73
# " "All Day" "
\xEF\xBF\xBD\x41\x6C\x6C\x20\x44\x61\x79\xEF\xBF\xBD|\x22\x41\x6C\x6C\x20\x44\x61\x79\x22
# " "Arc of Fire" "
\xEF\xBF\xBD\x41\x72\x63\x20\x6F\x66\x20\x46\x69\x72\x65\xEF\xBF\xBD|\x22\x41\x72\x63\x20\x6F\x66\x20\x46\x69\x72\x65\x22
# " "Deafhood" "
\xEF\xBF\xBD\x44\x65\x61\x66\x68\x6F\x6F\x64\xEF\xBF\xBD|\x22\x44\x65\x61\x66\x68\x6F\x6F\x64\x22
# " "Game of Life" "
\xEF\xBF\xBD\x47\x61\x6D\x65\x20\x6F\x66\x20\x4C\x69\x66\x65\xEF\xBF\xBD|\x22\x47\x61\x6D\x65\x20\x6F\x66\x20\x4C\x69\x66\x65\x22
# " "Han: The Price of Freedom" "
\xEF\xBF\xBD\x48\x61\x6E\x3A\x20\x54\x68\x65\x20\x50\x72\x69\x63\x65\x20\x6F\x66\x20\x46\x72\x65\x65\x64\x6F\x6D\xEF\xBF\xBD|\x22\x48\x61\x6E\x3A\x20\x54\x68\x65\x20\x50\x72\x69\x63\x65\x20\x6F\x66\x20\x46\x72\x65\x65\x64\x6F\x6D\x22
# " "Save the Forests of Jharkhand" "
\xEF\xBF\xBD\x53\x61\x76\x65\x20\x74\x68\x65\x20\x46\x6F\x72\x65\x73\x74\x73\x20\x6F\x66\x20\x4A\x68\x61\x72\x6B\x68\x61\x6E\x64\xEF\xBF\xBD|\x22\x53\x61\x76\x65\x20\x74\x68\x65\x20\x46\x6F\x72\x65\x73\x74\x73\x20\x6F\x66\x20\x4A\x68\x61\x72\x6B\x68\x61\x6E\x64\x22
# " "Snapshot" "
\xEF\xBF\xBD\x53\x6E\x61\x70\x73\x68\x6F\x74\xEF\xBF\xBD|\x22\x53\x6E\x61\x70\x73\x68\x6F\x74\x22
# " "Torang" "
\xEF\xBF\xBD\x54\x6F\x72\x61\x6E\x67\xEF\xBF\xBD|\x22\x54\x6F\x72\x61\x6E\x67\x22
# " "Your Game" "
\xEF\xBF\xBD\x59\x6F\x75\x72\x20\x47\x61\x6D\x65\xEF\xBF\xBD|\x22\x59\x6F\x75\x72\x20\x47\x61\x6D\x65\x22
# " "sustainable development" "
\xEF\xBF\xBD\x73\x75\x73\x74\x61\x69\x6E\x61\x62\x6C\x65\x20\x64\x65\x76\x65\x6C\x6F\x70\x6D\x65\x6E\x74\xEF\xBF\xBD|\x22\x73\x75\x73\x74\x61\x69\x6E\x61\x62\x6C\x65\x20\x64\x65\x76\x65\x6C\x6F\x70\x6D\x65\x6E\x74\x22
# " "unwritten" "
\xEF\xBF\xBD\x75\x6E\x77\x72\x69\x74\x74\x65\x6E\xEF\xBF\xBD|\x22\x75\x6E\x77\x72\x69\x74\x74\x65\x6E\x22
#
#
# 2) Characters represented with doubly-encoded [EF][BF][BD] bytes which are
# seen in the raw data as bytes [C3][AF][C2][BF][C2][BD] - these fixups require
# more context bytes to be given in order to know the intended replacement
# character as the [C3][AF][C2][BF][C2][BD] bytes can represent *any* character
#
#
# "Alien^3"
\x41\x6C\x69\x65\x6E\xC3\xAF\xC2\xBF\xC2\xBD|\x41\x6C\x69\x65\x6E\xC2\xB3
# "Armendáriz"
\x41\x72\x6D\x65\x6E\x64\xC3\xAF\xC2\xBF\xC2\xBD\x72\x69\x7A|\x41\x72\x6D\x65\x6E\x64\xC3\xA1\x72\x69\x7A
# "André"
\x41\x6E\x64\x72\xC3\xAF\xC2\xBF\xC2\xBD|\x41\x6E\x64\x72\xC3\xA9
# "Bankolé"
\x42\x61\x6E\x6B\x6F\x6C\xC3\xAF\xC2\xBF\xC2\xBD|\x42\x61\x6E\x6B\x6F\x6C\xC3\xA9
# "Berléand"
\x42\x65\x72\x6C\xC3\xAF\xC2\xBF\xC2\xBD\x61\x6E\x64|\x42\x65\x72\x6C\xC3\xA9\x61\x6E\x64
# "Bonnaffé"
\x42\x6F\x6E\x6E\x61\x66\x66\xC3\xAF\xC2\xBF\xC2\xBD|\x42\x6F\x6E\x6E\x61\x66\x66\xC3\xA9
# "Bublé"
\x42\x75\x62\x6C\xC3\xAF\xC2\xBF\xC2\xBD|\x42\x75\x62\x6C\xC3\xA9
# "Carré"
\x43\x61\x72\x72\xC3\xAF\xC2\xBF\xC2\xBD|\x43\x61\x72\x72\xC3\xA9
# "César"
\x43\xC3\xAF\xC2\xBF\xC2\xBD\x73\x61\x72|\x43\xC3\xA9\x73\x61\x72
# "Chéreau"
\x43\x68\xC3\xAF\xC2\xBF\xC2\xBD\x72\x65\x61\x75|\x43\x68\xC3\xA9\x72\x65\x61\x75
# "Cuarón"
\x43\x75\x61\x72\xC3\xAF\xC2\xBF\xC2\xBD\x6E|\x43\x75\x61\x72\xC3\xB3\x6E
# "Déborah"
\x44\xC3\xAF\xC2\xBF\xC2\xBD\x62\x6F\x72\x61\x68|\x44\xC3\xA9\x62\x6F\x72\x61\x68
# "Dédé"
\x44\xC3\xAF\xC2\xBF\xC2\xBD\x64\xC3\xAF\xC2\xBF\xC2\xBD|\x44\xC3\xA9\x64\xC3\xA9
# "Déjà"
\x44\xC3\xAF\xC2\xBF\xC2\xBD\x6A\xC3\xAF\xC2\xBF\xC2\xBD|\x44\xC3\xA9\x6A\xC3\xA0
# "Díaz"
\x44\xC3\xAF\xC2\xBF\xC2\xBD\x61\x7A|\x44\xC3\xAD\x61\x7A
# "Echevarría"
\x45\x63\x68\x65\x76\x61\x72\x72\xC3\xAF\xC2\xBF\xC2\xBD\x61|\x45\x63\x68\x65\x76\x61\x72\x72\xC3\xAD\x61
# "Fouchécourt"
\x46\x6F\x75\x63\x68\xC3\xAF\xC2\xBF\xC2\xBD\x63\x6F\x75\x72\x74|\x46\x6F\x75\x63\x68\xC3\xA9\x63\x6F\x75\x72\x74
# "François"
\x46\x72\x61\x6E\xC3\xAF\xC2\xBF\xC2\xBD\x6F\x69\x73|\x46\x72\x61\x6E\xC3\xA7\x6F\x69\x73
# "Gérard"
\x47\xC3\xAF\xC2\xBF\xC2\xBD\x72\x61\x72\x64|\x47\xC3\xA9\x72\x61\x72\x64
# "Grégoire"
\x47\x72\xC3\xAF\xC2\xBF\xC2\xBD\x67\x6F\x69\x72\x65|\x47\x72\xC3\xA9\x67\x6F\x69\x72\x65
# "Hallström"
\x48\x61\x6C\x6C\x73\x74\x72\xC3\xAF\xC2\xBF\xC2\xBD\x6D|\x48\x61\x6C\x6C\x73\x74\x72\xC3\xB6\x6D
# "Hélène"
\x48\xC3\xAF\xC2\xBF\xC2\xBD\x6C\xC3\xAF\xC2\xBF\xC2\xBD\x6E\x65|\x48\xC3\xA9\x6C\xC3\xA8\x6E\x65
# "Hofstätter"
\x48\x6F\x66\x73\x74\xC3\xAF\xC2\xBF\xC2\xBD\x74\x74\x65\x72|\x48\x6F\x66\x73\x74\xC3\xA4\x74\x74\x65\x72
# "Hübner"
\x48\xC3\xAF\xC2\xBF\xC2\xBD\x62\x6E\x65\x72|\x48\xC3\xBC\x62\x6E\x65\x72
# "Joffé"
\x4A\x6F\x66\x66\xC3\xAF\xC2\xBF\xC2\xBD|\x4A\x6F\x66\x66\xC3\xA9
# "José"
\x4A\x6F\x73\xC3\xAF\xC2\xBF\xC2\xBD|\x4A\x6F\x73\xC3\xA9
# "Jürgen"
\x4A\xC3\xAF\xC2\xBF\xC2\xBD\x72\x67\x65\x6E|\x4A\xC3\xBC\x72\x67\x65\x6E
# "Koundé"
\x4B\x6F\x75\x6E\x64\xC3\xAF\xC2\xBF\xC2\xBD|\x4B\x6F\x75\x6E\x64\xC3\xA9
# "Krabbé"
\x4B\x72\x61\x62\x62\xC3\xAF\xC2\xBF\xC2\xBD|\x4B\x72\x61\x62\x62\xC3\xA9
# "Lázaro"
\x4C\xC3\xAF\xC2\xBF\xC2\xBD\x7A\x61\x72\x6F|\x4C\xC3\xA1\x7A\x61\x72\x6F
# "Léonide"
\x4C\xC3\xAF\xC2\xBF\xC2\xBD\x6F\x6E\x69\x64\x65|\x4C\xC3\xA9\x6F\x6E\x69\x64\x65
# "López"
\x4C\xC3\xAF\xC2\xBF\xC2\xBD\x70\x65\x7A|\x4C\xC3\xB3\x70\x65\x7A
# "Mamá También"
\x4D\x61\x6D\xC3\xAF\xC2\xBF\xC2\xBD\x20\x54\x61\x6D\x62\x69\xC3\xAF\xC2\xBF\xC2\xBD\x6E|\x4D\x61\x6D\xC3\xA1\x20\x54\x61\x6D\x62\x69\xC3\xA9\x6E
# "Médici"
\x4D\xC3\xAF\xC2\xBF\xC2\xBD\x64\x69\x63\x69|\x4D\xC3\xA9\x64\x69\x63\x69
# "Mélanie"
\x4D\xC3\xAF\xC2\xBF\xC2\xBD\x6C\x61\x6E\x69\x65|\x4D\xC3\xA9\x6C\x61\x6E\x69\x65
# "Môle"
\x4D\xC3\xAF\xC2\xBF\xC2\xBD\x6C\x65|\x4D\xC3\xB4\x6C\x65
# "Mühe"
\x4D\xC3\xAF\xC2\xBF\xC2\xBD\x68\x65|\x4D\xC3\xBC\x68\x65
# "Paré"
\x50\x61\x72\xC3\xAF\xC2\xBF\xC2\xBD|\x50\x61\x72\xC3\xA9
# "Pelé"
\x50\x65\x6C\xC3\xAF\xC2\xBF\xC2\xBD|\x50\x65\x6C\xC3\xA9
# "Peña"
\x50\x65\xC3\xAF\xC2\xBF\xC2\xBD\x61|\x50\x65\xC3\xB1\x61
# "Penélope"
\x50\x65\x6E\xC3\xAF\xC2\xBF\xC2\xBD\x6C\x6F\x70\x65|\x50\x65\x6E\xC3\xA9\x6C\x6F\x70\x65
# "Rarámuri"
\x52\x61\x72\xC3\xAF\xC2\xBF\xC2\xBD\x6D\x75\x72\x69|\x52\x61\x72\xC3\xA1\x6D\x75\x72\x69
# "Roëves"
\x52\x6F\xC3\xAF\xC2\xBF\xC2\xBD\x76\x65\x73|\x52\x6F\xC3\xAB\x76\x65\x73
# "Rubén"
\x52\x75\x62\xC3\xAF\xC2\xBF\xC2\xBD\x6E|\x52\x75\x62\xC3\xA9\x6E
# "Skarsgård"
\x53\x6B\x61\x72\x73\x67\xC3\xAF\xC2\xBF\xC2\xBD\x72\x64|\x53\x6B\x61\x72\x73\x67\xC3\xA5\x72\x64
# "Sméagol"
\x53\x6D\xC3\xAF\xC2\xBF\xC2\xBD\x61\x67\x6F\x6C|\x53\x6D\xC3\xA9\x61\x67\x6F\x6C
# "Stéphane"
\x53\x74\xC3\xAF\xC2\xBF\xC2\xBD\x70\x68\x61\x6E\x65|\x53\x74\xC3\xA9\x70\x68\x61\x6E\x65
# "Süskind"
\x53\xC3\xAF\xC2\xBF\xC2\xBD\x73\x6B\x69\x6E\x64|\x53\xC3\xBC\x73\x6B\x69\x6E\x64
# "Théoden"
\x54\x68\xC3\xAF\xC2\xBF\xC2\xBD\x6F\x64\x65\x6E|\x54\x68\xC3\xA9\x6F\x64\x65\x6E
# "Undómiel"
\x55\x6E\x64\xC3\xAF\xC2\xBF\xC2\xBD\x6D\x69\x65\x6C|\x55\x6E\x64\xC3\xB3\x6D\x69\x65\x6C
# "Verdú"
\x56\x65\x72\x64\xC3\xAF\xC2\xBF\xC2\xBD|\x56\x65\x72\x64\xC3\xBA
# "à la "
\xC3\xAF\xC2\xBF\xC2\xBD\x20\x6C\x61\x20|\xC3\xA0\x20\x6C\x61\x20
# "cliché"
\x63\x6C\x69\x63\x68\xC3\xAF\xC2\xBF\xC2\xBD|\x63\x6C\x69\x63\x68\xC3\xA9
# "exposé"
\x65\x78\x70\x6F\x73\xC3\xAF\xC2\xBF\xC2\xBD|\x65\x78\x70\x6F\x73\xC3\xA9
# "façade"
\x66\x61\xC3\xAF\xC2\xBF\xC2\xBD\x61\x64\x65|\x66\x61\xC3\xA7\x61\x64\x65
# "fiancé"
\x66\x69\x61\x6E\x63\xC3\xAF\xC2\xBF\xC2\xBD|\x66\x69\x61\x6E\x63\xC3\xA9
# "mêlée"
\x6D\xC3\xAF\xC2\xBF\xC2\xBD\x6C\xC3\xAF\xC2\xBF\xC2\xBD\x65|\x6D\xC3\xAA\x6C\xC3\xA9\x65
# "née"
\x6E\xC3\xAF\xC2\xBF\xC2\xBD\x65|\x6E\xC3\xA9\x65
# "tête-à-tête"
\x74\xC3\xAF\xC2\xBF\xC2\xBD\x74\x65\x2D\xC3\xAF\xC2\xBF\xC2\xBD\x2D\x74\xC3\xAF\xC2\xBF\xC2\xBD\x74\x65|\x74\xC3\xAA\x74\x65\x2D\xC3\xA0\x2D\x74\xC3\xAA\x74\x65
# "verité" (first "e" is plain ASCII in the source, only the final "é" is mis-encoded) -> "vérité"
\x76\x65\x72\x69\x74\xC3\xAF\xC2\xBF\xC2\xBD|\x76\xC3\xA9\x72\x69\x74\xC3\xA9
# "vérité"
\x76\xC3\xAF\xC2\xBF\xC2\xBD\x72\x69\x74\xC3\xAF\xC2\xBF\xC2\xBD|\x76\xC3\xA9\x72\x69\x74\xC3\xA9
#
#
# 3) UTF-8 characters seen in the raw data as bytes in the range [C2][80-9F]
# which should represent a single printing character but UTF-8 decode as 
# non-printing control characters.
#
#
# left single quotation mark (Windows-1252 byte 0x91) -> "'"
\xC2\x91|\x27
# right single quotation mark (Windows-1252 byte 0x92) -> "'"
\xC2\x92|\x27
