diff --git a/whipper/image/toc.py b/whipper/image/toc.py index b4c7656..d4efb96 100644 --- a/whipper/image/toc.py +++ b/whipper/image/toc.py @@ -33,7 +33,61 @@ import logging logger = logging.getLogger(__name__) # shared -_CDTEXT_CANDIDATE_RE = re.compile(r'(?P\w+) "(?P.+)"') +_CDTEXT_CANDIDATE_RE = re.compile(r''' + (?P\w+) # CD-TEXT key. + \s+ + "(?P # CD-TEXT value. + (?: + \\\\ # escaped backslash. + | \\" # escaped double-quote. + | [^"] # not a double-quote. + )+? # the value must not be empty. + )" +''', flags=re.VERBOSE) + +_STRING_SUBSTITUTIONS_RE = re.compile(r''' + \\(?P[0-8][0-8][0-8]) + | \\" + | \\\\ +''', flags=re.VERBOSE) + + +def _string_contents_repl(match: 're.Match[str]') -> str: + group_octal = match.group('octal') + if group_octal is not None: + code_point = int(group_octal, base=8) + return chr(code_point) + + entire_match = match.group(0) + if entire_match == '\\"': + return '"' + elif entire_match == '\\\\': + return '\\' + else: + raise RuntimeError("unexpected match: ", entire_match) + + +def parse_toc_string(str_within_quotes: str) -> str: + """ + Given the a quoted string obtained from a TOC file using + _CDTEXT_CANDIDATE_RE, compute the unescaped string contained inside. + + Backslash substitutions fail gracefully, which is important since cdrdao + string encoding has been found to be flawed as recently as cdrdao 1.2.5 + (2023): + https://github.com/cdrdao/cdrdao/issues/32 + https://github.com/whipper-team/whipper/issues/169 + + This function assumes cdrdao 1.2.5+ (2023) was used, which unless --no-utf8 + is passed, provides UTF-8 strings. It also works with older versions as long + as the encoding was ASCII or Latin-1. + + Note: CD-Text in MS-JIS produced by cdrdao <1.2.5 will produce mojibake + (garbled characters), just like the older code this function replaced. + """ + return _STRING_SUBSTITUTIONS_RE.sub(_string_contents_repl, + str_within_quotes) + # header _CATALOG_RE = re.compile(r'^CATALOG "(?P\d+)"$') @@ -208,11 +262,7 @@ class TocFile: m = _CDTEXT_CANDIDATE_RE.search(line) if m: key = m.group('key') - value = m.group('value') - # usually, value is encoded with octal escapes and in latin-1 - # FIXME: other encodings are possible, does cdrdao handle - # them ? - value = value.encode().decode('unicode_escape') + value = parse_toc_string(m.group('value')) if key in table.CDTEXT_FIELDS: # FIXME: consider ISRC separate for now, but this # is a limitation of our parser approach diff --git a/whipper/test/diorama.cue b/whipper/test/diorama.cue new file mode 100644 index 0000000..6d6a9c2 --- /dev/null +++ b/whipper/test/diorama.cue @@ -0,0 +1,45 @@ +REM DISCID 700AC908 +REM COMMENT "whipper 0.10.1.dev27+ga4b9742.d20240827" +PERFORMER "MØL" +TITLE "Diorama" +FILE "data.wav" WAVE + TRACK 01 AUDIO + PERFORMER "MØL" + TITLE "Fraktur" + ISRC DED832100085 + INDEX 01 00:00:00 + TRACK 02 AUDIO + PERFORMER "MØL" + TITLE "Photophobic" + ISRC DED832100086 + INDEX 01 04:19:00 + TRACK 03 AUDIO + PERFORMER "MØL" + TITLE "Serf" + ISRC DED832100087 + INDEX 01 09:37:00 + TRACK 04 AUDIO + PERFORMER "MØL" + TITLE "Vestige" + ISRC DED832100088 + INDEX 01 14:59:12 + TRACK 05 AUDIO + PERFORMER "MØL" + TITLE "Redacted" + ISRC DED832100089 + INDEX 01 20:37:68 + TRACK 06 AUDIO + PERFORMER "MØL" + TITLE "Itinerari" + ISRC DED832100090 + INDEX 01 25:54:18 + TRACK 07 AUDIO + PERFORMER "MØL" + TITLE "Tvesind" + ISRC DED832100091 + INDEX 01 30:57:58 + TRACK 08 AUDIO + PERFORMER "MØL" + TITLE "Diorama" + ISRC DED832100092 + INDEX 01 38:46:57 diff --git a/whipper/test/diorama_noutf8.toc b/whipper/test/diorama_noutf8.toc new file mode 100644 index 0000000..9b03884 --- /dev/null +++ b/whipper/test/diorama_noutf8.toc @@ -0,0 +1,134 @@ +CD_DA + +CD_TEXT { + LANGUAGE_MAP { + 0: 9 + } + LANGUAGE 0 { + TITLE "Diorama" + PERFORMER "M\330L" + SIZE_INFO { 0, 1, 8, 0, 7, 3, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 3, 12, 0, 0, 0, + 0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0} + } +} + +// Track 1 +TRACK AUDIO +NO COPY +NO PRE_EMPHASIS +TWO_CHANNEL_AUDIO +ISRC "DED832100085" +CD_TEXT { + LANGUAGE 0 { + TITLE "Fraktur" + PERFORMER "M\330L" + } +} +FILE "data.wav" 0 04:19:00 + + +// Track 2 +TRACK AUDIO +NO COPY +NO PRE_EMPHASIS +TWO_CHANNEL_AUDIO +ISRC "DED832100086" +CD_TEXT { + LANGUAGE 0 { + TITLE "Photophobic" + PERFORMER "M\330L" + } +} +FILE "data.wav" 04:19:00 05:18:00 + + +// Track 3 +TRACK AUDIO +NO COPY +NO PRE_EMPHASIS +TWO_CHANNEL_AUDIO +ISRC "DED832100087" +CD_TEXT { + LANGUAGE 0 { + TITLE "Serf" + PERFORMER "M\330L" + } +} +FILE "data.wav" 09:37:00 05:22:12 + + +// Track 4 +TRACK AUDIO +NO COPY +NO PRE_EMPHASIS +TWO_CHANNEL_AUDIO +ISRC "DED832100088" +CD_TEXT { + LANGUAGE 0 { + TITLE "Vestige" + PERFORMER "M\330L" + } +} +FILE "data.wav" 14:59:12 05:38:56 + + +// Track 5 +TRACK AUDIO +NO COPY +NO PRE_EMPHASIS +TWO_CHANNEL_AUDIO +ISRC "DED832100089" +CD_TEXT { + LANGUAGE 0 { + TITLE "Redacted" + PERFORMER "M\330L" + } +} +FILE "data.wav" 20:37:68 05:16:25 + + +// Track 6 +TRACK AUDIO +NO COPY +NO PRE_EMPHASIS +TWO_CHANNEL_AUDIO +ISRC "DED832100090" +CD_TEXT { + LANGUAGE 0 { + TITLE "Itinerari" + PERFORMER "M\330L" + } +} +FILE "data.wav" 25:54:18 05:03:40 + + +// Track 7 +TRACK AUDIO +NO COPY +NO PRE_EMPHASIS +TWO_CHANNEL_AUDIO +ISRC "DED832100091" +CD_TEXT { + LANGUAGE 0 { + TITLE "Tvesind" + PERFORMER "M\330L" + } +} +FILE "data.wav" 30:57:58 07:48:74 + + +// Track 10 +TRACK AUDIO +NO COPY +NO PRE_EMPHASIS +TWO_CHANNEL_AUDIO +ISRC "DED832100092" +CD_TEXT { + LANGUAGE 0 { + TITLE "Diorama" + PERFORMER "M\330L" + } +} +FILE "data.wav" 38:46:57 07:14:36 + diff --git a/whipper/test/diorama_utf8.toc b/whipper/test/diorama_utf8.toc new file mode 100644 index 0000000..4512595 --- /dev/null +++ b/whipper/test/diorama_utf8.toc @@ -0,0 +1,134 @@ +CD_DA + +CD_TEXT { + LANGUAGE_MAP { + 0: 9 + } + LANGUAGE 0 { + TITLE "Diorama" + PERFORMER "MØL" + SIZE_INFO { 0, 1, 8, 0, 7, 3, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 3, 12, 0, 0, 0, + 0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0} + } +} + +// Track 1 +TRACK AUDIO +NO COPY +NO PRE_EMPHASIS +TWO_CHANNEL_AUDIO +ISRC "DED832100085" +CD_TEXT { + LANGUAGE 0 { + TITLE "Fraktur" + PERFORMER "MØL" + } +} +FILE "data.wav" 0 04:19:00 + + +// Track 2 +TRACK AUDIO +NO COPY +NO PRE_EMPHASIS +TWO_CHANNEL_AUDIO +ISRC "DED832100086" +CD_TEXT { + LANGUAGE 0 { + TITLE "Photophobic" + PERFORMER "MØL" + } +} +FILE "data.wav" 04:19:00 05:18:00 + + +// Track 3 +TRACK AUDIO +NO COPY +NO PRE_EMPHASIS +TWO_CHANNEL_AUDIO +ISRC "DED832100087" +CD_TEXT { + LANGUAGE 0 { + TITLE "Serf" + PERFORMER "MØL" + } +} +FILE "data.wav" 09:37:00 05:22:12 + + +// Track 4 +TRACK AUDIO +NO COPY +NO PRE_EMPHASIS +TWO_CHANNEL_AUDIO +ISRC "DED832100088" +CD_TEXT { + LANGUAGE 0 { + TITLE "Vestige" + PERFORMER "MØL" + } +} +FILE "data.wav" 14:59:12 05:38:56 + + +// Track 5 +TRACK AUDIO +NO COPY +NO PRE_EMPHASIS +TWO_CHANNEL_AUDIO +ISRC "DED832100089" +CD_TEXT { + LANGUAGE 0 { + TITLE "Redacted" + PERFORMER "MØL" + } +} +FILE "data.wav" 20:37:68 05:16:25 + + +// Track 6 +TRACK AUDIO +NO COPY +NO PRE_EMPHASIS +TWO_CHANNEL_AUDIO +ISRC "DED832100090" +CD_TEXT { + LANGUAGE 0 { + TITLE "Itinerari" + PERFORMER "MØL" + } +} +FILE "data.wav" 25:54:18 05:03:40 + + +// Track 7 +TRACK AUDIO +NO COPY +NO PRE_EMPHASIS +TWO_CHANNEL_AUDIO +ISRC "DED832100091" +CD_TEXT { + LANGUAGE 0 { + TITLE "Tvesind" + PERFORMER "MØL" + } +} +FILE "data.wav" 30:57:58 07:48:74 + + +// Track 8 +TRACK AUDIO +NO COPY +NO PRE_EMPHASIS +TWO_CHANNEL_AUDIO +ISRC "DED832100092" +CD_TEXT { + LANGUAGE 0 { + TITLE "Diorama" + PERFORMER "MØL" + } +} +FILE "data.wav" 38:46:57 07:14:36 + diff --git a/whipper/test/test_image_toc.py b/whipper/test/test_image_toc.py index 177622b..76cb82f 100644 --- a/whipper/test/test_image_toc.py +++ b/whipper/test/test_image_toc.py @@ -5,6 +5,7 @@ import os import copy import shutil import tempfile +from abc import abstractmethod from whipper.image import toc @@ -194,6 +195,72 @@ class BreedersTestCase(common.TestCase): ref = self.readCue('breeders.cue') self.assertEqual(cue, ref) + +class DioramaTOCMixin: + # TODO figure out how to make this class abstract + """ + MØL - Diorama contains CD-Text. + + Two .toc files are provided: + - diorama_utf8.toc (UTF-8 mode, cdrdao 1.2.5) + - diorama_noutf8.toc (--no-utf8 mode, cdrdao 1.2.5, but representative of + any older version of cdrdao) + + Regardless of the version chosen for the toc file, all the same tests should + pass, including generating the same .cue file as output. + """ + + @property + @abstractmethod + def tocFileName(self) -> str: + raise NotImplementedError + + def setUp(self): + self.path = os.path.join(os.path.dirname(__file__), self.tocFileName) + self.toc = toc.TocFile(self.path) + self.toc.parse() + self.assertEqual(len(self.toc.table.tracks), 8) + + def testCDText(self): + cdt = self.toc.table.cdtext + self.assertEqual(cdt['PERFORMER'], 'MØL') + self.assertEqual(cdt['TITLE'], 'Diorama') + + t = self.toc.table.tracks[0] + cdt = t.cdtext + self.assertEqual(cdt['PERFORMER'], 'MØL') + self.assertEqual(cdt['TITLE'], 'Fraktur') + + def testConvertCue(self): + self.assertTrue(self.toc.table.hasTOC()) + cue = self.toc.table.cue() + with open("/tmp/miau.txt", "w") as f: + f.write(cue) + ref = self.readCue('diorama.cue') + self.maxDiff = None + self.assertEqual(cue, ref) + + +class CDTextLatin1TOCTestCase(common.TestCase, common.UnicodeTestMixin, + DioramaTOCMixin): + @property + def tocFileName(self) -> str: + return 'diorama_noutf8.toc' + + def setUp(self): + DioramaTOCMixin.setUp(self) + + +class CDTextUTF8TOCTestCase(common.TestCase, common.UnicodeTestMixin, + DioramaTOCMixin): + @property + def tocFileName(self) -> str: + return 'diorama_utf8.toc' + + def setUp(self): + DioramaTOCMixin.setUp(self) + + # Ladyhawke has a data track diff --git a/whipper/test/toc_cdtext_string_parsing.py b/whipper/test/toc_cdtext_string_parsing.py new file mode 100644 index 0000000..ccfc4a1 --- /dev/null +++ b/whipper/test/toc_cdtext_string_parsing.py @@ -0,0 +1,36 @@ +from whipper.image.toc import _CDTEXT_CANDIDATE_RE, parse_toc_string + +from whipper.test import common + +class TestTOCStringParsing(common.TestCase): + def check_string(self, str_with_quotes: str, str_parsed_expected: str): + text = f"PERFORMER {str_with_quotes}" + match = _CDTEXT_CANDIDATE_RE.match(text) + if not match: + self.fail(f"String wasn't matched: {text}") + self.assertEquals(match.start(), 0) + self.assertEquals(match.end(), len(text)) + + str_parsed_actual = parse_toc_string(match.group("value")) + self.assertEquals(str_parsed_actual, str_parsed_expected) + + def test_simple(self): + self.check_string('"foo bar"', 'foo bar') + + def test_escaped_quotes(self): + self.check_string(r'"the \"foos\""', r'the "foos"') + + def test_escaped_backslash(self): + self.check_string(r'"foo\\bar"', r'foo\bar') + + def test_escaped_latin1(self): + self.check_string(r'"M\330L"', r'MØL') + + def test_incomplete_escape(self): + self.check_string(r'"M\33a"', r'M\33a') + + def test_trailing_backslash(self): + self.check_string(r'"foo\\"', 'foo\\') + + def test_unicode(self): + self.check_string(r'"MØL"', 'MØL') \ No newline at end of file