Merge pull request #633 from ntrrgc/toc-string-parsing

New TOC CD-TEXT string decoding
This commit is contained in:
Merlijn Wajer
2024-11-25 16:51:36 +01:00
committed by GitHub
6 changed files with 472 additions and 6 deletions

View File

@@ -33,7 +33,61 @@ import logging
logger = logging.getLogger(__name__)
# shared
# Earlier, simpler pattern: a key, one space, and a greedy quoted value.
# NOTE(review): the assignment right below rebinds the same name, so this
# definition is superseded; it looks like the pre-change line of the diff.
_CDTEXT_CANDIDATE_RE = re.compile(r'(?P<key>\w+) "(?P<value>.+)"')
# Matches a CD-TEXT `KEY "value"` pair. The `value` group captures the text
# between the double quotes (quotes excluded) and still contains the raw
# backslash escapes. Inside the value, an escaped backslash or an escaped
# double-quote is consumed as a unit, so an escaped quote does not end the
# string. The quantifier is lazy and requires at least one character.
_CDTEXT_CANDIDATE_RE = re.compile(r'''
(?P<key>\w+) # CD-TEXT key.
\s+
"(?P<value> # CD-TEXT value.
(?:
\\\\ # escaped backslash.
| \\" # escaped double-quote.
| [^"] # not a double-quote.
)+? # the value must not be empty.
)"
''', flags=re.VERBOSE)
# Escape sequences that parse_toc_string() knows how to decode. Anything else
# (including a backslash followed by non-octal digits) is left untouched.
_STRING_SUBSTITUTIONS_RE = re.compile(r'''
    \\(?P<octal>[0-7][0-7][0-7])  # backslash + exactly three octal digits.
  | \\"                           # escaped double-quote.
  | \\\\                          # escaped backslash.
''', flags=re.VERBOSE)


def _string_contents_repl(match: 're.Match[str]') -> str:
    """
    Return the decoded text for one _STRING_SUBSTITUTIONS_RE match.

    :param match: a match produced by _STRING_SUBSTITUTIONS_RE.
    :returns: the character the escape sequence stands for.
    :raises RuntimeError: if the match is none of the known escapes, which
                          would mean the pattern and this function are out
                          of sync.
    """
    octal_digits = match.group('octal')
    if octal_digits is not None:
        # The pattern only admits digits 0-7, so int() cannot fail here.
        return chr(int(octal_digits, base=8))

    escape = match.group(0)
    if escape == '\\"':
        return '"'
    if escape == '\\\\':
        return '\\'
    raise RuntimeError(f"unexpected match: {escape!r}")


def parse_toc_string(str_within_quotes: str) -> str:
    """
    Given a quoted string obtained from a TOC file using
    _CDTEXT_CANDIDATE_RE, compute the unescaped string contained inside.

    Three escapes are decoded: a backslash followed by three octal digits
    (replaced by the character with that code point), an escaped
    double-quote and an escaped backslash.

    Backslash substitutions fail gracefully (unrecognised or incomplete
    escapes are returned unchanged), which is important since cdrdao
    string encoding has been found to be flawed as recently as cdrdao 1.2.5
    (2023):
    https://github.com/cdrdao/cdrdao/issues/32
    https://github.com/whipper-team/whipper/issues/169

    This function assumes cdrdao 1.2.5+ (2023) was used, which unless
    --no-utf8 is passed, provides UTF-8 strings. It also works with older
    versions as long as the encoding was ASCII or Latin-1.

    Note: CD-Text in MS-JIS produced by cdrdao <1.2.5 will produce mojibake
    (garbled characters), just like the older code this function replaced.

    :param str_within_quotes: the text between the double quotes, with the
                              quotes themselves already removed.
    :returns: the string with the recognised escapes decoded.
    """
    return _STRING_SUBSTITUTIONS_RE.sub(_string_contents_repl,
                                        str_within_quotes)
# header
_CATALOG_RE = re.compile(r'^CATALOG "(?P<catalog>\d+)"$')
@@ -208,11 +262,7 @@ class TocFile:
m = _CDTEXT_CANDIDATE_RE.search(line)
if m:
key = m.group('key')
value = m.group('value')
# usually, value is encoded with octal escapes and in latin-1
# FIXME: other encodings are possible, does cdrdao handle
# them ?
value = value.encode().decode('unicode_escape')
value = parse_toc_string(m.group('value'))
if key in table.CDTEXT_FIELDS:
# FIXME: consider ISRC separate for now, but this
# is a limitation of our parser approach

45
whipper/test/diorama.cue Normal file
View File

@@ -0,0 +1,45 @@
REM DISCID 700AC908
REM COMMENT "whipper 0.10.1.dev27+ga4b9742.d20240827"
PERFORMER "MØL"
TITLE "Diorama"
FILE "data.wav" WAVE
TRACK 01 AUDIO
PERFORMER "MØL"
TITLE "Fraktur"
ISRC DED832100085
INDEX 01 00:00:00
TRACK 02 AUDIO
PERFORMER "MØL"
TITLE "Photophobic"
ISRC DED832100086
INDEX 01 04:19:00
TRACK 03 AUDIO
PERFORMER "MØL"
TITLE "Serf"
ISRC DED832100087
INDEX 01 09:37:00
TRACK 04 AUDIO
PERFORMER "MØL"
TITLE "Vestige"
ISRC DED832100088
INDEX 01 14:59:12
TRACK 05 AUDIO
PERFORMER "MØL"
TITLE "Redacted"
ISRC DED832100089
INDEX 01 20:37:68
TRACK 06 AUDIO
PERFORMER "MØL"
TITLE "Itinerari"
ISRC DED832100090
INDEX 01 25:54:18
TRACK 07 AUDIO
PERFORMER "MØL"
TITLE "Tvesind"
ISRC DED832100091
INDEX 01 30:57:58
TRACK 08 AUDIO
PERFORMER "MØL"
TITLE "Diorama"
ISRC DED832100092
INDEX 01 38:46:57

View File

@@ -0,0 +1,134 @@
CD_DA
CD_TEXT {
LANGUAGE_MAP {
0: 9
}
LANGUAGE 0 {
TITLE "Diorama"
PERFORMER "M\330L"
SIZE_INFO { 0, 1, 8, 0, 7, 3, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 3, 12, 0, 0, 0,
0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0}
}
}
// Track 1
TRACK AUDIO
NO COPY
NO PRE_EMPHASIS
TWO_CHANNEL_AUDIO
ISRC "DED832100085"
CD_TEXT {
LANGUAGE 0 {
TITLE "Fraktur"
PERFORMER "M\330L"
}
}
FILE "data.wav" 0 04:19:00
// Track 2
TRACK AUDIO
NO COPY
NO PRE_EMPHASIS
TWO_CHANNEL_AUDIO
ISRC "DED832100086"
CD_TEXT {
LANGUAGE 0 {
TITLE "Photophobic"
PERFORMER "M\330L"
}
}
FILE "data.wav" 04:19:00 05:18:00
// Track 3
TRACK AUDIO
NO COPY
NO PRE_EMPHASIS
TWO_CHANNEL_AUDIO
ISRC "DED832100087"
CD_TEXT {
LANGUAGE 0 {
TITLE "Serf"
PERFORMER "M\330L"
}
}
FILE "data.wav" 09:37:00 05:22:12
// Track 4
TRACK AUDIO
NO COPY
NO PRE_EMPHASIS
TWO_CHANNEL_AUDIO
ISRC "DED832100088"
CD_TEXT {
LANGUAGE 0 {
TITLE "Vestige"
PERFORMER "M\330L"
}
}
FILE "data.wav" 14:59:12 05:38:56
// Track 5
TRACK AUDIO
NO COPY
NO PRE_EMPHASIS
TWO_CHANNEL_AUDIO
ISRC "DED832100089"
CD_TEXT {
LANGUAGE 0 {
TITLE "Redacted"
PERFORMER "M\330L"
}
}
FILE "data.wav" 20:37:68 05:16:25
// Track 6
TRACK AUDIO
NO COPY
NO PRE_EMPHASIS
TWO_CHANNEL_AUDIO
ISRC "DED832100090"
CD_TEXT {
LANGUAGE 0 {
TITLE "Itinerari"
PERFORMER "M\330L"
}
}
FILE "data.wav" 25:54:18 05:03:40
// Track 7
TRACK AUDIO
NO COPY
NO PRE_EMPHASIS
TWO_CHANNEL_AUDIO
ISRC "DED832100091"
CD_TEXT {
LANGUAGE 0 {
TITLE "Tvesind"
PERFORMER "M\330L"
}
}
FILE "data.wav" 30:57:58 07:48:74
// Track 8
TRACK AUDIO
NO COPY
NO PRE_EMPHASIS
TWO_CHANNEL_AUDIO
ISRC "DED832100092"
CD_TEXT {
LANGUAGE 0 {
TITLE "Diorama"
PERFORMER "M\330L"
}
}
FILE "data.wav" 38:46:57 07:14:36

View File

@@ -0,0 +1,134 @@
CD_DA
CD_TEXT {
LANGUAGE_MAP {
0: 9
}
LANGUAGE 0 {
TITLE "Diorama"
PERFORMER "MØL"
SIZE_INFO { 0, 1, 8, 0, 7, 3, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 3, 12, 0, 0, 0,
0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0}
}
}
// Track 1
TRACK AUDIO
NO COPY
NO PRE_EMPHASIS
TWO_CHANNEL_AUDIO
ISRC "DED832100085"
CD_TEXT {
LANGUAGE 0 {
TITLE "Fraktur"
PERFORMER "MØL"
}
}
FILE "data.wav" 0 04:19:00
// Track 2
TRACK AUDIO
NO COPY
NO PRE_EMPHASIS
TWO_CHANNEL_AUDIO
ISRC "DED832100086"
CD_TEXT {
LANGUAGE 0 {
TITLE "Photophobic"
PERFORMER "MØL"
}
}
FILE "data.wav" 04:19:00 05:18:00
// Track 3
TRACK AUDIO
NO COPY
NO PRE_EMPHASIS
TWO_CHANNEL_AUDIO
ISRC "DED832100087"
CD_TEXT {
LANGUAGE 0 {
TITLE "Serf"
PERFORMER "MØL"
}
}
FILE "data.wav" 09:37:00 05:22:12
// Track 4
TRACK AUDIO
NO COPY
NO PRE_EMPHASIS
TWO_CHANNEL_AUDIO
ISRC "DED832100088"
CD_TEXT {
LANGUAGE 0 {
TITLE "Vestige"
PERFORMER "MØL"
}
}
FILE "data.wav" 14:59:12 05:38:56
// Track 5
TRACK AUDIO
NO COPY
NO PRE_EMPHASIS
TWO_CHANNEL_AUDIO
ISRC "DED832100089"
CD_TEXT {
LANGUAGE 0 {
TITLE "Redacted"
PERFORMER "MØL"
}
}
FILE "data.wav" 20:37:68 05:16:25
// Track 6
TRACK AUDIO
NO COPY
NO PRE_EMPHASIS
TWO_CHANNEL_AUDIO
ISRC "DED832100090"
CD_TEXT {
LANGUAGE 0 {
TITLE "Itinerari"
PERFORMER "MØL"
}
}
FILE "data.wav" 25:54:18 05:03:40
// Track 7
TRACK AUDIO
NO COPY
NO PRE_EMPHASIS
TWO_CHANNEL_AUDIO
ISRC "DED832100091"
CD_TEXT {
LANGUAGE 0 {
TITLE "Tvesind"
PERFORMER "MØL"
}
}
FILE "data.wav" 30:57:58 07:48:74
// Track 8
TRACK AUDIO
NO COPY
NO PRE_EMPHASIS
TWO_CHANNEL_AUDIO
ISRC "DED832100092"
CD_TEXT {
LANGUAGE 0 {
TITLE "Diorama"
PERFORMER "MØL"
}
}
FILE "data.wav" 38:46:57 07:14:36

View File

@@ -5,6 +5,7 @@ import os
import copy
import shutil
import tempfile
from abc import abstractmethod
from whipper.image import toc
@@ -194,6 +195,72 @@ class BreedersTestCase(common.TestCase):
ref = self.readCue('breeders.cue')
self.assertEqual(cue, ref)
class DioramaTOCMixin:
    """
    MØL - Diorama contains CD-Text.

    Two .toc files are provided:
    - diorama_utf8.toc (UTF-8 mode, cdrdao 1.2.5)
    - diorama_noutf8.toc (--no-utf8 mode, cdrdao 1.2.5, but representative of
      any older version of cdrdao)

    Regardless of the version chosen for the toc file, all the same tests
    should pass, including generating the same .cue file as output.
    """
    # TODO figure out how to make this class abstract

    @property
    @abstractmethod
    def tocFileName(self) -> str:
        """File name of the .toc fixture, next to this test module."""
        raise NotImplementedError

    def setUp(self):
        # Parse the fixture chosen by the concrete subclass once per test.
        self.path = os.path.join(os.path.dirname(__file__), self.tocFileName)
        self.toc = toc.TocFile(self.path)
        self.toc.parse()
        self.assertEqual(len(self.toc.table.tracks), 8)

    def testCDText(self):
        # Disc-level CD-Text.
        cdt = self.toc.table.cdtext
        self.assertEqual(cdt['PERFORMER'], 'MØL')
        self.assertEqual(cdt['TITLE'], 'Diorama')

        # CD-Text of the first track.
        t = self.toc.table.tracks[0]
        cdt = t.cdtext
        self.assertEqual(cdt['PERFORMER'], 'MØL')
        self.assertEqual(cdt['TITLE'], 'Fraktur')

    def testConvertCue(self):
        # The generated cue sheet must equal the reference one whichever
        # .toc variant was parsed. No scratch file is written: the
        # comparison below is the whole check.
        self.assertTrue(self.toc.table.hasTOC())
        cue = self.toc.table.cue()
        ref = self.readCue('diorama.cue')
        self.maxDiff = None  # show the full diff on mismatch.
        self.assertEqual(cue, ref)
class CDTextLatin1TOCTestCase(common.TestCase, common.UnicodeTestMixin,
                              DioramaTOCMixin):
    """Run the Diorama checks against the --no-utf8 TOC fixture."""

    def setUp(self):
        # Call the mixin's setUp explicitly: the mixin is listed last among
        # the bases, so a plain lookup would not reach it first.
        DioramaTOCMixin.setUp(self)

    @property
    def tocFileName(self) -> str:
        return 'diorama_noutf8.toc'
class CDTextUTF8TOCTestCase(common.TestCase, common.UnicodeTestMixin,
                            DioramaTOCMixin):
    """Run the Diorama checks against the UTF-8 TOC fixture."""

    def setUp(self):
        # Call the mixin's setUp explicitly: the mixin is listed last among
        # the bases, so a plain lookup would not reach it first.
        DioramaTOCMixin.setUp(self)

    @property
    def tocFileName(self) -> str:
        return 'diorama_utf8.toc'
# Ladyhawke has a data track

View File

@@ -0,0 +1,36 @@
from whipper.image.toc import _CDTEXT_CANDIDATE_RE, parse_toc_string
from whipper.test import common
class TestTOCStringParsing(common.TestCase):
    """Check CD-TEXT string matching and unescaping for TOC files."""

    def check_string(self, str_with_quotes: str, str_parsed_expected: str):
        # Build a CD-TEXT line, require the candidate pattern to consume all
        # of it, then compare the decoded value with the expectation.
        text = f"PERFORMER {str_with_quotes}"
        match = _CDTEXT_CANDIDATE_RE.match(text)
        if not match:
            self.fail(f"String wasn't matched: {text}")
        # assertEqual, not the assertEquals alias: the alias was deprecated
        # and is no longer available in Python 3.12.
        self.assertEqual(match.start(), 0)
        self.assertEqual(match.end(), len(text))
        str_parsed_actual = parse_toc_string(match.group("value"))
        self.assertEqual(str_parsed_actual, str_parsed_expected)

    def test_simple(self):
        self.check_string('"foo bar"', 'foo bar')

    def test_escaped_quotes(self):
        self.check_string(r'"the \"foos\""', r'the "foos"')

    def test_escaped_backslash(self):
        self.check_string(r'"foo\\bar"', r'foo\bar')

    def test_escaped_latin1(self):
        self.check_string(r'"M\330L"', r'MØL')

    def test_incomplete_escape(self):
        self.check_string(r'"M\33a"', r'M\33a')

    def test_trailing_backslash(self):
        self.check_string(r'"foo\\"', 'foo\\')

    def test_unicode(self):
        self.check_string(r'"MØL"', 'MØL')