Merge pull request #633 from ntrrgc/toc-string-parsing
New TOC CD-TEXT string decoding
This commit is contained in:
@@ -33,7 +33,61 @@ import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# shared
|
||||
_CDTEXT_CANDIDATE_RE = re.compile(r'(?P<key>\w+) "(?P<value>.+)"')
|
||||
_CDTEXT_CANDIDATE_RE = re.compile(r'''
|
||||
(?P<key>\w+) # CD-TEXT key.
|
||||
\s+
|
||||
"(?P<value> # CD-TEXT value.
|
||||
(?:
|
||||
\\\\ # escaped backslash.
|
||||
| \\" # escaped double-quote.
|
||||
| [^"] # not a double-quote.
|
||||
)+? # the value must not be empty.
|
||||
)"
|
||||
''', flags=re.VERBOSE)
|
||||
|
||||
_STRING_SUBSTITUTIONS_RE = re.compile(r'''
|
||||
\\(?P<octal>[0-8][0-8][0-8])
|
||||
| \\"
|
||||
| \\\\
|
||||
''', flags=re.VERBOSE)
|
||||
|
||||
|
||||
def _string_contents_repl(match: 're.Match[str]') -> str:
|
||||
group_octal = match.group('octal')
|
||||
if group_octal is not None:
|
||||
code_point = int(group_octal, base=8)
|
||||
return chr(code_point)
|
||||
|
||||
entire_match = match.group(0)
|
||||
if entire_match == '\\"':
|
||||
return '"'
|
||||
elif entire_match == '\\\\':
|
||||
return '\\'
|
||||
else:
|
||||
raise RuntimeError("unexpected match: ", entire_match)
|
||||
|
||||
|
||||
def parse_toc_string(str_within_quotes: str) -> str:
    """
    Unescape the contents of a quoted string read from a TOC file.

    The argument is the text found between the double quotes, as captured
    by the 'value' group of _CDTEXT_CANDIDATE_RE. Every escape sequence
    recognised by _STRING_SUBSTITUTIONS_RE is replaced by the character it
    stands for; all other text is copied through untouched.

    Backslash substitutions fail gracefully, which matters because cdrdao
    string encoding has been found to be flawed as recently as
    cdrdao 1.2.5 (2023):
    https://github.com/cdrdao/cdrdao/issues/32
    https://github.com/whipper-team/whipper/issues/169

    This function assumes cdrdao 1.2.5+ (2023) was used, which provides
    UTF-8 strings unless --no-utf8 is passed. It also works with older
    versions as long as the encoding was ASCII or Latin-1.

    Note: CD-Text in MS-JIS produced by cdrdao <1.2.5 will produce mojibake
    (garbled characters), just like the older code this function replaced.
    """
    unescaped = _STRING_SUBSTITUTIONS_RE.sub(
        _string_contents_repl, str_within_quotes)
    return unescaped
|
||||
|
||||
|
||||
# header
|
||||
_CATALOG_RE = re.compile(r'^CATALOG "(?P<catalog>\d+)"$')
|
||||
@@ -208,11 +262,7 @@ class TocFile:
|
||||
m = _CDTEXT_CANDIDATE_RE.search(line)
|
||||
if m:
|
||||
key = m.group('key')
|
||||
value = m.group('value')
|
||||
# usually, value is encoded with octal escapes and in latin-1
|
||||
# FIXME: other encodings are possible, does cdrdao handle
|
||||
# them ?
|
||||
value = value.encode().decode('unicode_escape')
|
||||
value = parse_toc_string(m.group('value'))
|
||||
if key in table.CDTEXT_FIELDS:
|
||||
# FIXME: consider ISRC separate for now, but this
|
||||
# is a limitation of our parser approach
|
||||
|
||||
45
whipper/test/diorama.cue
Normal file
45
whipper/test/diorama.cue
Normal file
@@ -0,0 +1,45 @@
|
||||
REM DISCID 700AC908
|
||||
REM COMMENT "whipper 0.10.1.dev27+ga4b9742.d20240827"
|
||||
PERFORMER "MØL"
|
||||
TITLE "Diorama"
|
||||
FILE "data.wav" WAVE
|
||||
TRACK 01 AUDIO
|
||||
PERFORMER "MØL"
|
||||
TITLE "Fraktur"
|
||||
ISRC DED832100085
|
||||
INDEX 01 00:00:00
|
||||
TRACK 02 AUDIO
|
||||
PERFORMER "MØL"
|
||||
TITLE "Photophobic"
|
||||
ISRC DED832100086
|
||||
INDEX 01 04:19:00
|
||||
TRACK 03 AUDIO
|
||||
PERFORMER "MØL"
|
||||
TITLE "Serf"
|
||||
ISRC DED832100087
|
||||
INDEX 01 09:37:00
|
||||
TRACK 04 AUDIO
|
||||
PERFORMER "MØL"
|
||||
TITLE "Vestige"
|
||||
ISRC DED832100088
|
||||
INDEX 01 14:59:12
|
||||
TRACK 05 AUDIO
|
||||
PERFORMER "MØL"
|
||||
TITLE "Redacted"
|
||||
ISRC DED832100089
|
||||
INDEX 01 20:37:68
|
||||
TRACK 06 AUDIO
|
||||
PERFORMER "MØL"
|
||||
TITLE "Itinerari"
|
||||
ISRC DED832100090
|
||||
INDEX 01 25:54:18
|
||||
TRACK 07 AUDIO
|
||||
PERFORMER "MØL"
|
||||
TITLE "Tvesind"
|
||||
ISRC DED832100091
|
||||
INDEX 01 30:57:58
|
||||
TRACK 08 AUDIO
|
||||
PERFORMER "MØL"
|
||||
TITLE "Diorama"
|
||||
ISRC DED832100092
|
||||
INDEX 01 38:46:57
|
||||
134
whipper/test/diorama_noutf8.toc
Normal file
134
whipper/test/diorama_noutf8.toc
Normal file
@@ -0,0 +1,134 @@
|
||||
CD_DA
|
||||
|
||||
CD_TEXT {
|
||||
LANGUAGE_MAP {
|
||||
0: 9
|
||||
}
|
||||
LANGUAGE 0 {
|
||||
TITLE "Diorama"
|
||||
PERFORMER "M\330L"
|
||||
SIZE_INFO { 0, 1, 8, 0, 7, 3, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 3, 12, 0, 0, 0,
|
||||
0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0}
|
||||
}
|
||||
}
|
||||
|
||||
// Track 1
|
||||
TRACK AUDIO
|
||||
NO COPY
|
||||
NO PRE_EMPHASIS
|
||||
TWO_CHANNEL_AUDIO
|
||||
ISRC "DED832100085"
|
||||
CD_TEXT {
|
||||
LANGUAGE 0 {
|
||||
TITLE "Fraktur"
|
||||
PERFORMER "M\330L"
|
||||
}
|
||||
}
|
||||
FILE "data.wav" 0 04:19:00
|
||||
|
||||
|
||||
// Track 2
|
||||
TRACK AUDIO
|
||||
NO COPY
|
||||
NO PRE_EMPHASIS
|
||||
TWO_CHANNEL_AUDIO
|
||||
ISRC "DED832100086"
|
||||
CD_TEXT {
|
||||
LANGUAGE 0 {
|
||||
TITLE "Photophobic"
|
||||
PERFORMER "M\330L"
|
||||
}
|
||||
}
|
||||
FILE "data.wav" 04:19:00 05:18:00
|
||||
|
||||
|
||||
// Track 3
|
||||
TRACK AUDIO
|
||||
NO COPY
|
||||
NO PRE_EMPHASIS
|
||||
TWO_CHANNEL_AUDIO
|
||||
ISRC "DED832100087"
|
||||
CD_TEXT {
|
||||
LANGUAGE 0 {
|
||||
TITLE "Serf"
|
||||
PERFORMER "M\330L"
|
||||
}
|
||||
}
|
||||
FILE "data.wav" 09:37:00 05:22:12
|
||||
|
||||
|
||||
// Track 4
|
||||
TRACK AUDIO
|
||||
NO COPY
|
||||
NO PRE_EMPHASIS
|
||||
TWO_CHANNEL_AUDIO
|
||||
ISRC "DED832100088"
|
||||
CD_TEXT {
|
||||
LANGUAGE 0 {
|
||||
TITLE "Vestige"
|
||||
PERFORMER "M\330L"
|
||||
}
|
||||
}
|
||||
FILE "data.wav" 14:59:12 05:38:56
|
||||
|
||||
|
||||
// Track 5
|
||||
TRACK AUDIO
|
||||
NO COPY
|
||||
NO PRE_EMPHASIS
|
||||
TWO_CHANNEL_AUDIO
|
||||
ISRC "DED832100089"
|
||||
CD_TEXT {
|
||||
LANGUAGE 0 {
|
||||
TITLE "Redacted"
|
||||
PERFORMER "M\330L"
|
||||
}
|
||||
}
|
||||
FILE "data.wav" 20:37:68 05:16:25
|
||||
|
||||
|
||||
// Track 6
|
||||
TRACK AUDIO
|
||||
NO COPY
|
||||
NO PRE_EMPHASIS
|
||||
TWO_CHANNEL_AUDIO
|
||||
ISRC "DED832100090"
|
||||
CD_TEXT {
|
||||
LANGUAGE 0 {
|
||||
TITLE "Itinerari"
|
||||
PERFORMER "M\330L"
|
||||
}
|
||||
}
|
||||
FILE "data.wav" 25:54:18 05:03:40
|
||||
|
||||
|
||||
// Track 7
|
||||
TRACK AUDIO
|
||||
NO COPY
|
||||
NO PRE_EMPHASIS
|
||||
TWO_CHANNEL_AUDIO
|
||||
ISRC "DED832100091"
|
||||
CD_TEXT {
|
||||
LANGUAGE 0 {
|
||||
TITLE "Tvesind"
|
||||
PERFORMER "M\330L"
|
||||
}
|
||||
}
|
||||
FILE "data.wav" 30:57:58 07:48:74
|
||||
|
||||
|
||||
// Track 10
|
||||
TRACK AUDIO
|
||||
NO COPY
|
||||
NO PRE_EMPHASIS
|
||||
TWO_CHANNEL_AUDIO
|
||||
ISRC "DED832100092"
|
||||
CD_TEXT {
|
||||
LANGUAGE 0 {
|
||||
TITLE "Diorama"
|
||||
PERFORMER "M\330L"
|
||||
}
|
||||
}
|
||||
FILE "data.wav" 38:46:57 07:14:36
|
||||
|
||||
134
whipper/test/diorama_utf8.toc
Normal file
134
whipper/test/diorama_utf8.toc
Normal file
@@ -0,0 +1,134 @@
|
||||
CD_DA
|
||||
|
||||
CD_TEXT {
|
||||
LANGUAGE_MAP {
|
||||
0: 9
|
||||
}
|
||||
LANGUAGE 0 {
|
||||
TITLE "Diorama"
|
||||
PERFORMER "MØL"
|
||||
SIZE_INFO { 0, 1, 8, 0, 7, 3, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 3, 12, 0, 0, 0,
|
||||
0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0}
|
||||
}
|
||||
}
|
||||
|
||||
// Track 1
|
||||
TRACK AUDIO
|
||||
NO COPY
|
||||
NO PRE_EMPHASIS
|
||||
TWO_CHANNEL_AUDIO
|
||||
ISRC "DED832100085"
|
||||
CD_TEXT {
|
||||
LANGUAGE 0 {
|
||||
TITLE "Fraktur"
|
||||
PERFORMER "MØL"
|
||||
}
|
||||
}
|
||||
FILE "data.wav" 0 04:19:00
|
||||
|
||||
|
||||
// Track 2
|
||||
TRACK AUDIO
|
||||
NO COPY
|
||||
NO PRE_EMPHASIS
|
||||
TWO_CHANNEL_AUDIO
|
||||
ISRC "DED832100086"
|
||||
CD_TEXT {
|
||||
LANGUAGE 0 {
|
||||
TITLE "Photophobic"
|
||||
PERFORMER "MØL"
|
||||
}
|
||||
}
|
||||
FILE "data.wav" 04:19:00 05:18:00
|
||||
|
||||
|
||||
// Track 3
|
||||
TRACK AUDIO
|
||||
NO COPY
|
||||
NO PRE_EMPHASIS
|
||||
TWO_CHANNEL_AUDIO
|
||||
ISRC "DED832100087"
|
||||
CD_TEXT {
|
||||
LANGUAGE 0 {
|
||||
TITLE "Serf"
|
||||
PERFORMER "MØL"
|
||||
}
|
||||
}
|
||||
FILE "data.wav" 09:37:00 05:22:12
|
||||
|
||||
|
||||
// Track 4
|
||||
TRACK AUDIO
|
||||
NO COPY
|
||||
NO PRE_EMPHASIS
|
||||
TWO_CHANNEL_AUDIO
|
||||
ISRC "DED832100088"
|
||||
CD_TEXT {
|
||||
LANGUAGE 0 {
|
||||
TITLE "Vestige"
|
||||
PERFORMER "MØL"
|
||||
}
|
||||
}
|
||||
FILE "data.wav" 14:59:12 05:38:56
|
||||
|
||||
|
||||
// Track 5
|
||||
TRACK AUDIO
|
||||
NO COPY
|
||||
NO PRE_EMPHASIS
|
||||
TWO_CHANNEL_AUDIO
|
||||
ISRC "DED832100089"
|
||||
CD_TEXT {
|
||||
LANGUAGE 0 {
|
||||
TITLE "Redacted"
|
||||
PERFORMER "MØL"
|
||||
}
|
||||
}
|
||||
FILE "data.wav" 20:37:68 05:16:25
|
||||
|
||||
|
||||
// Track 6
|
||||
TRACK AUDIO
|
||||
NO COPY
|
||||
NO PRE_EMPHASIS
|
||||
TWO_CHANNEL_AUDIO
|
||||
ISRC "DED832100090"
|
||||
CD_TEXT {
|
||||
LANGUAGE 0 {
|
||||
TITLE "Itinerari"
|
||||
PERFORMER "MØL"
|
||||
}
|
||||
}
|
||||
FILE "data.wav" 25:54:18 05:03:40
|
||||
|
||||
|
||||
// Track 7
|
||||
TRACK AUDIO
|
||||
NO COPY
|
||||
NO PRE_EMPHASIS
|
||||
TWO_CHANNEL_AUDIO
|
||||
ISRC "DED832100091"
|
||||
CD_TEXT {
|
||||
LANGUAGE 0 {
|
||||
TITLE "Tvesind"
|
||||
PERFORMER "MØL"
|
||||
}
|
||||
}
|
||||
FILE "data.wav" 30:57:58 07:48:74
|
||||
|
||||
|
||||
// Track 8
|
||||
TRACK AUDIO
|
||||
NO COPY
|
||||
NO PRE_EMPHASIS
|
||||
TWO_CHANNEL_AUDIO
|
||||
ISRC "DED832100092"
|
||||
CD_TEXT {
|
||||
LANGUAGE 0 {
|
||||
TITLE "Diorama"
|
||||
PERFORMER "MØL"
|
||||
}
|
||||
}
|
||||
FILE "data.wav" 38:46:57 07:14:36
|
||||
|
||||
@@ -5,6 +5,7 @@ import os
|
||||
import copy
|
||||
import shutil
|
||||
import tempfile
|
||||
from abc import abstractmethod
|
||||
|
||||
from whipper.image import toc
|
||||
|
||||
@@ -194,6 +195,72 @@ class BreedersTestCase(common.TestCase):
|
||||
ref = self.readCue('breeders.cue')
|
||||
self.assertEqual(cue, ref)
|
||||
|
||||
|
||||
class DioramaTOCMixin:
    """
    MØL - Diorama contains CD-Text.

    Two .toc files are provided:
    - diorama_utf8.toc (UTF-8 mode, cdrdao 1.2.5)
    - diorama_noutf8.toc (--no-utf8 mode, cdrdao 1.2.5, but representative
      of any older version of cdrdao)

    Regardless of the version chosen for the toc file, all the same tests
    should pass, including generating the same .cue file as output.

    Concrete test cases supply the file through the tocFileName property.
    """
    # TODO figure out how to make this class abstract

    @property
    @abstractmethod
    def tocFileName(self) -> str:
        """Name of the .toc fixture, relative to this test directory."""
        raise NotImplementedError

    def setUp(self):
        # Parse the fixture once per test; every Diorama variant has 8 tracks.
        self.path = os.path.join(os.path.dirname(__file__), self.tocFileName)
        self.toc = toc.TocFile(self.path)
        self.toc.parse()
        self.assertEqual(len(self.toc.table.tracks), 8)

    def testCDText(self):
        # Disc-level CD-Text.
        cdt = self.toc.table.cdtext
        self.assertEqual(cdt['PERFORMER'], 'MØL')
        self.assertEqual(cdt['TITLE'], 'Diorama')

        # CD-Text of the first track.
        t = self.toc.table.tracks[0]
        cdt = t.cdtext
        self.assertEqual(cdt['PERFORMER'], 'MØL')
        self.assertEqual(cdt['TITLE'], 'Fraktur')

    def testConvertCue(self):
        # The generated cue sheet must equal the reference cue, whichever
        # toc variant was parsed. Nothing is written to disk.
        self.assertTrue(self.toc.table.hasTOC())
        cue = self.toc.table.cue()
        ref = self.readCue('diorama.cue')
        self.maxDiff = None
        self.assertEqual(cue, ref)
|
||||
|
||||
|
||||
class CDTextLatin1TOCTestCase(common.TestCase, common.UnicodeTestMixin,
                              DioramaTOCMixin):
    """Run the DioramaTOCMixin tests against diorama_noutf8.toc."""

    @property
    def tocFileName(self) -> str:
        return 'diorama_noutf8.toc'

    def setUp(self):
        # Invoke the mixin's setUp explicitly rather than through the MRO.
        # NOTE(review): presumably needed because the mixin is listed last
        # among the bases -- confirm against common.TestCase.
        DioramaTOCMixin.setUp(self)
|
||||
|
||||
|
||||
class CDTextUTF8TOCTestCase(common.TestCase, common.UnicodeTestMixin,
                            DioramaTOCMixin):
    """Run the DioramaTOCMixin tests against diorama_utf8.toc."""

    @property
    def tocFileName(self) -> str:
        return 'diorama_utf8.toc'

    def setUp(self):
        # Invoke the mixin's setUp explicitly rather than through the MRO.
        # NOTE(review): presumably needed because the mixin is listed last
        # among the bases -- confirm against common.TestCase.
        DioramaTOCMixin.setUp(self)
|
||||
|
||||
|
||||
# Ladyhawke has a data track
|
||||
|
||||
|
||||
|
||||
36
whipper/test/toc_cdtext_string_parsing.py
Normal file
36
whipper/test/toc_cdtext_string_parsing.py
Normal file
@@ -0,0 +1,36 @@
|
||||
from whipper.image.toc import _CDTEXT_CANDIDATE_RE, parse_toc_string
|
||||
|
||||
from whipper.test import common
|
||||
|
||||
class TestTOCStringParsing(common.TestCase):
    """Check matching and unescaping of quoted CD-TEXT strings."""

    def check_string(self, str_with_quotes: str, str_parsed_expected: str):
        """
        Assert that a quoted string is matched whole and unescaped as
        expected.

        :param str_with_quotes: the string as written in the TOC file,
                                including the surrounding double quotes.
        :param str_parsed_expected: the expected unescaped contents.
        """
        text = f"PERFORMER {str_with_quotes}"
        match = _CDTEXT_CANDIDATE_RE.match(text)
        if not match:
            self.fail(f"String wasn't matched: {text}")
        # The pattern must consume the entire line, closing quote included.
        self.assertEqual(match.start(), 0)
        self.assertEqual(match.end(), len(text))

        str_parsed_actual = parse_toc_string(match.group("value"))
        self.assertEqual(str_parsed_actual, str_parsed_expected)

    def test_simple(self):
        self.check_string('"foo bar"', 'foo bar')

    def test_escaped_quotes(self):
        self.check_string(r'"the \"foos\""', r'the "foos"')

    def test_escaped_backslash(self):
        self.check_string(r'"foo\\bar"', r'foo\bar')

    def test_escaped_latin1(self):
        self.check_string(r'"M\330L"', r'MØL')

    def test_incomplete_escape(self):
        # Fewer than three digits: the sequence is kept as written.
        self.check_string(r'"M\33a"', r'M\33a')

    def test_trailing_backslash(self):
        self.check_string(r'"foo\\"', 'foo\\')

    def test_unicode(self):
        self.check_string(r'"MØL"', 'MØL')
|
||||
Reference in New Issue
Block a user