This patch replaces the previous broken approach to TOC string decoding
that used `.encode().decode('unicode_escape')` with proper parsing of
the escape sequences cdrdao is known to generate.
The new parser is also lenient with invalid escape sequences, that can
occur due to improper escaping in cdrdao. See:
https://github.com/cdrdao/cdrdao/issues/32
Latin-1:
This new parsing method should work for Latin-1 strings for both old and
new versions of cdrdao, as long as those strings don't trigger the
improper escaping issues in upstream cdrdao.
This has been verified with the album Diorama from the Danish black
metal band MØL.
MS-JIS:
This new parsing method should also work for MS-JIS strings as long as
the .toc file was generated by cdrdao 1.2.5+ and the strings don't
trigger improper escaping issues in upstream cdrdao.
Unfortunately, I don't have any CD with CD-Text in MS-JIS, so I could
not verify this.
cdrdao versions before 1.2.5 will still cause whipper to produce
mojibake (garbled characters) when reading MS-JIS CD-Text, as those
versions do not encode strings in UTF-8.
Other encodings:
As far as I know, CD-Text only supports officially ASCII, Latin-1 and
MS-JIS, but I wouldn't be surprised if there are unofficial encodings
out there, given the strange strings I've seen in some bug reports.
If you have a CD with garbled CD-Text, please submit a bug report
indicating the performer, album name, language and attach the .toc file
so that the produced strings can be compared to the expected text.
Fixes https://github.com/whipper-team/whipper/issues/169
Signed-off-by: Alicia Boya García <ntrrgc@gmail.com>
508 lines
18 KiB
Python
508 lines
18 KiB
Python
# -*- Mode: Python; test-case-name: whipper.test.test_image_toc -*-
|
|
# vi:si:et:sw=4:sts=4:ts=4
|
|
|
|
# Copyright (C) 2009 Thomas Vander Stichele
|
|
|
|
# This file is part of whipper.
|
|
#
|
|
# whipper is free software: you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License as published by
|
|
# the Free Software Foundation, either version 3 of the License, or
|
|
# (at your option) any later version.
|
|
#
|
|
# whipper is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU General Public License
|
|
# along with whipper. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
"""
|
|
Read .toc files.
|
|
|
|
The .toc file format is described in the man page of cdrdao.
|
|
"""
|
|
|
|
import re
|
|
|
|
from whipper.common import common
|
|
from whipper.image import table
|
|
|
|
import logging
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# shared
|
|
_CDTEXT_CANDIDATE_RE = re.compile(r'''
|
|
(?P<key>\w+) # CD-TEXT key.
|
|
\s+
|
|
"(?P<value> # CD-TEXT value.
|
|
(?:
|
|
\\\\ # escaped backslash.
|
|
| \\" # escaped double-quote.
|
|
| [^"] # not a double-quote.
|
|
)+? # the value must not be empty.
|
|
)"
|
|
''', flags=re.VERBOSE)
|
|
|
|
_STRING_SUBSTITUTIONS_RE = re.compile(r'''
|
|
\\(?P<octal>[0-8][0-8][0-8])
|
|
| \\"
|
|
| \\\\
|
|
''', flags=re.VERBOSE)
|
|
|
|
|
|
def _string_contents_repl(match: 're.Match[str]') -> str:
|
|
group_octal = match.group('octal')
|
|
if group_octal is not None:
|
|
code_point = int(group_octal, base=8)
|
|
return chr(code_point)
|
|
|
|
entire_match = match.group(0)
|
|
if entire_match == '\\"':
|
|
return '"'
|
|
elif entire_match == '\\\\':
|
|
return '\\'
|
|
else:
|
|
raise RuntimeError("unexpected match: ", entire_match)
|
|
|
|
|
|
def parse_toc_string(str_within_quotes: str) -> str:
|
|
"""
|
|
Given the a quoted string obtained from a TOC file using
|
|
_CDTEXT_CANDIDATE_RE, compute the unescaped string contained inside.
|
|
|
|
Backslash substitutions fail gracefully, which is important since cdrdao
|
|
string encoding has been found to be flawed as recently as cdrdao 1.2.5
|
|
(2023):
|
|
https://github.com/cdrdao/cdrdao/issues/32
|
|
https://github.com/whipper-team/whipper/issues/169
|
|
|
|
This function assumes cdrdao 1.2.5+ (2023) was used, which unless --no-utf8
|
|
is passed, provides UTF-8 strings. It also works with older versions as long
|
|
as the encoding was ASCII or Latin-1.
|
|
|
|
Note: CD-Text in MS-JIS produced by cdrdao <1.2.5 will produce mojibake
|
|
(garbled characters), just like the older code this function replaced.
|
|
"""
|
|
return _STRING_SUBSTITUTIONS_RE.sub(_string_contents_repl,
|
|
str_within_quotes)
|
|
|
|
|
|
# header
|
|
_CATALOG_RE = re.compile(r'^CATALOG "(?P<catalog>\d+)"$')
|
|
|
|
# pre emphasis
|
|
_PRE_EMPHASIS_RE = re.compile(r'^PRE_EMPHASIS$')
|
|
|
|
# records
|
|
_TRACK_RE = re.compile(r"""
|
|
^TRACK # TRACK
|
|
\s(?P<mode>.+)$ # mode (AUDIO, MODE2_FORM_MIX, MODEx/2xxx, ...)
|
|
""", re.VERBOSE)
|
|
|
|
_ISRC_RE = re.compile(r'^ISRC "(?P<isrc>\w+)"$')
|
|
|
|
# a HTOA is marked in the cdrdao's TOC as SILENCE
|
|
_SILENCE_RE = re.compile(r"""
|
|
^SILENCE # SILENCE
|
|
\s(?P<length>.*)$ # pre-gap length
|
|
""", re.VERBOSE)
|
|
|
|
# ZERO is used as pre-gap source when switching mode
|
|
_ZERO_RE = re.compile(r"""
|
|
^ZERO # ZERO
|
|
\s(?P<mode>.+) # mode (AUDIO, MODEx/2xxx, ...)
|
|
\s(?P<length>.*)$ # zero length
|
|
""", re.VERBOSE)
|
|
|
|
|
|
_FILE_RE = re.compile(r"""
|
|
^FILE # FILE
|
|
\s+"(?P<name>.*)" # 'file name' in quotes
|
|
\s+(?P<start>.+) # start offset
|
|
\s(?P<length>.+)$ # length in frames of section
|
|
""", re.VERBOSE)
|
|
|
|
_DATAFILE_RE = re.compile(r"""
|
|
^DATAFILE # DATA FILE
|
|
\s+"(?P<name>.*)" # 'file name' in quotes
|
|
\s+(?P<length>\S+) # start offset
|
|
\s*.* # possible // comment
|
|
""", re.VERBOSE)
|
|
|
|
|
|
# FIXME: start can be 0
|
|
_START_RE = re.compile(r"""
|
|
^START # START
|
|
\s(?P<length>.*)$ # pre-gap length
|
|
""", re.VERBOSE)
|
|
|
|
|
|
_INDEX_RE = re.compile(r"""
|
|
^INDEX # INDEX
|
|
\s(?P<offset>.+)$ # start offset
|
|
""", re.VERBOSE)
|
|
|
|
|
|
class Sources:
|
|
"""
|
|
Represent the list of sources used in the .toc file.
|
|
|
|
Each SILENCE and each FILE is a source.
|
|
If the filename for FILE doesn't change, the counter is not increased.
|
|
"""
|
|
|
|
def __init__(self):
|
|
self._sources = []
|
|
|
|
def append(self, counter, offset, source):
|
|
"""
|
|
Append ``(counter, offset, source)`` tuple to the ``sources`` list.
|
|
|
|
:param counter: the source counter; updates for each different
|
|
data source (silence or different file path)
|
|
:type counter: int
|
|
:param offset: the absolute disc offset where this source starts
|
|
:type offset: int
|
|
:param source: data source
|
|
:type source: File or None
|
|
"""
|
|
logger.debug('appending source, counter %d, abs offset %d, '
|
|
'source %r', counter, offset, source)
|
|
self._sources.append((counter, offset, source))
|
|
|
|
def get(self, offset):
|
|
"""Retrieve the source used at the given offset."""
|
|
for i, (_, o, _) in enumerate(self._sources):
|
|
if offset < o:
|
|
return self._sources[i - 1]
|
|
|
|
return self._sources[-1]
|
|
|
|
def getCounterStart(self, counter):
|
|
"""
|
|
Retrieve the absolute offset of the first source for this counter.
|
|
|
|
:param counter: the source counter; updates for each different
|
|
data source (silence or different file path)
|
|
:type counter: int
|
|
"""
|
|
for i, (c, _, _) in enumerate(self._sources):
|
|
if c == counter:
|
|
return self._sources[i][1]
|
|
|
|
return self._sources[-1][1]
|
|
|
|
|
|
class TocFile:
|
|
|
|
def __init__(self, path):
|
|
"""
|
|
Init TocFile.
|
|
|
|
:param path: path to track
|
|
:type path: str
|
|
"""
|
|
assert isinstance(path, str), "%r is not str" % path
|
|
self._path = path
|
|
self._messages = []
|
|
self.table = table.Table()
|
|
self.logName = '<TocFile %08x>' % id(self)
|
|
|
|
self._sources = Sources()
|
|
|
|
def _index(self, currentTrack, i, absoluteOffset, trackOffset):
|
|
absolute = absoluteOffset + trackOffset
|
|
# this may be in a new source, so calculate relative
|
|
c, _, s = self._sources.get(absolute)
|
|
logger.debug('at abs offset %d, we are in source %r',
|
|
absolute, s)
|
|
counterStart = self._sources.getCounterStart(c)
|
|
relative = absolute - counterStart
|
|
|
|
currentTrack.index(i, path=s.path,
|
|
absolute=absolute,
|
|
relative=relative,
|
|
counter=c)
|
|
logger.debug('[track %02d index %02d] trackOffset %r, added %r',
|
|
currentTrack.number, i, trackOffset,
|
|
currentTrack.getIndex(i))
|
|
|
|
def parse(self):
|
|
currentFile = None
|
|
currentTrack = None
|
|
|
|
state = 'HEADER'
|
|
# counts sources for audio data; SILENCE/ZERO/FILE
|
|
counter = 0
|
|
trackNumber = 0
|
|
indexNumber = 0
|
|
# running absolute offset: where each track starts
|
|
absoluteOffset = 0
|
|
# running relative offset, relative to counter src
|
|
relativeOffset = 0
|
|
# currentLength is accrued during TRACK record parsing length
|
|
# of current track as parsed so far reset on each TRACK statement
|
|
currentLength = 0
|
|
# accrued during TRACK record parsing, total disc
|
|
totalLength = 0
|
|
# length of the pre-gap, current track in for loop
|
|
pregapLength = 0
|
|
|
|
# the first track's INDEX 1 can only be gotten from the .toc
|
|
# file once the first pregap is calculated; so we add INDEX 1
|
|
# at the end of each parsed TRACK record
|
|
with open(self._path) as f:
|
|
content = f.readlines()
|
|
for number, line in enumerate(content):
|
|
line = line.rstrip()
|
|
|
|
# look for CDTEXT stuff in either header or tracks
|
|
m = _CDTEXT_CANDIDATE_RE.search(line)
|
|
if m:
|
|
key = m.group('key')
|
|
value = parse_toc_string(m.group('value'))
|
|
if key in table.CDTEXT_FIELDS:
|
|
# FIXME: consider ISRC separate for now, but this
|
|
# is a limitation of our parser approach
|
|
if state == 'HEADER':
|
|
self.table.cdtext[key] = value
|
|
logger.debug('found disc CD-Text %s: %r', key, value)
|
|
elif state == 'TRACK':
|
|
if key != 'ISRC' or not currentTrack \
|
|
or currentTrack.isrc is not None:
|
|
logger.debug('found track CD-Text %s: %r',
|
|
key, value)
|
|
currentTrack.cdtext[key] = value
|
|
|
|
# look for header elements
|
|
m = _CATALOG_RE.search(line)
|
|
if m:
|
|
self.table.catalog = m.group('catalog')
|
|
logger.debug("found catalog number %s", self.table.catalog)
|
|
|
|
# look for TRACK lines
|
|
m = _TRACK_RE.search(line)
|
|
if m:
|
|
state = 'TRACK'
|
|
|
|
# set index 1 of previous track if there was one, using
|
|
# pregapLength if applicable
|
|
if currentTrack:
|
|
self._index(currentTrack, 1, absoluteOffset, pregapLength)
|
|
|
|
# create a new track to be filled by later lines
|
|
trackNumber += 1
|
|
trackMode = m.group('mode')
|
|
audio = trackMode == 'AUDIO'
|
|
currentTrack = table.Track(trackNumber, audio=audio)
|
|
self.table.tracks.append(currentTrack)
|
|
|
|
# update running totals
|
|
absoluteOffset += currentLength
|
|
relativeOffset += currentLength
|
|
totalLength += currentLength
|
|
|
|
# FIXME: track mode
|
|
logger.debug('found track %d, mode %s, at absoluteOffset %d',
|
|
trackNumber, trackMode, absoluteOffset)
|
|
|
|
# reset counters relative to a track
|
|
currentLength = 0
|
|
indexNumber = 1
|
|
pregapLength = 0
|
|
|
|
continue
|
|
|
|
# look for PRE_EMPHASIS lines
|
|
m = _PRE_EMPHASIS_RE.search(line)
|
|
if m:
|
|
currentTrack.pre_emphasis = True
|
|
logger.debug('track has PRE_EMPHASIS')
|
|
|
|
# look for ISRC lines
|
|
m = _ISRC_RE.search(line)
|
|
if m:
|
|
isrc = m.group('isrc')
|
|
currentTrack.isrc = isrc
|
|
logger.debug('found ISRC code %s', isrc)
|
|
|
|
# look for SILENCE lines
|
|
m = _SILENCE_RE.search(line)
|
|
if m:
|
|
length = m.group('length')
|
|
logger.debug('silence of %r', length)
|
|
self._sources.append(counter, absoluteOffset, None)
|
|
if currentFile is not None:
|
|
logger.debug('silence after file, increasing counter')
|
|
counter += 1
|
|
relativeOffset = 0
|
|
currentFile = None
|
|
currentLength += common.msfToFrames(length)
|
|
|
|
# look for ZERO lines
|
|
m = _ZERO_RE.search(line)
|
|
if m:
|
|
if currentFile is not None:
|
|
logger.debug('zero after file, increasing counter')
|
|
counter += 1
|
|
relativeOffset = 0
|
|
currentFile = None
|
|
length = m.group('length')
|
|
currentLength += common.msfToFrames(length)
|
|
|
|
# look for FILE lines
|
|
m = _FILE_RE.search(line)
|
|
if m:
|
|
filePath = m.group('name')
|
|
start = m.group('start')
|
|
length = m.group('length')
|
|
logger.debug('file %s, start %r, length %r',
|
|
filePath, common.msfToFrames(start),
|
|
common.msfToFrames(length))
|
|
if not currentFile or filePath != currentFile.path:
|
|
counter += 1
|
|
relativeOffset = 0
|
|
logger.debug('track %d, switched to new file, '
|
|
'increased counter to %d',
|
|
trackNumber, counter)
|
|
currentFile = File(filePath, common.msfToFrames(start),
|
|
common.msfToFrames(length))
|
|
self._sources.append(counter, absoluteOffset + currentLength,
|
|
currentFile)
|
|
currentLength += common.msfToFrames(length)
|
|
|
|
# look for DATAFILE lines
|
|
m = _DATAFILE_RE.search(line)
|
|
if m:
|
|
filePath = m.group('name')
|
|
length = m.group('length')
|
|
logger.debug('file %s, length %r',
|
|
filePath, common.msfToFrames(length))
|
|
if not currentFile or filePath != currentFile.path:
|
|
counter += 1
|
|
relativeOffset = 0
|
|
logger.debug('track %d, switched to new file, '
|
|
'increased counter to %d',
|
|
trackNumber, counter)
|
|
# FIXME: assume that a MODE2_FORM_MIX track always starts at 0
|
|
currentFile = File(filePath, 0, common.msfToFrames(length))
|
|
self._sources.append(counter, absoluteOffset + currentLength,
|
|
currentFile)
|
|
currentLength += common.msfToFrames(length)
|
|
|
|
# look for START lines
|
|
m = _START_RE.search(line)
|
|
if m:
|
|
if not currentTrack:
|
|
self.message(number, 'START without preceding TRACK')
|
|
print('ouch')
|
|
continue
|
|
|
|
length = common.msfToFrames(m.group('length'))
|
|
c, _, s = self._sources.get(absoluteOffset)
|
|
logger.debug('at abs offset %d, we are in source %r',
|
|
absoluteOffset, s)
|
|
counterStart = self._sources.getCounterStart(c)
|
|
relativeOffset = absoluteOffset - counterStart
|
|
|
|
currentTrack.index(0, path=s and s.path or None,
|
|
absolute=absoluteOffset,
|
|
relative=relativeOffset, counter=c)
|
|
logger.debug('[track %02d index 00] added %r',
|
|
currentTrack.number, currentTrack.getIndex(0))
|
|
# store the pregapLength to add it when we index 1 for this
|
|
# track on the next iteration
|
|
pregapLength = length
|
|
|
|
# look for INDEX lines
|
|
m = _INDEX_RE.search(line)
|
|
if m:
|
|
if not currentTrack:
|
|
self.message(number, 'INDEX without preceding TRACK')
|
|
print('ouch')
|
|
continue
|
|
|
|
indexNumber += 1
|
|
offset = common.msfToFrames(m.group('offset'))
|
|
self._index(currentTrack, indexNumber, absoluteOffset, offset)
|
|
|
|
# handle index 1 of final track, if any
|
|
if currentTrack:
|
|
self._index(currentTrack, 1, absoluteOffset, pregapLength)
|
|
|
|
# totalLength was added up to the penultimate track
|
|
self.table.leadout = totalLength + currentLength
|
|
logger.debug('parse: leadout: %r', self.table.leadout)
|
|
|
|
def message(self, number, message):
|
|
"""
|
|
Add a message about a given line in the cue file.
|
|
|
|
:param number: line number, counting from 0
|
|
:type number: int
|
|
:param message: a text line in the cue sheet
|
|
:type message: str
|
|
"""
|
|
self._messages.append((number + 1, message))
|
|
|
|
def getTrackLength(self, track):
|
|
"""
|
|
Return the length of the given track, in CD frames.
|
|
|
|
The track length is calculated from its INDEX 01 to the next
|
|
track's INDEX 01.
|
|
"""
|
|
# returns track length in frames, or -1 if can't be determined and
|
|
# complete file should be assumed
|
|
# FIXME: this assumes a track can only be in one file; is this true ?
|
|
i = self.table.tracks.index(track)
|
|
if i == len(self.table.tracks) - 1:
|
|
# last track, so no length known
|
|
return -1
|
|
|
|
thisIndex = track.indexes[1] # FIXME: could be more
|
|
nextIndex = self.table.tracks[i + 1].indexes[1] # FIXME: could be 0
|
|
|
|
c = thisIndex.counter
|
|
if c is not None and c == nextIndex.counter:
|
|
# they belong to the same source, so their relative delta is length
|
|
return nextIndex.relative - thisIndex.relative
|
|
|
|
# FIXME: more logic
|
|
return -1
|
|
|
|
def getRealPath(self, path):
|
|
"""
|
|
Translate the .toc's FILE to an existing path.
|
|
|
|
:param path: path to track
|
|
:type path: str
|
|
"""
|
|
return common.getRealPath(self._path, path)
|
|
|
|
|
|
class File:
|
|
"""Represent a FILE line in a .toc file."""
|
|
|
|
def __init__(self, path, start, length):
|
|
"""
|
|
Init File.
|
|
|
|
:param path: path to track
|
|
:type path: unicode
|
|
:param start: starting point for the track in this file, in frames
|
|
:type start: int
|
|
:param length: length for the track in this file, in frames
|
|
:type length: int
|
|
"""
|
|
assert isinstance(path, str), "%r is not str" % path
|
|
|
|
self.path = path
|
|
self.start = start
|
|
self.length = length
|
|
|
|
def __repr__(self):
|
|
return '<File %r>' % (self.path, )
|