sointu/tracker/detector.go
2025-04-30 22:00:34 +03:00

424 lines
13 KiB
Go

package tracker
import (
"math"
"github.com/viterin/vek/vek32"
"github.com/vsariola/sointu"
)
type (
Detector struct {
broker *Broker
loudnessDetector loudnessDetector
peakDetector peakDetector
}
WeightingType int
LoudnessType int
PeakType int
Decibel float32
LoudnessResult [NumLoudnessTypes]Decibel
PeakResult [NumPeakTypes][2]Decibel
DetectorResult struct {
Loudness LoudnessResult
Peaks PeakResult
}
loudnessDetector struct {
weighting weighting
states [2][3]biquadState
powers [2]RingBuffer[float32] // 0 = momentary, 1 = short-term
averagedPowers [2][]float32
maxPowers [2]float32
integratedPower float32
tmp, tmp2 []float32
tmpbool []bool
}
biquadState struct {
x1, x2, y1, y2 float32
}
biquadCoeff struct {
b0, b1, b2, a1, a2 float32
}
weighting []biquadCoeff
peakDetector struct {
oversampling bool
states [2]oversamplerState
windows [2][2]RingBuffer[float32]
maxPower [2]float32
tmp, tmp2 []float32
}
oversamplerState struct {
history [11]float32
tmp, tmp2 []float32
}
)
const (
LoudnessMomentary LoudnessType = iota
LoudnessShortTerm
LoudnessMaxMomentary
LoudnessMaxShortTerm
LoudnessIntegrated
NumLoudnessTypes
)
const MAX_INTEGRATED_DATA = 10 * 60 * 60 // 1 hour of samples at 10 Hz (100 ms per sample)
const (
PeakMomentary PeakType = iota
PeakShortTerm
PeakIntegrated
NumPeakTypes
)
const (
KWeighting WeightingType = iota
AWeighting
CWeighting
NoWeighting
NumWeightingTypes
)
func NewDetector(b *Broker) *Detector {
return &Detector{
broker: b,
loudnessDetector: makeLoudnessDetector(KWeighting),
peakDetector: makePeakDetector(true),
}
}
func (s *Detector) Run() {
var chunkHistory sointu.AudioBuffer
for msg := range s.broker.ToDetector {
if msg.Reset {
s.loudnessDetector.reset()
s.peakDetector.reset()
}
if msg.Quit {
return
}
if msg.HasWeightingType {
s.loudnessDetector.weighting = weightings[WeightingType(msg.WeightingType)]
s.loudnessDetector.reset()
}
if msg.HasOversampling {
s.peakDetector.oversampling = msg.Oversampling
s.peakDetector.reset()
}
switch data := msg.Data.(type) {
case *sointu.AudioBuffer:
buf := *data
for {
var chunk sointu.AudioBuffer
if len(chunkHistory) > 0 && len(chunkHistory) < 4410 {
l := min(len(buf), 4410-len(chunkHistory))
chunkHistory = append(chunkHistory, buf[:l]...)
if len(chunkHistory) < 4410 {
break
}
chunk = chunkHistory
buf = buf[l:]
} else {
if len(buf) >= 4410 {
chunk = buf[:4410]
buf = buf[4410:]
} else {
chunkHistory = chunkHistory[:0]
chunkHistory = append(chunkHistory, buf...)
break
}
}
TrySend(s.broker.ToModel, MsgToModel{
HasDetectorResult: true,
DetectorResult: DetectorResult{
Loudness: s.loudnessDetector.update(chunk),
Peaks: s.peakDetector.update(chunk),
},
})
}
s.broker.PutAudioBuffer(data)
case func():
data()
}
}
}
// Close may theoretically block if the broker is full, but it should not happen in practice
func (s *Detector) Close() {
s.broker.ToDetector <- MsgToDetector{Quit: true}
}
func makeLoudnessDetector(weighting WeightingType) loudnessDetector {
return loudnessDetector{
weighting: weightings[weighting],
powers: [2]RingBuffer[float32]{
{Buffer: make([]float32, 4)}, // momentary loudness
{Buffer: make([]float32, 30)}, // short-term loudness
},
}
}
func makePeakDetector(oversampling bool) peakDetector {
return peakDetector{
oversampling: oversampling,
windows: [2][2]RingBuffer[float32]{
{{Buffer: make([]float32, 4)}, {Buffer: make([]float32, 4)}}, // momentary peaks
{{Buffer: make([]float32, 30)}, {Buffer: make([]float32, 30)}}, // short-term peaks
},
}
}
/*
From matlab: (we bake in the scale values to the numerator coefficients)
weightings = {'A-weighting','C-weighting','k-weighting'}
for j = 1:3
disp(weightings{j})
f = getFilter(weightingFilter(weightings{j},'SampleRate',44100)); f.Numerator, f.Denominator, f.ScaleValues
if j == 3 % k-weighting has non-zero gain at 1 kHz, so normalize it to 0 dB by scaling the first filter
[h,w] = freqz(f,[1000,1000],44100);
g = abs(h(1));
fprintf("Gain %f dB\n", 20*log10(abs(h(1))));
f.Numerator(1,:) = f.Numerator(1,:)/g;
end
for i = 1:size(f.Numerator,1); fprintf("b0: %.16f, b1: %.16f, b2: %.16f, a1: %.16f, a2: %.16f\n",f.Numerator(i,:)*f.ScaleValues(i),f.Denominator(i,2:end)); end
end
*/
var weightings = map[WeightingType]weighting{
AWeighting: {
{b0: 0.2556115104436430, b1: 0.5112230208872860, b2: 0.2556115104436430, a1: -0.1405360824207108, a2: 0.0049375976155402},
{b0: 1, b1: -2, b2: 1, a1: -1.8849012174287920, a2: 0.8864214718161675},
{b0: 1, b1: -2, b2: 1, a1: -1.9941388812663283, a2: 0.9941474694445309},
},
CWeighting: {
{b0: 0.2170124955461332, b1: 0.4340249910922664, b2: 0.2170124955461332, a1: -0.1405360824207108, a2: 0.0049375976155402},
{b0: 1, b1: -2, b2: 1, a1: -1.9941388812663283, a2: 0.9941474694445309},
},
KWeighting: {
{b0: 1.4128568659906546, b1: -2.4466647580657646, b2: 1.0789762991286349, a1: -1.6636551132560204, a2: 0.7125954280732254},
{b0: 0.9995600645425144, b1: -1.9991201290850289, b2: 0.9995600645425144, a1: -1.9891696736297957, a2: 0.9891990357870394},
},
NoWeighting: {},
}
// according to https://tech.ebu.ch/docs/tech/tech3341.pdf
// we have two sliding windows: momentary loudness = last 400 ms, short-term loudness = last 3 s
// display:
//
// momentary loudness = last analyzed 400 ms blcok
// short-term loudness = last analyzed 3 s block
//
// every 100 ms, we collect one data point of the momentary loudness (starting to play song again resets the data blocks)
// then:
//
// integrated loudness = the blocks are gated, and the average loudness of the gated blocks is calculated
// maximum momentary loudness = maximum of all the momentary blocks
// maximum short-term loudness = maximum of all the short-term blocks
func (d *loudnessDetector) update(chunk sointu.AudioBuffer) LoudnessResult {
l := max(len(chunk), MAX_INTEGRATED_DATA)
setSliceLength(&d.tmp, l)
setSliceLength(&d.tmp2, l)
setSliceLength(&d.tmpbool, l)
var total float32
for chn := range 2 {
// deinterleave the channels
for i := range chunk {
d.tmp[i] = chunk[i][chn]
}
// filter the signal with the weighting filter
for k := range d.weighting {
d.states[chn][k].Filter(d.tmp[:len(chunk)], d.weighting[k])
}
// square the samples
res := vek32.Mul_Into(d.tmp2, d.tmp[:len(chunk)], d.tmp[:len(chunk)])
// calculate the mean and add it to the total
total += vek32.Mean(res)
}
var ret [NumLoudnessTypes]Decibel
for i := range d.powers {
d.powers[i].WriteWrapSingle(total) // these are sliding windows of 4 and 30 power measurements (400 ms and 3 s aka momentary and short-term windows)
mean := vek32.Mean(d.powers[i].Buffer)
if len(d.averagedPowers[i]) < MAX_INTEGRATED_DATA { // we need to have some limit on how much data we keep
d.averagedPowers[i] = append(d.averagedPowers[i], mean)
}
if d.maxPowers[i] < mean {
d.maxPowers[i] = mean
}
ret[i+int(LoudnessMomentary)] = powerToDecibel(mean) // we assume the LoudnessMomentary is followed by LoudnessShortTerm
ret[i+int(LoudnessMaxMomentary)] = powerToDecibel(d.maxPowers[i])
}
if len(d.averagedPowers[0])%10 == 0 { // every 10 samples of 100 ms i.e. every 1 s, we recalculate the integrated power
absThreshold := decibelToPower(-70) // -70 dB is the first threshold
b := vek32.GtNumber_Into(d.tmpbool, d.averagedPowers[0], absThreshold)
m2 := vek32.Select_Into(d.tmp, d.averagedPowers[0], b)
if len(m2) > 0 {
relThreshold := vek32.Mean(m2) / 10 // the relative threshold is 10 dB below the mean of the values above the absolute threshold
b2 := vek32.GtNumber_Into(d.tmpbool, m2, relThreshold)
m3 := vek32.Select_Into(d.tmp2, m2, b2)
if len(m3) > 0 {
d.integratedPower = vek32.Mean(m3)
}
}
}
ret[LoudnessIntegrated] = powerToDecibel(d.integratedPower)
return ret
}
func (d *loudnessDetector) reset() {
for i := range d.powers {
d.powers[i].Cursor = 0
l := len(d.powers[i].Buffer)
d.powers[i].Buffer = d.powers[i].Buffer[:0]
d.powers[i].Buffer = append(d.powers[i].Buffer, make([]float32, l)...)
d.averagedPowers[i] = d.averagedPowers[i][:0]
d.maxPowers[i] = 0
}
// reset the biquad states
d.states = [2][3]biquadState{}
d.integratedPower = 0
}
func powerToDecibel(power float32) Decibel {
return Decibel(float32(10 * math.Log10(float64(power))))
}
func amplitudeToDecibel(amplitude float32) Decibel {
return Decibel(float32(20 * math.Log10(float64(amplitude))))
}
func decibelToPower(loudness Decibel) float32 {
return (float32)(math.Pow(10, (float64(loudness))/10))
}
func (state *biquadState) Filter(buffer []float32, coeff biquadCoeff) {
s := *state
for i := range buffer {
x := buffer[i]
y := coeff.b0*x + coeff.b1*s.x1 + coeff.b2*s.x2 - coeff.a1*s.y1 - coeff.a2*s.y2
s.x2, s.x1 = s.x1, x
s.y2, s.y1 = s.y1, y
buffer[i] = y
}
*state = s
}
func setSliceLength[T any](slice *[]T, length int) {
if len(*slice) < length {
*slice = append(*slice, make([]T, length-len(*slice))...)
}
*slice = (*slice)[:length]
}
// ref: https://www.itu.int/dms_pubrec/itu-r/rec/bs/R-REC-BS.1770-5-202311-I!!PDF-E.pdf
var oversamplingCoeffs = [4][12]float32{
{0.0017089843750, 0.0109863281250, -0.0196533203125, 0.0332031250000, -0.0594482421875, 0.1373291015625, 0.9721679687500, -0.1022949218750, 0.0476074218750, -0.0266113281250, 0.0148925781250, -0.0083007812500},
{-0.0291748046875, 0.0292968750000, -0.0517578125000, 0.0891113281250, -0.1665039062500, 0.4650878906250, 0.7797851562500, -0.2003173828125, 0.1015625000000, -0.0582275390625, 0.0330810546875, -0.0189208984375},
{-0.0189208984375, 0.0330810546875, -0.058227539062, 0.1015625000000, -0.200317382812, 0.7797851562500, 0.4650878906250, -0.166503906250, 0.0891113281250, -0.051757812500, 0.0292968750000, -0.0291748046875},
{-0.0083007812500, 0.0148925781250, -0.0266113281250, 0.0476074218750, -0.1022949218750, 0.9721679687500, 0.1373291015625, -0.0594482421875, 0.0332031250000, -0.0196533203125, 0.0109863281250, 0.0017089843750},
}
// u[k] = x[k/4] if k%4 == 0, 0 otherwise
// y[k] = sum_{i=0}^{47} h[i] * u[k-i]
// h[i] = o[i%4][i/4]
// k = p*4+q, q=0..3
// y[p*4+q] = sum_{j=0}^{11} sum_{i=0}^{3} h[j*4+i] * u[p*4+q-j*4-i] = ...
// (q-i)%4 == 0 ==> i = q
// ... = sum_{j=0}^{11} o[q][j] * x[p-j]
// y should be at least 4 times the length of x
func (s *oversamplerState) Oversample(x []float32, y []float32) []float32 {
if len(s.tmp) < len(x) {
s.tmp = append(s.tmp, make([]float32, len(x)-len(s.tmp))...)
}
if len(s.tmp2) < len(x) {
s.tmp2 = append(s.tmp2, make([]float32, len(x)-len(s.tmp2))...)
}
for q, coeffs := range oversamplingCoeffs {
// tmp2 will be conv(o[q],x)
r := vek32.Zeros_Into(s.tmp2, len(x))
for j, c := range coeffs {
vek32.MulNumber_Into(s.tmp[:j], s.history[11-j:11], c) // convolution might pull values before x[0], so we need to use history for that
vek32.MulNumber_Into(s.tmp[j:], x[:len(x)-j], c)
vek32.Add_Inplace(r, s.tmp[:len(x)])
}
// interleave the phases
for p, v := range r {
y[p*4+q] = v
}
}
z := min(len(x), 11)
copy(s.history[:11-z], s.history[z:11])
copy(s.history[11-z:], x[len(x)-z:])
return y[:len(x)*4]
}
// we should perform the peak detection also momentary (last 400 ms), short term
// (last 3 s), and integrated (whole song) for display purposes, we can use
// always last arrived data for the integrated peak, we can use the maximum of
// all the peaks so far (there is no need show "maximum short term true peak" or
// "maximum momentary true peak" because they are same as the maximum for entire song)
//
// display:
//
// momentary true peak
// short-term true peak
// integrated true peak
func (d *peakDetector) update(buf sointu.AudioBuffer) (ret PeakResult) {
if len(d.tmp) < len(buf) {
d.tmp = append(d.tmp, make([]float32, len(buf)-len(d.tmp))...)
}
len4 := 4 * len(buf)
if len(d.tmp2) < len4 {
d.tmp2 = append(d.tmp2, make([]float32, len4-len(d.tmp2))...)
}
for chn := range 2 {
// deinterleave the channels
for i := range buf {
d.tmp[i] = buf[i][chn]
}
// 4x oversample the signal
var o []float32
if d.oversampling {
o = d.states[chn].Oversample(d.tmp[:len(buf)], d.tmp2)
} else {
o = d.tmp[:len(buf)]
}
// take absolute value of the oversampled signal
vek32.Abs_Inplace(o)
p := vek32.Max(o)
// find the maximum value in the window
for i := range d.windows {
d.windows[i][chn].WriteWrapSingle(p)
windowPeak := vek32.Max(d.windows[i][chn].Buffer)
ret[i+int(PeakMomentary)][chn] = amplitudeToDecibel(windowPeak)
}
if d.maxPower[chn] < p {
d.maxPower[chn] = p
}
ret[int(PeakIntegrated)][chn] = amplitudeToDecibel(d.maxPower[chn])
}
return
}
func (d *peakDetector) reset() {
for chn := range 2 {
d.states[chn].history = [11]float32{}
for i := range d.windows[chn] {
d.windows[i][chn].Cursor = 0
l := len(d.windows[i][chn].Buffer)
d.windows[i][chn].Buffer = d.windows[i][chn].Buffer[:0]
d.windows[i][chn].Buffer = append(d.windows[i][chn].Buffer, make([]float32, l)...)
}
d.maxPower[chn] = 0
}
}