sointu/tracker/detector.go

package tracker

import (
	"math"

	"github.com/viterin/vek/vek32"
	"github.com/vsariola/sointu"
)

type (
	Detector struct {
		broker           *Broker
		loudnessDetector loudnessDetector
		peakDetector     peakDetector
	}

	WeightingType int
	LoudnessType  int
	PeakType      int

	Decibel float32

	LoudnessResult [NumLoudnessTypes]Decibel
	PeakResult     [NumPeakTypes][2]Decibel

	DetectorResult struct {
		Loudness LoudnessResult
		Peaks    PeakResult
	}

	loudnessDetector struct {
		weighting       weighting
		states          [2][3]biquadState
		powers          [2]RingBuffer[float32] // 0 = momentary, 1 = short-term
		averagedPowers  [2][]float32
		maxPowers       [2]float32
		integratedPower float32
		tmp, tmp2       []float32
		tmpbool         []bool
	}

	biquadState struct {
		x1, x2, y1, y2 float32
	}

	biquadCoeff struct {
		b0, b1, b2, a1, a2 float32
	}

	weighting []biquadCoeff

	peakDetector struct {
		oversampling bool
		states       [2]oversamplerState
		windows      [2][2]RingBuffer[float32]
		maxPower     [2]float32
		tmp, tmp2    []float32
	}

	oversamplerState struct {
		history   [11]float32
		tmp, tmp2 []float32
	}
)

const (
	LoudnessMomentary LoudnessType = iota
	LoudnessShortTerm
	LoudnessMaxMomentary
	LoudnessMaxShortTerm
	LoudnessIntegrated
	NumLoudnessTypes
)

const MAX_INTEGRATED_DATA = 10 * 60 * 60 // 1 hour of samples at 10 Hz (100 ms per sample)

const (
	PeakMomentary PeakType = iota
	PeakShortTerm
	PeakIntegrated
	NumPeakTypes
)

const (
	KWeighting WeightingType = iota
	AWeighting
	CWeighting
	NoWeighting
	NumWeightingTypes
)

func NewDetector(b *Broker) *Detector {
	return &Detector{
		broker:           b,
		loudnessDetector: makeLoudnessDetector(KWeighting),
		peakDetector:     makePeakDetector(true),
	}
}

func (s *Detector) Run() {
	var chunkHistory sointu.AudioBuffer
	for msg := range s.broker.ToDetector {
		if msg.Reset {
			s.loudnessDetector.reset()
			s.peakDetector.reset()
		}
		if msg.Quit {
			return
		}
		if msg.HasWeightingType {
			s.loudnessDetector.weighting = weightings[WeightingType(msg.WeightingType)]
			s.loudnessDetector.reset()
		}
		if msg.HasOversampling {
			s.peakDetector.oversampling = msg.Oversampling
			s.peakDetector.reset()
		}

		switch data := msg.Data.(type) {
		case *sointu.AudioBuffer:
			buf := *data
			for {
				var chunk sointu.AudioBuffer
				if len(chunkHistory) > 0 && len(chunkHistory) < 4410 {
					l := min(len(buf), 4410-len(chunkHistory))
					chunkHistory = append(chunkHistory, buf[:l]...)
					if len(chunkHistory) < 4410 {
						break
					}
					chunk = chunkHistory
					buf = buf[l:]
				} else {
					if len(buf) >= 4410 {
						chunk = buf[:4410]
						buf = buf[4410:]
					} else {
						chunkHistory = chunkHistory[:0]
						chunkHistory = append(chunkHistory, buf...)
						break
					}
				}
				TrySend(s.broker.ToModel, MsgToModel{
					HasDetectorResult: true,
					DetectorResult: DetectorResult{
						Loudness: s.loudnessDetector.update(chunk),
						Peaks:    s.peakDetector.update(chunk),
					},
				})
			}
			s.broker.PutAudioBuffer(data)
		case func():
			data()
		}
	}
}

// Close may theoretically block if the broker is full, but it should not happen in practice
func (s *Detector) Close() {
	s.broker.ToDetector <- MsgToDetector{Quit: true}
}

func makeLoudnessDetector(weighting WeightingType) loudnessDetector {
	return loudnessDetector{
		weighting: weightings[weighting],
		powers: [2]RingBuffer[float32]{
			{Buffer: make([]float32, 4)},  // momentary loudness
			{Buffer: make([]float32, 30)}, // short-term loudness
		},
	}
}

func makePeakDetector(oversampling bool) peakDetector {
	return peakDetector{
		oversampling: oversampling,
		windows: [2][2]RingBuffer[float32]{
			{{Buffer: make([]float32, 4)}, {Buffer: make([]float32, 4)}},   // momentary peaks
			{{Buffer: make([]float32, 30)}, {Buffer: make([]float32, 30)}}, // short-term peaks
		},
	}
}

/*
From matlab: (we bake in the scale values to the numerator coefficients)
weightings = {'A-weighting','C-weighting','k-weighting'}
for j = 1:3
disp(weightings{j})
f = getFilter(weightingFilter(weightings{j},'SampleRate',44100)); f.Numerator, f.Denominator, f.ScaleValues
if j == 3 % k-weighting has non-zero gain at 1 kHz, so normalize it to 0 dB by scaling the first filter
[h,w] = freqz(f,[1000,1000],44100);
g = abs(h(1));
fprintf("Gain %f dB\n", 20*log10(abs(h(1))));
f.Numerator(1,:) = f.Numerator(1,:)/g;
end
for i = 1:size(f.Numerator,1); fprintf("b0: %.16f, b1: %.16f, b2: %.16f, a1: %.16f, a2: %.16f\n",f.Numerator(i,:)*f.ScaleValues(i),f.Denominator(i,2:end)); end
end
*/
var weightings = map[WeightingType]weighting{
	AWeighting: {
		{b0: 0.2556115104436430, b1: 0.5112230208872860, b2: 0.2556115104436430, a1: -0.1405360824207108, a2: 0.0049375976155402},
		{b0: 1, b1: -2, b2: 1, a1: -1.8849012174287920, a2: 0.8864214718161675},
		{b0: 1, b1: -2, b2: 1, a1: -1.9941388812663283, a2: 0.9941474694445309},
	},
	CWeighting: {
		{b0: 0.2170124955461332, b1: 0.4340249910922664, b2: 0.2170124955461332, a1: -0.1405360824207108, a2: 0.0049375976155402},
		{b0: 1, b1: -2, b2: 1, a1: -1.9941388812663283, a2: 0.9941474694445309},
	},
	KWeighting: {
		{b0: 1.4128568659906546, b1: -2.4466647580657646, b2: 1.0789762991286349, a1: -1.6636551132560204, a2: 0.7125954280732254},
		{b0: 0.9995600645425144, b1: -1.9991201290850289, b2: 0.9995600645425144, a1: -1.9891696736297957, a2: 0.9891990357870394},
	},
	NoWeighting: {},
}

// according to https://tech.ebu.ch/docs/tech/tech3341.pdf
// we have two sliding windows: momentary loudness = last 400 ms, short-term loudness = last 3 s
// display:
//
//	momentary loudness = last analyzed 400 ms blcok
//	short-term loudness = last analyzed 3 s block
//
// every 100 ms, we collect one data point of the momentary loudness (starting to play song again resets the data blocks)
// then:
//
//	integrated loudness = the blocks are gated, and the average loudness of the gated blocks is calculated
//	maximum momentary loudness = maximum of all the momentary blocks
//	maximum short-term loudness = maximum of all the short-term blocks
func (d *loudnessDetector) update(chunk sointu.AudioBuffer) LoudnessResult {
	l := max(len(chunk), MAX_INTEGRATED_DATA)
	setSliceLength(&d.tmp, l)
	setSliceLength(&d.tmp2, l)
	setSliceLength(&d.tmpbool, l)
	var total float32
	for chn := range 2 {
		// deinterleave the channels
		for i := range chunk {
			d.tmp[i] = chunk[i][chn]
		}
		// filter the signal with the weighting filter
		for k := range d.weighting {
			d.states[chn][k].Filter(d.tmp[:len(chunk)], d.weighting[k])
		}
		// square the samples
		res := vek32.Mul_Into(d.tmp2, d.tmp[:len(chunk)], d.tmp[:len(chunk)])
		// calculate the mean and add it to the total
		total += vek32.Mean(res)
	}
	var ret [NumLoudnessTypes]Decibel
	for i := range d.powers {
		d.powers[i].WriteWrapSingle(total) // these are sliding windows of 4 and 30 power measurements (400 ms and 3 s aka momentary and short-term windows)
		mean := vek32.Mean(d.powers[i].Buffer)
		if len(d.averagedPowers[i]) < MAX_INTEGRATED_DATA { // we need to have some limit on how much data we keep
			d.averagedPowers[i] = append(d.averagedPowers[i], mean)
		}
		if d.maxPowers[i] < mean {
			d.maxPowers[i] = mean
		}
		ret[i+int(LoudnessMomentary)] = powerToDecibel(mean) // we assume the LoudnessMomentary is followed by LoudnessShortTerm
		ret[i+int(LoudnessMaxMomentary)] = powerToDecibel(d.maxPowers[i])
	}
	if len(d.averagedPowers[0])%10 == 0 { // every 10 samples of 100 ms i.e. every 1 s, we recalculate the integrated power
		absThreshold := decibelToPower(-70) // -70 dB is the first threshold
		b := vek32.GtNumber_Into(d.tmpbool, d.averagedPowers[0], absThreshold)
		m2 := vek32.Select_Into(d.tmp, d.averagedPowers[0], b)
		if len(m2) > 0 {
			relThreshold := vek32.Mean(m2) / 10 // the relative threshold is 10 dB below the mean of the values above the absolute threshold
			b2 := vek32.GtNumber_Into(d.tmpbool, m2, relThreshold)
			m3 := vek32.Select_Into(d.tmp2, m2, b2)
			if len(m3) > 0 {
				d.integratedPower = vek32.Mean(m3)
			}
		}
	}
	ret[LoudnessIntegrated] = powerToDecibel(d.integratedPower)
	return ret
}

func (d *loudnessDetector) reset() {
	for i := range d.powers {
		d.powers[i].Cursor = 0
		l := len(d.powers[i].Buffer)
		d.powers[i].Buffer = d.powers[i].Buffer[:0]
		d.powers[i].Buffer = append(d.powers[i].Buffer, make([]float32, l)...)
		d.averagedPowers[i] = d.averagedPowers[i][:0]
		d.maxPowers[i] = 0
	}
	// reset the biquad states
	d.states = [2][3]biquadState{}
	d.integratedPower = 0
}

func powerToDecibel(power float32) Decibel {
	return Decibel(float32(10 * math.Log10(float64(power))))
}

func amplitudeToDecibel(amplitude float32) Decibel {
	return Decibel(float32(20 * math.Log10(float64(amplitude))))
}

func decibelToPower(loudness Decibel) float32 {
	return (float32)(math.Pow(10, (float64(loudness))/10))
}

func (state *biquadState) Filter(buffer []float32, coeff biquadCoeff) {
	s := *state
	for i := range buffer {
		x := buffer[i]
		y := coeff.b0*x + coeff.b1*s.x1 + coeff.b2*s.x2 - coeff.a1*s.y1 - coeff.a2*s.y2
		s.x2, s.x1 = s.x1, x
		s.y2, s.y1 = s.y1, y
		buffer[i] = y
	}
	*state = s
}

func setSliceLength[T any](slice *[]T, length int) {
	if len(*slice) < length {
		*slice = append(*slice, make([]T, length-len(*slice))...)
	}
	*slice = (*slice)[:length]
}

// ref: https://www.itu.int/dms_pubrec/itu-r/rec/bs/R-REC-BS.1770-5-202311-I!!PDF-E.pdf
var oversamplingCoeffs = [4][12]float32{
	{0.0017089843750, 0.0109863281250, -0.0196533203125, 0.0332031250000, -0.0594482421875, 0.1373291015625, 0.9721679687500, -0.1022949218750, 0.0476074218750, -0.0266113281250, 0.0148925781250, -0.0083007812500},
	{-0.0291748046875, 0.0292968750000, -0.0517578125000, 0.0891113281250, -0.1665039062500, 0.4650878906250, 0.7797851562500, -0.2003173828125, 0.1015625000000, -0.0582275390625, 0.0330810546875, -0.0189208984375},
	{-0.0189208984375, 0.0330810546875, -0.058227539062, 0.1015625000000, -0.200317382812, 0.7797851562500, 0.4650878906250, -0.166503906250, 0.0891113281250, -0.051757812500, 0.0292968750000, -0.0291748046875},
	{-0.0083007812500, 0.0148925781250, -0.0266113281250, 0.0476074218750, -0.1022949218750, 0.9721679687500, 0.1373291015625, -0.0594482421875, 0.0332031250000, -0.0196533203125, 0.0109863281250, 0.0017089843750},
}

// u[k] = x[k/4] if k%4 == 0, 0 otherwise
// y[k] = sum_{i=0}^{47} h[i] * u[k-i]
// h[i] = o[i%4][i/4]
// k = p*4+q, q=0..3
// y[p*4+q] = sum_{j=0}^{11} sum_{i=0}^{3} h[j*4+i] * u[p*4+q-j*4-i] = ...
// (q-i)%4 == 0 ==> i = q
// ... = sum_{j=0}^{11} o[q][j] * x[p-j]
// y should be at least 4 times the length of x
func (s *oversamplerState) Oversample(x []float32, y []float32) []float32 {
	if len(s.tmp) < len(x) {
		s.tmp = append(s.tmp, make([]float32, len(x)-len(s.tmp))...)
	}
	if len(s.tmp2) < len(x) {
		s.tmp2 = append(s.tmp2, make([]float32, len(x)-len(s.tmp2))...)
	}
	for q, coeffs := range oversamplingCoeffs {
		// tmp2 will be conv(o[q],x)
		r := vek32.Zeros_Into(s.tmp2, len(x))
		for j, c := range coeffs {
			vek32.MulNumber_Into(s.tmp[:j], s.history[11-j:11], c) // convolution might pull values before x[0], so we need to use history for that
			vek32.MulNumber_Into(s.tmp[j:], x[:len(x)-j], c)
			vek32.Add_Inplace(r, s.tmp[:len(x)])
		}
		// interleave the phases
		for p, v := range r {
			y[p*4+q] = v
		}
	}
	z := min(len(x), 11)
	copy(s.history[:11-z], s.history[z:11])
	copy(s.history[11-z:], x[len(x)-z:])
	return y[:len(x)*4]
}

// we should perform the peak detection also momentary (last 400 ms), short term
// (last 3 s), and integrated (whole song) for display purposes, we can use
// always last arrived data for the integrated peak, we can use the maximum of
// all the peaks so far (there is no need show "maximum short term true peak" or
// "maximum momentary true peak" because they are same as the maximum for entire song)
//
// display:
//
//	momentary true peak
//	short-term true peak
//	integrated true peak
func (d *peakDetector) update(buf sointu.AudioBuffer) (ret PeakResult) {
	if len(d.tmp) < len(buf) {
		d.tmp = append(d.tmp, make([]float32, len(buf)-len(d.tmp))...)
	}
	len4 := 4 * len(buf)
	if len(d.tmp2) < len4 {
		d.tmp2 = append(d.tmp2, make([]float32, len4-len(d.tmp2))...)
	}
	for chn := range 2 {
		// deinterleave the channels
		for i := range buf {
			d.tmp[i] = buf[i][chn]
		}
		// 4x oversample the signal
		var o []float32
		if d.oversampling {
			o = d.states[chn].Oversample(d.tmp[:len(buf)], d.tmp2)
		} else {
			o = d.tmp[:len(buf)]
		}
		// take absolute value of the oversampled signal
		vek32.Abs_Inplace(o)
		p := vek32.Max(o)
		// find the maximum value in the window
		for i := range d.windows {
			d.windows[i][chn].WriteWrapSingle(p)
			windowPeak := vek32.Max(d.windows[i][chn].Buffer)
			ret[i+int(PeakMomentary)][chn] = amplitudeToDecibel(windowPeak)
		}
		if d.maxPower[chn] < p {
			d.maxPower[chn] = p
		}
		ret[int(PeakIntegrated)][chn] = amplitudeToDecibel(d.maxPower[chn])
	}
	return
}

func (d *peakDetector) reset() {
	for chn := range 2 {
		d.states[chn].history = [11]float32{}
		for i := range d.windows[chn] {
			d.windows[i][chn].Cursor = 0
			l := len(d.windows[i][chn].Buffer)
			d.windows[i][chn].Buffer = d.windows[i][chn].Buffer[:0]
			d.windows[i][chn].Buffer = append(d.windows[i][chn].Buffer, make([]float32, l)...)
		}
		d.maxPower[chn] = 0
	}
}