VampPluginSDK
2.1
|
00001 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */ 00002 00003 /* 00004 Vamp 00005 00006 An API for audio analysis and feature extraction plugins. 00007 00008 Centre for Digital Music, Queen Mary, University of London. 00009 Copyright 2006-2009 Chris Cannam and QMUL. 00010 00011 Permission is hereby granted, free of charge, to any person 00012 obtaining a copy of this software and associated documentation 00013 files (the "Software"), to deal in the Software without 00014 restriction, including without limitation the rights to use, copy, 00015 modify, merge, publish, distribute, sublicense, and/or sell copies 00016 of the Software, and to permit persons to whom the Software is 00017 furnished to do so, subject to the following conditions: 00018 00019 The above copyright notice and this permission notice shall be 00020 included in all copies or substantial portions of the Software. 00021 00022 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 00023 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 00024 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 00025 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR 00026 ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF 00027 CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 00028 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 00029 00030 Except as contained in this notice, the names of the Centre for 00031 Digital Music; Queen Mary, University of London; and Chris Cannam 00032 shall not be used in advertising or otherwise to promote the sale, 00033 use or other dealings in this Software without prior written 00034 authorization. 00035 */ 00036 00037 #include "FixedTempoEstimator.h" 00038 00039 using std::string; 00040 using std::vector; 00041 using std::cerr; 00042 using std::endl; 00043 00044 using Vamp::RealTime; 00045 00046 #include <cmath> 00047 #include <cstdio> 00048 00049 class FixedTempoEstimator::D 00050 // this class just avoids us having to declare any data members in the header 00051 { 00052 public: 00053 D(float inputSampleRate); 00054 ~D(); 00055 00056 size_t getPreferredStepSize() const { return 64; } 00057 size_t getPreferredBlockSize() const { return 256; } 00058 00059 ParameterList getParameterDescriptors() const; 00060 float getParameter(string id) const; 00061 void setParameter(string id, float value); 00062 00063 OutputList getOutputDescriptors() const; 00064 00065 bool initialise(size_t channels, size_t stepSize, size_t blockSize); 00066 void reset(); 00067 FeatureSet process(const float *const *, RealTime); 00068 FeatureSet getRemainingFeatures(); 00069 00070 private: 00071 void calculate(); 00072 FeatureSet assembleFeatures(); 00073 00074 float lag2tempo(int); 00075 int tempo2lag(float); 00076 00077 float m_inputSampleRate; 00078 size_t m_stepSize; 00079 size_t m_blockSize; 00080 00081 float m_minbpm; 00082 float m_maxbpm; 00083 float m_maxdflen; 00084 00085 float *m_priorMagnitudes; 00086 00087 size_t m_dfsize; 00088 float *m_df; 00089 float *m_r; 00090 float *m_fr; 00091 float *m_t; 00092 size_t m_n; 00093 00094 Vamp::RealTime m_start; 00095 Vamp::RealTime m_lasttime; 00096 }; 00097 00098 FixedTempoEstimator::D::D(float inputSampleRate) : 00099 m_inputSampleRate(inputSampleRate), 00100 m_stepSize(0), 00101 m_blockSize(0), 00102 m_minbpm(50), 00103 m_maxbpm(190), 00104 m_maxdflen(10), 00105 m_priorMagnitudes(0), 00106 m_df(0), 00107 m_r(0), 00108 m_fr(0), 00109 m_t(0), 00110 m_n(0) 00111 { 00112 } 00113 00114 FixedTempoEstimator::D::~D() 00115 { 00116 delete[] m_priorMagnitudes; 00117 delete[] m_df; 00118 delete[] m_r; 00119 delete[] m_fr; 00120 delete[] m_t; 00121 } 00122 00123 FixedTempoEstimator::ParameterList 00124 FixedTempoEstimator::D::getParameterDescriptors() const 00125 { 00126 ParameterList list; 00127 00128 ParameterDescriptor d; 00129 d.identifier = "minbpm"; 00130 d.name = "Minimum estimated tempo"; 00131 d.description = "Minimum beat-per-minute value which the tempo estimator is able to return"; 00132 d.unit = "bpm"; 00133 d.minValue = 10; 00134 d.maxValue = 360; 00135 d.defaultValue = 50; 00136 d.isQuantized = false; 00137 list.push_back(d); 00138 00139 d.identifier = "maxbpm"; 00140 d.name = "Maximum estimated tempo"; 00141 d.description = "Maximum beat-per-minute value which the tempo estimator is able to return"; 00142 d.defaultValue = 190; 00143 list.push_back(d); 00144 00145 d.identifier = "maxdflen"; 00146 d.name = "Input duration to study"; 00147 d.description = "Length of audio input, in seconds, which should be taken into account when estimating tempo. There is no need to supply the plugin with any further input once this time has elapsed since the start of the audio. The tempo estimator may use only the first part of this, up to eight times the slowest beat duration: increasing this value further than that is unlikely to improve results."; 00148 d.unit = "s"; 00149 d.minValue = 2; 00150 d.maxValue = 40; 00151 d.defaultValue = 10; 00152 list.push_back(d); 00153 00154 return list; 00155 } 00156 00157 float 00158 FixedTempoEstimator::D::getParameter(string id) const 00159 { 00160 if (id == "minbpm") { 00161 return m_minbpm; 00162 } else if (id == "maxbpm") { 00163 return m_maxbpm; 00164 } else if (id == "maxdflen") { 00165 return m_maxdflen; 00166 } 00167 return 0.f; 00168 } 00169 00170 void 00171 FixedTempoEstimator::D::setParameter(string id, float value) 00172 { 00173 if (id == "minbpm") { 00174 m_minbpm = value; 00175 } else if (id == "maxbpm") { 00176 m_maxbpm = value; 00177 } else if (id == "maxdflen") { 00178 m_maxdflen = value; 00179 } 00180 } 00181 00182 static int TempoOutput = 0; 00183 static int CandidatesOutput = 1; 00184 static int DFOutput = 2; 00185 static int ACFOutput = 3; 00186 static int FilteredACFOutput = 4; 00187 00188 FixedTempoEstimator::OutputList 00189 FixedTempoEstimator::D::getOutputDescriptors() const 00190 { 00191 OutputList list; 00192 00193 OutputDescriptor d; 00194 d.identifier = "tempo"; 00195 d.name = "Tempo"; 00196 d.description = "Estimated tempo"; 00197 d.unit = "bpm"; 00198 d.hasFixedBinCount = true; 00199 d.binCount = 1; 00200 d.hasKnownExtents = false; 00201 d.isQuantized = false; 00202 d.sampleType = OutputDescriptor::VariableSampleRate; 00203 d.sampleRate = m_inputSampleRate; 00204 d.hasDuration = true; // our returned tempo spans a certain range 00205 list.push_back(d); 00206 00207 d.identifier = "candidates"; 00208 d.name = "Tempo candidates"; 00209 d.description = "Possible tempo estimates, one per bin with the most likely in the first bin"; 00210 d.unit = "bpm"; 00211 d.hasFixedBinCount = false; 00212 list.push_back(d); 00213 00214 d.identifier = "detectionfunction"; 00215 d.name = "Detection Function"; 00216 d.description = "Onset detection function"; 00217 d.unit = ""; 00218 d.hasFixedBinCount = 1; 00219 d.binCount = 1; 00220 d.hasKnownExtents = true; 00221 d.minValue = 0.0; 00222 d.maxValue = 1.0; 00223 d.isQuantized = false; 00224 d.quantizeStep = 0.0; 00225 d.sampleType = OutputDescriptor::FixedSampleRate; 00226 if (m_stepSize) { 00227 d.sampleRate = m_inputSampleRate / m_stepSize; 00228 } else { 00229 d.sampleRate = m_inputSampleRate / (getPreferredBlockSize()/2); 00230 } 00231 d.hasDuration = false; 00232 list.push_back(d); 00233 00234 d.identifier = "acf"; 00235 d.name = "Autocorrelation Function"; 00236 d.description = "Autocorrelation of onset detection function"; 00237 d.hasKnownExtents = false; 00238 d.unit = "r"; 00239 list.push_back(d); 00240 00241 d.identifier = "filtered_acf"; 00242 d.name = "Filtered Autocorrelation"; 00243 d.description = "Filtered autocorrelation of onset detection function"; 00244 d.unit = "r"; 00245 list.push_back(d); 00246 00247 return list; 00248 } 00249 00250 bool 00251 FixedTempoEstimator::D::initialise(size_t, size_t stepSize, size_t blockSize) 00252 { 00253 m_stepSize = stepSize; 00254 m_blockSize = blockSize; 00255 00256 float dfLengthSecs = m_maxdflen; 00257 m_dfsize = (dfLengthSecs * m_inputSampleRate) / m_stepSize; 00258 00259 m_priorMagnitudes = new float[m_blockSize/2]; 00260 m_df = new float[m_dfsize]; 00261 00262 for (size_t i = 0; i < m_blockSize/2; ++i) { 00263 m_priorMagnitudes[i] = 0.f; 00264 } 00265 for (size_t i = 0; i < m_dfsize; ++i) { 00266 m_df[i] = 0.f; 00267 } 00268 00269 m_n = 0; 00270 00271 return true; 00272 } 00273 00274 void 00275 FixedTempoEstimator::D::reset() 00276 { 00277 if (!m_priorMagnitudes) return; 00278 00279 for (size_t i = 0; i < m_blockSize/2; ++i) { 00280 m_priorMagnitudes[i] = 0.f; 00281 } 00282 for (size_t i = 0; i < m_dfsize; ++i) { 00283 m_df[i] = 0.f; 00284 } 00285 00286 delete[] m_r; 00287 m_r = 0; 00288 00289 delete[] m_fr; 00290 m_fr = 0; 00291 00292 delete[] m_t; 00293 m_t = 0; 00294 00295 m_n = 0; 00296 00297 m_start = RealTime::zeroTime; 00298 m_lasttime = RealTime::zeroTime; 00299 } 00300 00301 FixedTempoEstimator::FeatureSet 00302 FixedTempoEstimator::D::process(const float *const *inputBuffers, RealTime ts) 00303 { 00304 FeatureSet fs; 00305 00306 if (m_stepSize == 0) { 00307 cerr << "ERROR: FixedTempoEstimator::process: " 00308 << "FixedTempoEstimator has not been initialised" 00309 << endl; 00310 return fs; 00311 } 00312 00313 if (m_n == 0) m_start = ts; 00314 m_lasttime = ts; 00315 00316 if (m_n == m_dfsize) { 00317 // If we have seen enough input, do the estimation and return 00318 calculate(); 00319 fs = assembleFeatures(); 00320 ++m_n; 00321 return fs; 00322 } 00323 00324 // If we have seen more than enough, just discard and return! 00325 if (m_n > m_dfsize) return FeatureSet(); 00326 00327 float value = 0.f; 00328 00329 // m_df will contain an onset detection function based on the rise 00330 // in overall power from one spectral frame to the next -- 00331 // simplistic but reasonably effective for our purposes. 00332 00333 for (size_t i = 1; i < m_blockSize/2; ++i) { 00334 00335 float real = inputBuffers[0][i*2]; 00336 float imag = inputBuffers[0][i*2 + 1]; 00337 00338 float sqrmag = real * real + imag * imag; 00339 value += fabsf(sqrmag - m_priorMagnitudes[i]); 00340 00341 m_priorMagnitudes[i] = sqrmag; 00342 } 00343 00344 m_df[m_n] = value; 00345 00346 ++m_n; 00347 return fs; 00348 } 00349 00350 FixedTempoEstimator::FeatureSet 00351 FixedTempoEstimator::D::getRemainingFeatures() 00352 { 00353 FeatureSet fs; 00354 if (m_n > m_dfsize) return fs; 00355 calculate(); 00356 fs = assembleFeatures(); 00357 ++m_n; 00358 return fs; 00359 } 00360 00361 float 00362 FixedTempoEstimator::D::lag2tempo(int lag) 00363 { 00364 return 60.f / ((lag * m_stepSize) / m_inputSampleRate); 00365 } 00366 00367 int 00368 FixedTempoEstimator::D::tempo2lag(float tempo) 00369 { 00370 return ((60.f / tempo) * m_inputSampleRate) / m_stepSize; 00371 } 00372 00373 void 00374 FixedTempoEstimator::D::calculate() 00375 { 00376 if (m_r) { 00377 cerr << "FixedTempoEstimator::calculate: calculation already happened?" << endl; 00378 return; 00379 } 00380 00381 if (m_n < m_dfsize / 9 && 00382 m_n < (1.0 * m_inputSampleRate) / m_stepSize) { // 1 second 00383 cerr << "FixedTempoEstimator::calculate: Input is too short" << endl; 00384 return; 00385 } 00386 00387 // This function takes m_df (the detection function array filled 00388 // out in process()) and calculates m_r (the raw autocorrelation) 00389 // and m_fr (the filtered autocorrelation from whose peaks tempo 00390 // estimates will be taken). 00391 00392 int n = m_n; // length of actual df array (m_dfsize is the theoretical max) 00393 00394 m_r = new float[n/2]; // raw autocorrelation 00395 m_fr = new float[n/2]; // filtered autocorrelation 00396 m_t = new float[n/2]; // averaged tempo estimate for each lag value 00397 00398 for (int i = 0; i < n/2; ++i) { 00399 m_r[i] = 0.f; 00400 m_fr[i] = 0.f; 00401 m_t[i] = lag2tempo(i); 00402 } 00403 00404 // Calculate the raw autocorrelation of the detection function 00405 00406 for (int i = 0; i < n/2; ++i) { 00407 00408 for (int j = i; j < n; ++j) { 00409 m_r[i] += m_df[j] * m_df[j - i]; 00410 } 00411 00412 m_r[i] /= n - i - 1; 00413 } 00414 00415 // Filter the autocorrelation and average out the tempo estimates 00416 00417 float related[] = { 0.5, 2, 4, 8 }; 00418 00419 for (int i = 1; i < n/2-1; ++i) { 00420 00421 m_fr[i] = m_r[i]; 00422 00423 int div = 1; 00424 00425 for (int j = 0; j < int(sizeof(related)/sizeof(related[0])); ++j) { 00426 00427 // Check for an obvious peak at each metrically related lag 00428 00429 int k0 = int(i * related[j] + 0.5); 00430 00431 if (k0 >= 0 && k0 < int(n/2)) { 00432 00433 int kmax = 0, kmin = 0; 00434 float kvmax = 0, kvmin = 0; 00435 bool have = false; 00436 00437 for (int k = k0 - 1; k <= k0 + 1; ++k) { 00438 00439 if (k < 0 || k >= n/2) continue; 00440 00441 if (!have || (m_r[k] > kvmax)) { kmax = k; kvmax = m_r[k]; } 00442 if (!have || (m_r[k] < kvmin)) { kmin = k; kvmin = m_r[k]; } 00443 00444 have = true; 00445 } 00446 00447 // Boost the original lag according to the strongest 00448 // value found close to this related lag 00449 00450 m_fr[i] += m_r[kmax] / 5; 00451 00452 if ((kmax == 0 || m_r[kmax] > m_r[kmax-1]) && 00453 (kmax == n/2-1 || m_r[kmax] > m_r[kmax+1]) && 00454 kvmax > kvmin * 1.05) { 00455 00456 // The strongest value close to the related lag is 00457 // also a pretty good looking peak, so use it to 00458 // improve our tempo estimate for the original lag 00459 00460 m_t[i] = m_t[i] + lag2tempo(kmax) * related[j]; 00461 ++div; 00462 } 00463 } 00464 } 00465 00466 m_t[i] /= div; 00467 00468 // Finally apply a primitive perceptual weighting (to prefer 00469 // tempi of around 120-130) 00470 00471 float weight = 1.f - fabsf(128.f - lag2tempo(i)) * 0.005; 00472 if (weight < 0.f) weight = 0.f; 00473 weight = weight * weight * weight; 00474 00475 m_fr[i] += m_fr[i] * (weight / 3); 00476 } 00477 } 00478 00479 FixedTempoEstimator::FeatureSet 00480 FixedTempoEstimator::D::assembleFeatures() 00481 { 00482 FeatureSet fs; 00483 if (!m_r) return fs; // No autocorrelation: no results 00484 00485 Feature feature; 00486 feature.hasTimestamp = true; 00487 feature.hasDuration = false; 00488 feature.label = ""; 00489 feature.values.clear(); 00490 feature.values.push_back(0.f); 00491 00492 char buffer[40]; 00493 00494 int n = m_n; 00495 00496 for (int i = 0; i < n; ++i) { 00497 00498 // Return the detection function in the DF output 00499 00500 feature.timestamp = m_start + 00501 RealTime::frame2RealTime(i * m_stepSize, m_inputSampleRate); 00502 feature.values[0] = m_df[i]; 00503 feature.label = ""; 00504 fs[DFOutput].push_back(feature); 00505 } 00506 00507 for (int i = 1; i < n/2; ++i) { 00508 00509 // Return the raw autocorrelation in the ACF output, each 00510 // value labelled according to its corresponding tempo 00511 00512 feature.timestamp = m_start + 00513 RealTime::frame2RealTime(i * m_stepSize, m_inputSampleRate); 00514 feature.values[0] = m_r[i]; 00515 sprintf(buffer, "%.1f bpm", lag2tempo(i)); 00516 if (i == n/2-1) feature.label = ""; 00517 else feature.label = buffer; 00518 fs[ACFOutput].push_back(feature); 00519 } 00520 00521 float t0 = m_minbpm; // our minimum detected tempo 00522 float t1 = m_maxbpm; // our maximum detected tempo 00523 00524 int p0 = tempo2lag(t1); 00525 int p1 = tempo2lag(t0); 00526 00527 std::map<float, int> candidates; 00528 00529 for (int i = p0; i <= p1 && i+1 < n/2; ++i) { 00530 00531 if (m_fr[i] > m_fr[i-1] && 00532 m_fr[i] > m_fr[i+1]) { 00533 00534 // This is a peak in the filtered autocorrelation: stick 00535 // it into the map from filtered autocorrelation to lag 00536 // index -- this sorts our peaks by filtered acf value 00537 00538 candidates[m_fr[i]] = i; 00539 } 00540 00541 // Also return the filtered autocorrelation in its own output 00542 00543 feature.timestamp = m_start + 00544 RealTime::frame2RealTime(i * m_stepSize, m_inputSampleRate); 00545 feature.values[0] = m_fr[i]; 00546 sprintf(buffer, "%.1f bpm", lag2tempo(i)); 00547 if (i == p1 || i == n/2-2) feature.label = ""; 00548 else feature.label = buffer; 00549 fs[FilteredACFOutput].push_back(feature); 00550 } 00551 00552 if (candidates.empty()) { 00553 cerr << "No tempo candidates!" << endl; 00554 return fs; 00555 } 00556 00557 feature.hasTimestamp = true; 00558 feature.timestamp = m_start; 00559 00560 feature.hasDuration = true; 00561 feature.duration = m_lasttime - m_start; 00562 00563 // The map contains only peaks and is sorted by filtered acf 00564 // value, so the final element in it is our "best" tempo guess 00565 00566 std::map<float, int>::const_iterator ci = candidates.end(); 00567 --ci; 00568 int maxpi = ci->second; 00569 00570 if (m_t[maxpi] > 0) { 00571 00572 // This lag has an adjusted tempo from the averaging process: 00573 // use it 00574 00575 feature.values[0] = m_t[maxpi]; 00576 00577 } else { 00578 00579 // shouldn't happen -- it would imply that this high value was 00580 // not a peak! 00581 00582 feature.values[0] = lag2tempo(maxpi); 00583 cerr << "WARNING: No stored tempo for index " << maxpi << endl; 00584 } 00585 00586 sprintf(buffer, "%.1f bpm", feature.values[0]); 00587 feature.label = buffer; 00588 00589 // Return the best tempo in the main output 00590 00591 fs[TempoOutput].push_back(feature); 00592 00593 // And return the other estimates (up to the arbitrarily chosen 00594 // number of 10 of them) in the candidates output 00595 00596 feature.values.clear(); 00597 feature.label = ""; 00598 00599 while (feature.values.size() < 10) { 00600 if (m_t[ci->second] > 0) { 00601 feature.values.push_back(m_t[ci->second]); 00602 } else { 00603 feature.values.push_back(lag2tempo(ci->second)); 00604 } 00605 if (ci == candidates.begin()) break; 00606 --ci; 00607 } 00608 00609 fs[CandidatesOutput].push_back(feature); 00610 00611 return fs; 00612 } 00613 00614 00615 00616 FixedTempoEstimator::FixedTempoEstimator(float inputSampleRate) : 00617 Plugin(inputSampleRate), 00618 m_d(new D(inputSampleRate)) 00619 { 00620 } 00621 00622 FixedTempoEstimator::~FixedTempoEstimator() 00623 { 00624 delete m_d; 00625 } 00626 00627 string 00628 FixedTempoEstimator::getIdentifier() const 00629 { 00630 return "fixedtempo"; 00631 } 00632 00633 string 00634 FixedTempoEstimator::getName() const 00635 { 00636 return "Simple Fixed Tempo Estimator"; 00637 } 00638 00639 string 00640 FixedTempoEstimator::getDescription() const 00641 { 00642 return "Study a short section of audio and estimate its tempo, assuming the tempo is constant"; 00643 } 00644 00645 string 00646 FixedTempoEstimator::getMaker() const 00647 { 00648 return "Vamp SDK Example Plugins"; 00649 } 00650 00651 int 00652 FixedTempoEstimator::getPluginVersion() const 00653 { 00654 return 1; 00655 } 00656 00657 string 00658 FixedTempoEstimator::getCopyright() const 00659 { 00660 return "Code copyright 2008 Queen Mary, University of London. Freely redistributable (BSD license)"; 00661 } 00662 00663 size_t 00664 FixedTempoEstimator::getPreferredStepSize() const 00665 { 00666 return m_d->getPreferredStepSize(); 00667 } 00668 00669 size_t 00670 FixedTempoEstimator::getPreferredBlockSize() const 00671 { 00672 return m_d->getPreferredBlockSize(); 00673 } 00674 00675 bool 00676 FixedTempoEstimator::initialise(size_t channels, size_t stepSize, size_t blockSize) 00677 { 00678 if (channels < getMinChannelCount() || 00679 channels > getMaxChannelCount()) return false; 00680 00681 return m_d->initialise(channels, stepSize, blockSize); 00682 } 00683 00684 void 00685 FixedTempoEstimator::reset() 00686 { 00687 return m_d->reset(); 00688 } 00689 00690 FixedTempoEstimator::ParameterList 00691 FixedTempoEstimator::getParameterDescriptors() const 00692 { 00693 return m_d->getParameterDescriptors(); 00694 } 00695 00696 float 00697 FixedTempoEstimator::getParameter(std::string id) const 00698 { 00699 return m_d->getParameter(id); 00700 } 00701 00702 void 00703 FixedTempoEstimator::setParameter(std::string id, float value) 00704 { 00705 m_d->setParameter(id, value); 00706 } 00707 00708 FixedTempoEstimator::OutputList 00709 FixedTempoEstimator::getOutputDescriptors() const 00710 { 00711 return m_d->getOutputDescriptors(); 00712 } 00713 00714 FixedTempoEstimator::FeatureSet 00715 FixedTempoEstimator::process(const float *const *inputBuffers, RealTime ts) 00716 { 00717 return m_d->process(inputBuffers, ts); 00718 } 00719 00720 FixedTempoEstimator::FeatureSet 00721 FixedTempoEstimator::getRemainingFeatures() 00722 { 00723 return m_d->getRemainingFeatures(); 00724 }