How Does pvscale work?

Emmett_Palaima · July 13, 2023, 5:24pm

Hi, I am studying the code from pvscale for a phase vocoder processing project. Had some questions about how it functions, especially since it calls on a bunch of external variables that aren’t commented.

I’m looking at the pvscale code from the repo here, which I’ll copy into the post for good measure:

/*
 * pvsscale: performance routine that writes a scaled copy of the input
 * frame p->fin into the output frame p->fout.
 *
 * Inputs read from p:
 *   kscal    - scaling ratio; only its absolute value is used.
 *   keepform - mode selector (0, 1, 2 or >2) controlling the envelope
 *              handling below.
 *   gain     - multiplier applied to the output values at the end.
 *   coefs    - number of coefficients kept in the modes 1/2 smoothing;
 *              values below 1 fall back to 80.
 *
 * Returns OK, or the result of csound->PerfError() when the output
 * buffer is NULL.
 *
 * Two independent paths exist:
 *   - p->fout->sliding set: one CMPLX frame of NB bins is processed for
 *     every sample of the current block.
 *   - otherwise: one frame of N+2 floats, handled as interleaved pairs
 *     (index i and i+1), is processed whenever the input frame counter
 *     has advanced past p->lastframe.
 * NOTE(review): the pairs are presumably amplitude (even index / .re)
 * and frequency in Hz (odd index / .im) - the aliasing test against
 * CS_ESR*0.5 suggests this, but the format is declared elsewhere.
 */
static int32_t pvsscale(CSOUND *csound, PVSSCALE *p)
{
  int32_t     i, chan, N = p->fout->N;
  float   max = 0.0f;
  /* sign of the ratio is discarded */
  MYFLT   pscal = FABS(*p->kscal);
  int32_t     keepform = (int32_t) *p->keepform;
  float   g = (float) *p->gain;
  float   *fin = (float *) p->fin->frame.auxp;
  float   *fout = (float *) p->fout->frame.auxp;
  MYFLT   *fenv = (MYFLT *) p->fenv.auxp;
  float   *ftmp = (float *) p->ftmp.auxp;
  MYFLT   *ceps = (MYFLT *) p->ceps.auxp;
  float sr = CS_ESR, binf;
  int32_t coefs = (int32_t) *p->coefs;

  /* output buffer was never allocated: report an error */
  if (UNLIKELY(fout == NULL)) goto err1;

  /* ---- sliding path: one frame of NB CMPLX bins per sample ---- */
  if (p->fout->sliding) {
    uint32_t offset = p->h.insdshead->ksmps_offset;
    uint32_t n, nsmps = CS_KSMPS;
    int32_t NB    = p->fout->NB;
    /* note: g, max, fin and fout declared in this branch shadow the
       outer variables of the same name */
    MYFLT   g = *p->gain;
    /* frames before the block offset are cleared */
    for (n=0; n<offset; n++) {
      CMPLX   *fout = (CMPLX *) p->fout->frame.auxp + n*NB;
      for (i = 0; i < NB; i++) fout[i].re = fout[i].im = FL(0.0);
    }
    for (n=offset; n<nsmps; n++) {
      MYFLT    max = FL(0.0);
      CMPLX   *fin = (CMPLX *) p->fin->frame.auxp + n*NB;
      CMPLX   *fout = (CMPLX *) p->fout->frame.auxp + n*NB;

      /* first and last bins are passed through unchanged */
      fout[0] = fin[0];
      fout[NB-1] = fin[NB-1];
      /* audio-rate ratio: take a fresh value for every sample */
      if (IS_ASIG_ARG(p->kscal)) {
        pscal = FABS(p->kscal[n]);
      }
      /* largest .re among the inner bins, used for the weighting below */
      if (keepform)
        for (i = 1; i < NB-1; i++) {
          max = max < fin[i].re ? fin[i].re : max;
        }

      for (i = 1; i < NB-1; i++) {
        /* only keepform values other than 0 and 1 (and a non-zero max)
           weight .re by its ratio to the maximum */
        if (keepform == 0 || keepform == 1 || !max)
          fout[i].re = fin[i].re;
        else
          fout[i].re = fin[i].re * (fin[i].re / max);
        /* bins stay in place; only the .im value is scaled */
        fout[i].im = fin[i].im * pscal;
        /* Remove aliases: silence bins whose scaled .im reaches or
           exceeds half of CS_ESR in either direction */
        if (fout[i].im>=CS_ESR*0.5 ||
            fout[i].im<= -CS_ESR*0.5)
          fout[i].re=0.0;
      }

      /* apply gain to bins 1..NB-1 (bin 0 is left as copied) */
      for (i = 1; i < NB; i++) {
        fout[i].re *= g;
      }
    }
    return OK;
  }
  /* ---- frame path: run only when a new input frame is available ---- */
  if (p->lastframe < p->fin->framecount) {
    int32_t n;
    /* values at index 0 and N are copied straight through */
    fout[0] = fin[0];
    fout[N] = fin[N];
    /* work on a private copy of the N+2 input floats */
    memcpy(ftmp,fin,sizeof(float)*(N+2));

    /* reset the output pairs: even slot to 0 and odd slot to -1.0f,
       which serves as an "unfilled" marker tested in the final loop;
       fenv[1..N/2-1] is cleared at the same time */
    for (i = 2, n=1; i < N; i += 2, n++) {
      fout[i] = 0.0f;
      fout[i + 1] = -1.0f;
      fenv[n] = 0.f;
    }

    if (keepform) {
      int32_t cond = 1;
      int32_t j;
      /* fenv[j] = log of the even-index input values; non-positive
         values are replaced by 1e-20 before taking the log */
      for (i=j=0; i < N; i+=2, j++)
        fenv[j] = LOG(ftmp[i] > 0.0 ? ftmp[i] : 1e-20);


      if (keepform > 2) { /* experimental mode 3 */
        /* smooth the log values with a running mean over 2*w
           neighbours (offsets -w .. w-1); the first w entries are
           copied unsmoothed */
        int32_t w = 5, w2  = w*2;
        for (i=0; i < w; i++) ceps[i] = fenv[i];
        for (i=w; i < N/2-w; i++) {
          ceps[i] = 0.0;
          for (j=-w; j < w; j++)
            ceps[i] += fenv[i+j];
          ceps[i] /= w2;
        }
        /* NOTE(review): ceps[N/2-w .. N/2-1] is not written by the two
           loops above but is read here, so those entries hold whatever
           the buffer contained before - confirm this is intended */
        for (i=0; i<N/2; i++) {
          fenv[i] = EXP(ceps[i]);
          max = max < fenv[i] ? fenv[i] : max;
        }
        /* normalise the envelope by its maximum and divide it out of
           the working copy for bins with binf below pscal*sr/2 */
        if (max)
          for (j=i=0; i<N; i+=2, j++) {
            fenv[j]/=max;
            binf = (j)*sr/N;
            if (fenv[j] && binf < pscal*sr/2 )
              ftmp[i] /= fenv[j];
          }
      }
      else {  /* new modes 1 & 2 */
        /* tmp = N/2 rounded up to an even number; used as the size
           argument of the non-power-of-two FFT calls.
           This j shadows the j declared in the enclosing block. */
        int32_t tmp = N/2,j;
        tmp = tmp + tmp%2;
        if (coefs < 1) coefs = 80;
        /* mode 1 runs this loop once (cond is only set again in mode 2);
           mode 2 repeats it until no bin satisfies the 0.23 test below */
        while(cond) {
          cond = 0;
          for (i=0; i < N/2; i++) {
            ceps[i] = fenv[i];
          }
          /* forward transform of the log values; !(N & (N - 1)) is
             true when N is a power of two */
          if (!(N & (N - 1)))
            csound->RealFFT2(csound, p->fwdsetup, ceps);
          else
            csound->RealFFTnp2(csound, ceps, tmp);
          /* keep only the first coefs entries */
          for (i=coefs; i < N/2; i++) ceps[i] = 0.0;
          /* transform back: ceps now holds the smoothed log values */
          if (!(N & (N - 1)))
            csound->RealFFT2(csound, p->invsetup, ceps);
          else
            csound->InverseRealFFTnp2(csound, ceps, tmp);
          for (i=j=0; i < N/2; i++, j+=2) {
            if (keepform > 1) {
              /* mode 2: raise fenv to the smoothed value where it is
                 lower, and request another pass while any log input
                 exceeds the smoothed value by more than 0.23 */
              if (fenv[i] < ceps[i])
                fenv[i] = ceps[i];
              if ((LOG(ftmp[j]) - ceps[i]) > FL(0.23)) cond = 1;
            }
            else
              {
                /* mode 1: leave the log domain and track the maximum */
                fenv[i] = EXP(ceps[i]);
                max = max < fenv[i] ? fenv[i] : max;
              }
          }
        }
        /* mode 2: leave the log domain once the iteration has settled */
        if (keepform > 1)
          for (i=0; i<N/2; i++) {
            fenv[i] = EXP(ceps[i]);
            max = max < fenv[i] ? fenv[i] : max;
          }

        /* normalise the envelope and divide it out of the working copy.
           NOTE(review): i and j both start at 2 here, so ftmp[j] is
           paired with fenv[j/2 + 1], unlike the j = 2*i pairing used in
           the loops above and in mode 3; fenv[0] and fenv[1] are also
           left un-normalised - confirm this is intended */
        if (max)
          for (i=j=2; i<N/2; i++, j+=2) {
            fenv[i]/=max;
            binf = (i)*sr/N;
            if (fenv[i] && binf < pscal*sr/2 )
              ftmp[j] /= fenv[i];
          }
      }
    }
    /* move every input pair to the output pair whose bin number is
       chan*pscal rounded to nearest; targets outside 1..N/2-1 are
       dropped. The odd slot is multiplied by pscal in both branches. */
    if(keepform) {
      for (i = 2, chan = 1; i < N; chan++, i += 2) {
        int32_t newchan;
        /* the cast is applied before the shift: newchan is the even
           index of the destination pair */
        newchan  = (int32_t) ((chan * pscal)+0.5) << 1;
        if (newchan < N && newchan > 0) {
          /* re-apply the envelope value of the destination bin */
          fout[newchan] = ftmp[i]*fenv[newchan>>1];
          fout[newchan + 1] = (float) (ftmp[i + 1] * pscal);
        }
      }
    } else {
      for (i = 2, chan = 1; i < N; chan++, i += 2) {
        int32_t newchan;
        newchan  = (int32_t) ((chan * pscal)+0.5) << 1;
        if (newchan < N && newchan > 0) {
          fout[newchan] = ftmp[i];
          fout[newchan + 1] = (float) (ftmp[i + 1] * pscal);
        }
      }
    }

    /* final pass: replace NaN by 0, zero the pairs that still carry the
       -1.0f marker (nothing was mapped onto them), and apply the gain
       to all the others */
    for (i = 2; i < N; i += 2) {
      if (isnan(fout[i])) fout[i] = 0.0f;
      if (fout[i + 1] == -1.0f) {
        fout[i] = 0.f;
      }
      else
        fout[i] *= g;
    }
    /* mark this input frame as consumed and publish the frame count */
    p->fout->framecount = p->lastframe = p->fin->framecount;
  }
  return OK;
 err1:
  return csound->PerfError(csound, &(p->h),
                           Str("pvscale: not initialised"));
}

I wanted to ask what the p->fout->sliding variable means? This seems to select between two different processing loops, and I’m not sure if it applies to my application.
What is N? This seems to be the fft frame size, but not sure.
I am not interested in formant preservation at present, so I am trying to pare the code down to essential sections without that. It seems like the essential section of the code is scaling the bins and phase coefficient by a given ratio by copying the bins to a new pvoc stream and multiplying phase by the scaling ratio. Is there anything else that is required for basic pitch shifting? Is there any reason that if I apply only this processing to a pv stream it won’t achieve the pitch shift result I hear in the example?

      /* For every input pair (i, i+1) with bin number chan, compute the
         destination bin as chan*pscal rounded to nearest and copy the
         pair there; destinations outside 1..N/2-1 are skipped.
         NOTE(review): the even slot is presumably the bin amplitude and
         the odd slot its frequency - confirm against the frame format. */
      for (i = 2, chan = 1; i < N; chan++, i += 2) {
        int32_t newchan;
        /* cast happens before the shift: newchan is an even index */
        newchan  = (int32_t) ((chan * pscal)+0.5) << 1;
        if (newchan < N && newchan > 0) {
          fout[newchan] = ftmp[i];
          /* the odd-slot value is multiplied by the ratio */
          fout[newchan + 1] = (float) (ftmp[i + 1] * pscal);
        }
      }

Lovre · July 14, 2023, 11:43am

Hi @Emmett_Palaima

Unfortunately, I can’t answer your questions but there is a very nice (and not easy) chapter in The Audio Programming Book (chapter 9), written by Victor. The chapter explains and gives an example of phase vocoder implementation. Maybe if you look at that first, it will be easier to understand this code.

Best,
Lovre

Emmett_Palaima · July 14, 2023, 3:51pm

So maybe one follow-up question here is what resynthesis method pvscale expects. The audio programming book mentions multiple methods, one of which is an IFFT and one of which is direct additive synthesis (the book says this is more suited to applications like pitch shifting).

Does the pvscale code expect additive resynthesis? Or will it work with the IFFT method?

Is there a method of pitch shifting that can use the IFFT method? This is for a hardware project so the processing overhead required by additive resynthesis is likely going to be prohibitive.

Emmett_Palaima · July 14, 2023, 6:59pm

Also could someone just please confirm what N refers to?