Hi, I am studying the code of pvscale for a phase vocoder processing project. I have some questions about how it works, especially since it relies on a number of external variables that aren’t commented.
I’m looking at the pvscale code from the repo here, which I’ll copy into the post for good measure:
/*
 * pvsscale: performance routine that fills the output frame p->fout with a
 * rescaled copy of the input frame p->fin.
 *
 * Inputs read from p: *p->kscal (its absolute value is the scaling ratio),
 * *p->keepform (envelope mode; 0 disables it), *p->gain, *p->coefs.
 *
 * Two processing paths:
 *   - p->fout->sliding != 0: one frame of NB CMPLX bins per sample of the
 *     current block, processed for every sample.
 *   - otherwise: a single float frame of N+2 values, processed only when the
 *     input frame counter has advanced past p->lastframe.
 *
 * Returns OK, or the result of csound->PerfError() when the output frame
 * buffer is NULL.
 *
 * NOTE(review): the float frame is accessed as pairs (fout[i], fout[i+1]);
 * presumably these are amplitude/frequency pairs per bin - confirm against the
 * fsig format definition.
 */
static int32_t pvsscale(CSOUND *csound, PVSSCALE *p)
{
int32_t i, chan, N = p->fout->N;
/* running maximum of the spectral envelope (non-sliding path) */
float max = 0.0f;
/* scaling ratio: the sign of kscal is discarded */
MYFLT pscal = FABS(*p->kscal);
int32_t keepform = (int32_t) *p->keepform;
float g = (float) *p->gain;
float *fin = (float *) p->fin->frame.auxp;
float *fout = (float *) p->fout->frame.auxp;
/* work buffers: fenv = envelope, ftmp = copy of the input frame,
   ceps = scratch used for smoothing / transform */
MYFLT *fenv = (MYFLT *) p->fenv.auxp;
float *ftmp = (float *) p->ftmp.auxp;
MYFLT *ceps = (MYFLT *) p->ceps.auxp;
float sr = CS_ESR, binf;
int32_t coefs = (int32_t) *p->coefs;
if (UNLIKELY(fout == NULL)) goto err1;
/* ---- sliding path: one CMPLX frame per sample ---- */
if (p->fout->sliding) {
uint32_t offset = p->h.insdshead->ksmps_offset;
uint32_t n, nsmps = CS_KSMPS;
int32_t NB = p->fout->NB;
/* shadows the outer float g with a MYFLT copy of the gain */
MYFLT g = *p->gain;
/* frames for samples before ksmps_offset are cleared */
for (n=0; n<offset; n++) {
CMPLX *fout = (CMPLX *) p->fout->frame.auxp + n*NB;
for (i = 0; i < NB; i++) fout[i].re = fout[i].im = FL(0.0);
}
for (n=offset; n<nsmps; n++) {
/* per-sample maximum of .re; shadows the outer max */
MYFLT max = FL(0.0);
/* frame n starts n*NB bins into the buffer; these shadow the float pointers */
CMPLX *fin = (CMPLX *) p->fin->frame.auxp + n*NB;
CMPLX *fout = (CMPLX *) p->fout->frame.auxp + n*NB;
/* first and last bins are passed through unchanged */
fout[0] = fin[0];
fout[NB-1] = fin[NB-1];
/* audio-rate kscal: take the ratio for this sample */
if (IS_ASIG_ARG(p->kscal)) {
pscal = FABS(p->kscal[n]);
}
/* with keepform set, find the largest .re among the interior bins */
if (keepform)
for (i = 1; i < NB-1; i++) {
max = max < fin[i].re ? fin[i].re : max;
}
for (i = 1; i < NB-1; i++) {
/* .re is copied as-is for keepform 0 or 1, or when max is zero;
   for any other keepform value it is weighted by re/max */
if (keepform == 0 || keepform == 1 || !max)
fout[i].re = fin[i].re;
else
fout[i].re = fin[i].re * (fin[i].re / max);
/* .im is multiplied by the scaling ratio; the bin index is not changed */
fout[i].im = fin[i].im * pscal;
/* Remove aliases */
/* .re is zeroed when the scaled .im reaches +/- half of CS_ESR */
if (fout[i].im>=CS_ESR*0.5 ||
fout[i].im<= -CS_ESR*0.5)
fout[i].re=0.0;
}
/* gain is applied to bins 1..NB-1 (bin 0 is left untouched) */
for (i = 1; i < NB; i++) {
fout[i].re *= g;
}
}
return OK;
}
/* ---- non-sliding path: runs only when a new input frame is available ---- */
if (p->lastframe < p->fin->framecount) {
int32_t n;
/* only the first element of the first and last pairs is copied */
fout[0] = fin[0];
fout[N] = fin[N];
/* work on a copy of the whole input frame (N+2 floats) */
memcpy(ftmp,fin,sizeof(float)*(N+2));
/* mark every interior output bin as unused: value 0 and a -1.0f sentinel in
   the second element (checked in the clean-up loop at the end), and clear
   fenv[1..N/2-1] */
for (i = 2, n=1; i < N; i += 2, n++) {
fout[i] = 0.0f;
fout[i + 1] = -1.0f;
fenv[n] = 0.f;
}
/* envelope estimation: only when keepform is non-zero */
if (keepform) {
int32_t cond = 1;
int32_t j;
/* fenv[j] = log of the first element of pair j; non-positive values are
   replaced by 1e-20 before taking the log */
for (i=j=0; i < N; i+=2, j++)
fenv[j] = LOG(ftmp[i] > 0.0 ? ftmp[i] : 1e-20);
if (keepform > 2) { /* experimental mode 3 */
/* moving average of the log values over the 2*w entries i-w .. i+w-1 */
int32_t w = 5, w2 = w*2;
for (i=0; i < w; i++) ceps[i] = fenv[i];
for (i=w; i < N/2-w; i++) {
ceps[i] = 0.0;
for (j=-w; j < w; j++)
ceps[i] += fenv[i+j];
ceps[i] /= w2;
}
/* NOTE(review): ceps[N/2-w .. N/2-1] is not written by the loops above but
   is read below - confirm whether stale buffer contents are intended */
for (i=0; i<N/2; i++) {
fenv[i] = EXP(ceps[i]);
max = max < fenv[i] ? fenv[i] : max;
}
/* normalise the envelope by its maximum and divide it out of the values of
   bins whose centre frequency j*sr/N is below pscal*sr/2 */
if (max)
for (j=i=0; i<N; i+=2, j++) {
fenv[j]/=max;
binf = (j)*sr/N;
if (fenv[j] && binf < pscal*sr/2 )
ftmp[i] /= fenv[j];
}
}
else { /* new modes 1 & 2 */
/* this j shadows the one declared above; tmp is N/2 rounded up to even */
int32_t tmp = N/2,j;
tmp = tmp + tmp%2;
/* fall back to 80 coefficients when coefs is below 1 */
if (coefs < 1) coefs = 80;
/* mode 1 runs this loop once; mode 2 repeats it until no bin requests
   another pass (cond is set again below) */
while(cond) {
cond = 0;
for (i=0; i < N/2; i++) {
ceps[i] = fenv[i];
}
/* forward transform of the log values; power-of-two N uses the prepared
   setup, other sizes use the non-power-of-two routine */
if (!(N & (N - 1)))
csound->RealFFT2(csound, p->fwdsetup, ceps);
else
csound->RealFFTnp2(csound, ceps, tmp);
/* keep only the first coefs entries of the transform */
for (i=coefs; i < N/2; i++) ceps[i] = 0.0;
/* transform back: ceps now holds the smoothed log values */
if (!(N & (N - 1)))
csound->RealFFT2(csound, p->invsetup, ceps);
else
csound->InverseRealFFTnp2(csound, ceps, tmp);
for (i=j=0; i < N/2; i++, j+=2) {
if (keepform > 1) {
/* mode 2: raise fenv to the smoothed curve where the curve is higher, and
   request another pass while any log value exceeds the curve by > 0.23 */
if (fenv[i] < ceps[i])
fenv[i] = ceps[i];
if ((LOG(ftmp[j]) - ceps[i]) > FL(0.23)) cond = 1;
}
else
{
/* mode 1: the smoothed curve is the envelope; cond stays 0, so one pass */
fenv[i] = EXP(ceps[i]);
max = max < fenv[i] ? fenv[i] : max;
}
}
}
/* mode 2: convert the final smoothed curve out of the log domain */
if (keepform > 1)
for (i=0; i<N/2; i++) {
fenv[i] = EXP(ceps[i]);
max = max < fenv[i] ? fenv[i] : max;
}
/* normalise the envelope and divide it out of the values below pscal*sr/2 */
/* NOTE(review): i and j both start at 2 and j advances by 2, so ftmp[j] is
   index 2*(i-1) while fenv[i] and binf use i; fenv[0] and fenv[1] are also
   left un-normalised. The mode-3 loop above pairs ftmp[2*j] with fenv[j].
   Confirm whether this offset is intended. */
if (max)
for (i=j=2; i<N/2; i++, j+=2) {
fenv[i]/=max;
binf = (i)*sr/N;
if (fenv[i] && binf < pscal*sr/2 )
ftmp[j] /= fenv[i];
}
}
}
/* bin remapping: input bin chan is written to output bin
   round(chan*pscal); the cast is applied before the << 1, which converts the
   bin number into an index into the interleaved float frame. Bins that land
   outside 1..N/2-1 are dropped; when several input bins map to the same
   output bin, the highest-numbered one wins. */
if(keepform) {
for (i = 2, chan = 1; i < N; chan++, i += 2) {
int32_t newchan;
newchan = (int32_t) ((chan * pscal)+0.5) << 1;
if (newchan < N && newchan > 0) {
/* re-apply the envelope at the destination bin */
fout[newchan] = ftmp[i]*fenv[newchan>>1];
fout[newchan + 1] = (float) (ftmp[i + 1] * pscal);
}
}
} else {
for (i = 2, chan = 1; i < N; chan++, i += 2) {
int32_t newchan;
newchan = (int32_t) ((chan * pscal)+0.5) << 1;
if (newchan < N && newchan > 0) {
/* first element is moved unchanged, second element is scaled by pscal */
fout[newchan] = ftmp[i];
fout[newchan + 1] = (float) (ftmp[i + 1] * pscal);
}
}
}
/* clean-up: replace NaN values by 0, force bins that still carry the -1.0f
   sentinel (never written above) to 0, and apply the gain to the others */
for (i = 2; i < N; i += 2) {
if (isnan(fout[i])) fout[i] = 0.0f;
if (fout[i + 1] == -1.0f) {
fout[i] = 0.f;
}
else
fout[i] *= g;
}
/* remember which input frame was processed and stamp the output with it */
p->fout->framecount = p->lastframe = p->fin->framecount;
}
return OK;
err1:
/* reached when the output frame buffer pointer is NULL */
return csound->PerfError(csound, &(p->h),
Str("pvscale: not initialised"));
}
-
I wanted to ask what the p->fout->sliding variable means? This seems to select between two different processing loops, and I’m not sure if it applies to my application.
-
What is N? This seems to be the fft frame size, but not sure.
-
I am not interested in formant preservation at present, so I am trying to pare the code down to the essential sections without it. It seems like the essential section of the code scales the bins and the phase coefficient by a given ratio, by copying the bins to a new pvoc stream and multiplying the phase by the scaling ratio. Is there anything else that is required for basic pitch shifting? Is there any reason that, if I apply only this processing to a pv stream, it won’t achieve the pitch shift result I hear in the example?
/* Excerpt of the bin-remapping loop (uses i, chan, N, pscal, ftmp and fout
   from the enclosing function). The frame is walked in pairs of floats:
   pair i/2 is input bin chan. Each bin is written to output bin
   round(chan*pscal); the << 1 converts that bin number into an index into
   the interleaved frame. Destinations outside 1..N/2-1 are skipped. */
for (i = 2, chan = 1; i < N; chan++, i += 2) {
int32_t newchan;
newchan = (int32_t) ((chan * pscal)+0.5) << 1;
if (newchan < N && newchan > 0) {
/* first element of the pair is copied, second element is scaled by pscal */
fout[newchan] = ftmp[i];
fout[newchan + 1] = (float) (ftmp[i + 1] * pscal);
}
}