Short-latency sound control

Hi,

I am new to Csound and to PureData. We have used PureData to implement a gesture-controlled sound generator - our emphasis is on minimizing latency between gesture (sent via OSC to Pd) and sound. I like Csound's programmatic approach much more, but I have read here and there that PureData has more of a real-time emphasis and so may be "quicker" to respond. Is this correct? I would use Csound if I could, so I thought I would get some feedback here before/in addition to testing it myself.

Thanks!

There should be absolutely no difference in response time between Pd and Csound. If you prefer the programmatic approach of Csound, then I think you have your answer :wink:
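For what it's worth, the response time in either system is set mainly by the block size and the audio buffer sizes rather than by the language. Here is a minimal sketch of how you might tighten those in Csound when running it from Python via ctcsound - the buffer values are only assumptions, and how small you can go without dropouts depends on your hardware and driver:

import ctcsound

cs = ctcsound.Csound()
cs.setOption('-odac')   # real-time audio output
cs.setOption('-b128')   # software buffer in sample frames: smaller = lower latency, more CPU
cs.setOption('-B512')   # hardware buffer, usually a few times larger than -b
cs.compileOrc('''
sr     = 48000  ; assuming a 48 kHz interface
ksmps  = 32     ; small control block so score/channel events take effect sooner
nchnls = 2
0dbfs  = 1
''')
cs.start()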


@suresh I don't know which gestures you are detecting, but since Csound has very nice Python bindings (ctcsound.py) you could run and control Csound instruments directly from a Python script, and then use e.g. the MediaPipe library (GitHub - google-ai-edge/mediapipe: Cross-platform, customizable ML solutions for live and streaming media.) for tracking hands/fingers. Below is a script that tracks the index finger of the left hand and uses it to control amplitude (y axis) and frequency (x axis):

CAMERA_ID = 0  # if multiple cameras are connected, select which one to use here

# MediaPipe parameters
MAX_NUM_HANDS = 2         
MODEL_COMPLEXITY = 0      
MIN_DETECTION_CONFIDENCE = 0.5 
MIN_TRACKING_CONFIDENCE = 0.5 

# ------------------------------------------------------------
# Imports
import cv2
import mediapipe as mp
import ctcsound

# ------------------------------------------------------------
# MediaPipe setup
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles
mp_hands = mp.solutions.hands
# ------------------------------------------------------------

# ------------------------------------------------------------
# Csound setup
# ------------------------------------------------------------
orc = '''
ksmps = 128
nchnls = 2
0dbfs = 1

instr 1
    ; IO channels
    kamp chnget "amp"      ; reading value from amp input channel 
    kfreq chnget "freq"    ; reading value from freq input channel 

    aout poscil kamp, kfreq                     ; audio synthesis
    outs aout, aout                           ; sending audio to speakers
endin
'''

cs = ctcsound.Csound()
pt = None # csound performance thread

cs.setOption('-odac')
cs.setOption('-b1024')  # SW buffer size
cs.setOption('-B1024')  # HW buffer size
cs.compileOrc(orc)
cs.readScore('i 1 0 10000')  # keep instrument 1 running (10000 seconds)
cs.start()

pt = ctcsound.CsoundPerformanceThread(cs.csound())
pt.play()

# Create control IO channels 
def createChannel(channelName):
    chn, _ = cs.channelPtr(channelName,
                           ctcsound.CSOUND_CONTROL_CHANNEL | ctcsound.CSOUND_INPUT_CHANNEL)
    return chn

ampChannel = createChannel("amp")   # uses utility method to create a channel and get numpy array to write to
freqChannel = createChannel("freq")
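# Note: cs.setControlChannel("amp", value) would also work, but channelPtr()
# returns a numpy view of the channel memory, so plain assignment
# (ampChannel[0] = value) skips a channel-name lookup on every video frame.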
# ------------------------------------------------------------


# ------------------------------------------------------------
# Processing part
cap = cv2.VideoCapture(CAMERA_ID)
with mp_hands.Hands(
    static_image_mode = False,
    max_num_hands = MAX_NUM_HANDS,
    model_complexity = MODEL_COMPLEXITY,
    min_detection_confidence = MIN_DETECTION_CONFIDENCE,
    min_tracking_confidence = MIN_TRACKING_CONFIDENCE) as hands:

    # Main loop
    while cap.isOpened():
        success, image = cap.read()
        if not success:
            print("Ignoring empty camera frame.")
            continue 
           
        # To improve performance, optionally mark the image as not writeable to pass by reference.
        image.flags.writeable = False
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        results = hands.process(image)

        image_height, image_width, _ = image.shape

        # Iterate through hands, draw hand landmarks on image and write to csound control channels
        image.flags.writeable = True
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
        ampChannel[0] = 0.0 # if hand is not detected, set amp to 0 to turn off sound
        if results.multi_hand_landmarks:
            for hand_landmarks in results.multi_hand_landmarks:
                mp_drawing.draw_landmarks(
                    image,
                    hand_landmarks,
                    mp_hands.HAND_CONNECTIONS,
                    mp_drawing_styles.get_default_hand_landmarks_style(),
                    mp_drawing_styles.get_default_hand_connections_style())
                
                handIndex = results.multi_hand_landmarks.index(hand_landmarks)
                handLabel = results.multi_handedness[handIndex].classification[0].label

                # image is mirrored so we need to invert left and right hand detection 
                if handLabel == 'Left':  handLabel = 'Right'
                elif handLabel == 'Right':  handLabel = 'Left'

                # Write to control channels
                if handLabel == 'Left':
                    freqChannel[0] = 400 - abs(hand_landmarks.landmark[mp_hands.HandLandmark.INDEX_FINGER_TIP].x)*300 
                    ampChannel[0] = 1-abs(hand_landmarks.landmark[mp_hands.HandLandmark.INDEX_FINGER_TIP].y) 

        # Flip the image horizontally for a selfie-view display.
        cv2.imshow('MediaPipe Hands', cv2.flip(image, 1))
        if cv2.waitKey(5) & 0xFF == 27:
            break

cap.release()
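One thing I would add at the very end: the Csound performance thread keeps playing after the camera loop exits, so a short shutdown sketch (standard ctcsound / OpenCV calls) could look like this:

cv2.destroyAllWindows()   # close the preview window
pt.stop()                 # ask the performance thread to finish
pt.join()                 # wait for it to exit
cs.cleanup()              # release Csound resources

And since you already send gestures over OSC: Csound can also receive OSC directly (see the OSCinit / OSClisten opcodes), so you could keep your existing gesture pipeline and skip the camera part entirely if you prefer.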