Native CoreAudio renderer with spatial audio
* Inspired by an example app from Apple. [1]
* Needs an M1 or newer Mac. Eventually should work on iOS/tvOS, although you might need iOS 18.
* Operates in a standard passthrough mode for stereo or when you have enough real channels (HDMI).
* When headphones or built-in MacBook speakers are detected, this enables spatial audio via the amazing
  AUSpatialMixer, which is capable of rendering any number of channels up to 7.1.4 Atmos in very high
  quality binaural stereo.
* Supports personalized HRTF if you've scanned your ears with your iPhone.
* Added a new section in the upper-right of the stats overlay with audio stats.

Planned features:
* Head-tracking is possible but disabled until there is a config option. Also, the system sound menu
  doesn't indicate spatial audio is active, giving you no way to change any settings.

[1] https://developer.apple.com/documentation/audiotoolbox/generating_spatial_audio_from_a_multichannel_audio_stream
andygrundman committed Aug 30, 2024
1 parent 302dca6 commit 9c1b8ca
Showing 30 changed files with 2,069 additions and 8 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -2,6 +2,7 @@


**/.vs/
.vscode/
build/
config.tests/*/.qmake.stash
config.tests/*/Makefile
27 changes: 22 additions & 5 deletions app/app.pro
@@ -160,10 +160,18 @@ macx {
CONFIG += discord-rpc
}

LIBS += -lobjc -framework VideoToolbox -framework AVFoundation -framework CoreVideo -framework CoreGraphics -framework CoreMedia -framework AppKit -framework Metal -framework QuartzCore

# For libsoundio
LIBS += -framework CoreAudio -framework AudioUnit
LIBS += -lobjc \
-framework AppKit \
-framework AudioToolbox \
-framework AudioUnit \
-framework AVFoundation \
-framework CoreAudio \
-framework CoreVideo \
-framework CoreGraphics \
-framework CoreMedia \
-framework Metal \
-framework QuartzCore \
-framework VideoToolbox

CONFIG += ffmpeg soundio
}
@@ -391,14 +399,23 @@ win32:!winrt {
streaming/video/ffmpeg-renderers/pacer/dxvsyncsource.h
}
macx {
message(VideoToolbox renderer selected)
message(CoreAudio + VideoToolbox renderers selected)

DEFINES += HAVE_COREAUDIO

SOURCES += \
streaming/audio/renderers/coreaudio/au_spatial_renderer.mm \
streaming/audio/renderers/coreaudio/coreaudio.cpp \
streaming/audio/renderers/coreaudio/TPCircularBuffer.c \
streaming/video/ffmpeg-renderers/vt_base.mm \
streaming/video/ffmpeg-renderers/vt_avsamplelayer.mm \
streaming/video/ffmpeg-renderers/vt_metal.mm

HEADERS += \
streaming/audio/renderers/coreaudio/au_spatial_renderer.h \
streaming/audio/renderers/coreaudio/coreaudio.h \
streaming/audio/renderers/coreaudio/coreaudio_helpers.h \
streaming/audio/renderers/coreaudio/TPCircularBuffer.h \
streaming/video/ffmpeg-renderers/vt.h
}
soundio {
12 changes: 12 additions & 0 deletions app/spatial-audio.entitlements
@@ -0,0 +1,12 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>com.apple.security.app-sandbox</key>
<true/>
<key>com.apple.developer.spatial-audio.profile-access</key>
<true/>
<key>com.apple.developer.coremotion.head-pose</key>
<true/>
</dict>
</plist>
33 changes: 33 additions & 0 deletions app/streaming/audio/audio.cpp
@@ -9,6 +9,10 @@
#include "renderers/slaud.h"
#endif

#ifdef HAVE_COREAUDIO
#include "renderers/coreaudio/coreaudio.h"
#endif

#include "renderers/sdl.h"

#include <Limelight.h>
@@ -29,6 +33,12 @@ IAudioRenderer* Session::createAudioRenderer(const POPUS_MULTISTREAM_CONFIGURATI
TRY_INIT_RENDERER(SdlAudioRenderer, opusConfig)
return nullptr;
}
#ifdef HAVE_COREAUDIO
else if (mlAudio == "coreaudio") {
TRY_INIT_RENDERER(CoreAudioRenderer, opusConfig)
return nullptr;
}
#endif
#ifdef HAVE_SOUNDIO
else if (mlAudio == "libsoundio") {
TRY_INIT_RENDERER(SoundIoAudioRenderer, opusConfig)
@@ -55,6 +65,11 @@ IAudioRenderer* Session::createAudioRenderer(const POPUS_MULTISTREAM_CONFIGURATI
TRY_INIT_RENDERER(SLAudioRenderer, opusConfig)
#endif

#ifdef HAVE_COREAUDIO
// Native renderer for macOS/iOS/tvOS, supports spatial audio
TRY_INIT_RENDERER(CoreAudioRenderer, opusConfig)
#endif

// Default to SDL and use libsoundio as a fallback
TRY_INIT_RENDERER(SdlAudioRenderer, opusConfig)
#ifdef HAVE_SOUNDIO
@@ -157,6 +172,8 @@ int Session::arInit(int /* audioConfiguration */,

void Session::arCleanup()
{
s_ActiveSession->m_AudioRenderer->logGlobalAudioStats();

delete s_ActiveSession->m_AudioRenderer;
s_ActiveSession->m_AudioRenderer = nullptr;

@@ -239,6 +256,22 @@ void Session::arDecodeAndPlaySample(char* sampleData, int sampleLength)
desiredBufferSize = 0;
}

// used to display the raw audio bitrate
s_ActiveSession->m_AudioRenderer->statsAddOpusBytesReceived(sampleLength);

// Flip stats windows roughly every second
if (SDL_TICKS_PASSED(SDL_GetTicks(), s_ActiveSession->m_AudioRenderer->getActiveWndAudioStats().measurementStartTimestamp + 1000)) {
if (s_ActiveSession->getOverlayManager().isOverlayEnabled(Overlay::OverlayDebugAudio)) {
AUDIO_STATS lastTwoWndAudioStats = {};
s_ActiveSession->m_AudioRenderer->snapshotAudioStats(lastTwoWndAudioStats);

s_ActiveSession->m_AudioRenderer->stringifyAudioStats(lastTwoWndAudioStats,
s_ActiveSession->getOverlayManager().getOverlayText(Overlay::OverlayDebugAudio),
s_ActiveSession->getOverlayManager().getOverlayMaxTextLength());
s_ActiveSession->getOverlayManager().setOverlayTextUpdated(Overlay::OverlayDebugAudio);
}
}

if (!s_ActiveSession->m_AudioRenderer->submitAudio(desiredBufferSize)) {
SDL_LogWarn(SDL_LOG_CATEGORY_APPLICATION,
"Reinitializing audio renderer after failure");
57 changes: 57 additions & 0 deletions app/streaming/audio/renderers/coreaudio/AllocatedAudioBufferList.h
@@ -0,0 +1,57 @@
/*
Copyright © 2024 Apple Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
documentation files (the "Software"), to deal in the Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once

#include <AudioToolbox/AudioToolbox.h>

class AllocatedAudioBufferList
{
public:
AllocatedAudioBufferList(UInt32 channelCount, uint16_t bufferSize)
{

mBufferList = static_cast<AudioBufferList *>(malloc(sizeof(AudioBufferList) + (sizeof(AudioBuffer) * channelCount)));
mBufferList->mNumberBuffers = channelCount;
for (UInt32 c = 0; c < channelCount; ++c) {
mBufferList->mBuffers[c].mNumberChannels = 1;
mBufferList->mBuffers[c].mDataByteSize = bufferSize * sizeof(float);
mBufferList->mBuffers[c].mData = malloc(sizeof(float) * bufferSize);
}
}

AllocatedAudioBufferList(const AllocatedAudioBufferList&) = delete;

AllocatedAudioBufferList& operator=(const AllocatedAudioBufferList&) = delete;

~AllocatedAudioBufferList()
{
if (mBufferList == nullptr) { return; }

for (UInt32 i = 0; i < mBufferList->mNumberBuffers; ++i) {
free(mBufferList->mBuffers[i].mData);
}
free(mBufferList);
mBufferList = nullptr;
}

AudioBufferList * _Nonnull get()
{
return mBufferList;
}

private:
AudioBufferList * _Nonnull mBufferList = { nullptr };
};
60 changes: 60 additions & 0 deletions app/streaming/audio/renderers/coreaudio/README.coreaudio
@@ -0,0 +1,60 @@
Moonlight CoreAudio supports 2 modes:

1. A normal passthrough mode where decoded PCM from the Opus stream is passed directly to the output Audio Unit. This mode
is used when the incoming stream is stereo or when the local output device is already multichannel, e.g. when outputting over HDMI.

2. Spatial Mixer mode. This mode is used for 5.1 and 7.1 channel streams, when the output device supports spatial audio. This usually means
the system knows that headphones are in use, or the built-in MacBook speakers are in use. Apple uses a specially tuned profile to enable
a spatial effect from their laptop speakers.
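
The decision between the two comes down to a check along these lines (a rough sketch only; the helper names are hypothetical, not the renderer's actual code):

    // Sketch only -- not the renderer's actual logic; parameter sources are assumptions.
    static bool shouldUseSpatialMixer(int streamChannelCount, int deviceChannelCount, bool deviceSupportsSpatialAudio)
    {
        return streamChannelCount > 2          // 5.1 / 7.1 stream from the host
            && deviceChannelCount <= 2         // but only stereo output locally (headphones, built-in speakers)
            && deviceSupportsSpatialAudio;     // and the OS can spatialize for this output
    }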

There are a lot of knobs available in the mixer to describe how you want the rendering done, but I have hardcoded what seem
to be Apple's recommended defaults. For example, I can find zero documentation about what the different SpatializationAlgorithm
types do, but UseOutputType appears to be the right choice, apparently picking the best algorithm for the target device.

kSpatializationAlgorithm_EqualPowerPanning
kSpatializationAlgorithm_HRTF
kSpatializationAlgorithm_SoundField
kSpatializationAlgorithm_SphericalHead
kSpatializationAlgorithm_StereoPassThrough
kSpatializationAlgorithm_VectorBasedPanning
kSpatializationAlgorithm_HRTFHQ
kSpatializationAlgorithm_UseOutputType
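
Setting the algorithm looks roughly like this (a sketch; spatialMixerAU is just a placeholder for the mixer AudioUnit instance,
and the property is applied on the mixer's input scope):

    UInt32 algorithm = kSpatializationAlgorithm_UseOutputType;
    OSStatus status = AudioUnitSetProperty(spatialMixerAU,
                                           kAudioUnitProperty_SpatializationAlgorithm,
                                           kAudioUnitScope_Input,
                                           0,                   // input bus
                                           &algorithm,
                                           sizeof(algorithm));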

The CoreAudio renderer was inspired by an example app in Apple's Audio Toolbox documentation:

https://developer.apple.com/documentation/audiotoolbox/generating_spatial_audio_from_a_multichannel_audio_stream

In theory, any number of channels with any layout can be processed by the SpatialMixer; Apple's example uses 7.1.4 Atmos
in the form of a 12-channel WAV file. Interestingly, raw multichannel WAV files get automatically spatialized when played
with QuickTime on macOS.

The design and program flow of the example app are overly complex, even though it only uses 2 AudioUnits: one in stereo for final output
and one that is a SpatialMixer. Perhaps they really wanted to show off mixing SwiftUI with advanced Obj-C++ using closures/lambdas.

I've left in some sections of the code that are platform-specific (iOS needs to use different audio APIs). This will
hopefully make it easier to port this to moonlight-ios.

Apple example:

AudioFileReader->pullAudioBlock() <- N channel local WAV file (the example has a few 7.1.4 samples)
rendering->mInputBlock()
AudioUnitRender(mAUSM)
mAUSM->process()
Kernel->process()
2-channel binaural out <- OutputAU

CoreAudioRenderer:

A thread-safe ring buffer is used: on one end is the Opus decoder, which decodes 5ms Opus packets into PCM at 32 bits per channel.
The reader is one of two AURenderCallback functions that are called by CoreAudio in a pull model.

renderCallbackDirect is the simple case: it just copies the PCM into the buffers CoreAudio hands us.
This mode passes the interleaved PCM unchanged to the OS.
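
The shape of that callback is roughly the following (a sketch based on the description above, not the actual source;
it assumes the TPCircularBuffer is passed as the callback's refCon):

    #include <AudioToolbox/AudioToolbox.h>
    #include <string.h>
    #include "TPCircularBuffer.h"

    static OSStatus renderCallbackDirect(void *inRefCon, AudioUnitRenderActionFlags *ioActionFlags,
                                         const AudioTimeStamp *inTimeStamp, UInt32 inBusNumber,
                                         UInt32 inNumberFrames, AudioBufferList *ioData)
    {
        TPCircularBuffer *ring = (TPCircularBuffer *)inRefCon;
        UInt32 bytesNeeded = ioData->mBuffers[0].mDataByteSize;

        uint32_t bytesAvailable = 0;
        void *src = TPCircularBufferTail(ring, &bytesAvailable);
        if (src == NULL || bytesAvailable < bytesNeeded) {
            memset(ioData->mBuffers[0].mData, 0, bytesNeeded);   // underrun: output silence
            return noErr;
        }

        memcpy(ioData->mBuffers[0].mData, src, bytesNeeded);     // interleaved PCM passes straight through
        TPCircularBufferConsume(ring, bytesNeeded);
        return noErr;
    }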

In Spatial mode, renderCallbackSpatial uses an intermediate SpatialMixer, which it asks for 2-channel binaural PCM
using m_SpatialAU.process(). m_SpatialAU is our AUSpatialRenderer class, which contains a lot of setup and one callback.
The process() method calls AudioUnitRender(), which has CoreAudio call inputCallback asking for, say, 8 channels of PCM
data. This is copied out of the ring buffer, where it is stored interleaved (one sample per channel packed together into
each frame), and must be transformed into non-interleaved format: 8 separate mono buffers. After that, the mixer does its
processing and process() returns. We're still in renderCallbackSpatial, which can now deliver the final 2-channel version
to the output.
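
The interleaved-to-planar step inside inputCallback is just a strided copy into the mixer's per-channel buffers;
roughly (again a sketch rather than the actual code):

    #include <AudioToolbox/AudioToolbox.h>

    // Split interleaved float PCM (one sample per channel per frame) into the
    // one-mono-buffer-per-channel layout that AUSpatialMixer expects.
    static void deinterleave(const float *interleaved, AudioBufferList *bufferList,
                             UInt32 channelCount, UInt32 frameCount)
    {
        for (UInt32 ch = 0; ch < channelCount; ++ch) {
            float *dst = (float *)bufferList->mBuffers[ch].mData;
            for (UInt32 frame = 0; frame < frameCount; ++frame) {
                dst[frame] = interleaved[frame * channelCount + ch];
            }
        }
    }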
9 changes: 9 additions & 0 deletions app/streaming/audio/renderers/coreaudio/TODO
@@ -0,0 +1,9 @@
CoreAudioRenderer TODO
----------------------
Test the lower-quality surround modes that have coupled streams; is that broken on headphones or when spatialized?
From https://people.xiph.org/~xiphmont/demo/opus/demo3.shtml

Surround masking takes advantage of cross-channel masking between free-field loudspeakers. Obviously, we can't do that for stereo, as stereo is often listened to on headphones or nearfield monitors, but for surround encodings played on typical surround placements with listeners placed well within the soundfield, there's considerable savings to be had by assuming freefield masking. We only need to make slight modifications to ensure that the encode still sounds good when downmixed to stereo for playback on non-surround systems.

Refactor into more logical/cleaner C++ classes.
Refactor audio stats code and implement for other backends.
