XAudio2 Tutorial 1

Author: Jay Tennant

A Brief Look at XAudio2: Playing a Sound

XAudio2 is a sound API available on the Windows Vista/7+ and Xbox 360 platforms. This tutorial aims to demonstrate, briefly, how to play a sound using XAudio2. Later tutorials will focus on more interesting uses of the API.

The target audience should be at least at an intermediate level of C++ programming, including the use of pointers. Only minimal Win32 programming is required. Understanding COM and DirectShow is beneficial, but not required. Previous experience with DirectSound will not help as much as you may think. Sorry. :(

In this series, we use the rule: code first, ask questions later. The sound clip is available here and in the Link section at the bottom. So here is the code:
//by Jay Tennant 3/4/12
//A Brief Look at XAudio2: Playing a Sound
//win32developer.com
//this code provided free, as in public domain; score!

#include <windows.h>
#include <xaudio2.h>
#include "wave.h"

//the XAudio2 engine; the only one of the three that is a real COM object
IXAudio2* g_engine = NULL;
//the voice that sound buffers are submitted to
IXAudio2SourceVoice* g_source = NULL;
//the final voice of the graph, which outputs to the speakers
IXAudio2MasteringVoice* g_master = NULL;

//Entry point: starts COM and XAudio2, loads "sfx.wav", and plays it once for
//every time the user answers Yes to the prompt.
//Returns 0 on a clean exit, otherwise a negative code naming the failed step:
//	-1 engine creation      -2 mastering voice creation
//	-3 wave file load       -4 source voice creation
//	-5 COM initialization   -6 starting the source voice
//	-7 submitting the sound buffer
int WINAPI WinMain( HINSTANCE hInstance, HINSTANCE hPrevInstance, LPSTR lpCmdLine, int nShowCmd )
{
	//must call this for COM; CoUninitialize() may only be paired with a call
	//that succeeded, so bail out before any cleanup if it did not
	if( FAILED( CoInitializeEx( NULL, COINIT_MULTITHREADED ) ) )
		return -5;

	//create the engine
	if( FAILED( XAudio2Create( &g_engine ) ) )
	{
		CoUninitialize();
		return -1;
	}

	//create the mastering voice
	if( FAILED( g_engine->CreateMasteringVoice( &g_master ) ) )
	{
		g_engine->Release();
		CoUninitialize();
		return -2;
	}

	//helper class to load wave files; trust me, this makes it MUCH easier
	//NOTE: it owns the sample memory the source voice plays from; being a local,
	//it is destroyed on return, which is always after the engine was released
	Wave buffer;

	//load a wave file
	if( !buffer.load( "sfx.wav" ) )
	{
		g_engine->Release();
		CoUninitialize();
		return -3;
	}

	//create the source voice, based on loaded wave format
	if( FAILED( g_engine->CreateSourceVoice( &g_source, buffer.wf() ) ) )
	{
		g_engine->Release();
		CoUninitialize();
		return -4;
	}

	//start consuming audio in the source voice
	if( FAILED( g_source->Start() ) )
	{
		g_engine->Release();
		CoUninitialize();
		return -6;
	}

	int exitCode = 0;

	//simple message loop
	while( MessageBox( 0, TEXT("Do you want to play the sound?"), TEXT("ABLAX: PAS"), MB_YESNO ) == IDYES )
	{
		//play the sound; stop asking if the voice refuses the buffer
		if( FAILED( g_source->SubmitSourceBuffer( buffer.xaBuffer() ) ) )
		{
			exitCode = -7;
			break;
		}
	}

	//release the engine, NOT the voices!
	g_engine->Release();

	//again, for COM
	CoUninitialize();

	return exitCode;
}
And the helper class "Wave" in the wave.h header file:
//wave.h
//by Jay Tennant 3/4/12
//Wave helper class, to load simple wave files
//win32developer.com
//this code provided free, as in public domain; score!

#ifndef WAVE_H
#define WAVE_H

#include <windows.h>
#include <xaudio2.h>
#include <fstream>

class Wave
{
private:
	WAVEFORMATEX m_wf;
	XAUDIO2_BUFFER m_xa;
	BYTE* m_waveData;
public:
	Wave(const char* szFile = NULL) : m_waveData(NULL) {
		ZeroMemory(&m_wf, sizeof(m_wf));
		ZeroMemory(&m_xa, sizeof(m_xa));

		load(szFile);
	}
	Wave(const Wave& c) : m_waveData(NULL) {
		m_wf = c.m_wf;
		m_xa = c.m_xa;
		if(c.m_waveData)
		{
			m_waveData = new BYTE[m_xa.AudioBytes];
			memcpy( m_waveData, c.m_waveData, m_xa.AudioBytes );
			m_xa.pAudioData = m_waveData;
		}
	}
	~Wave() {
		if(m_waveData)
			delete [] m_waveData;
		m_waveData = NULL;
	}

	const XAUDIO2_BUFFER* xaBuffer() const {return &m_xa;}
	const WAVEFORMATEX* wf() const {return &m_wf;}
	bool load(const char* szFile) {
		if(szFile == NULL)
			return false;

		std::ifstream inFile(szFile, std::ios::binary | std::ios::in);
		if(inFile.bad())
			return false;
		
		DWORD dwChunkId = 0, dwFileSize = 0, dwChunkSize = 0, dwExtra = 0;

		//look for 'RIFF' chunk identifier
		inFile.seekg(0, std::ios::beg);
		inFile.read(reinterpret_cast<char*>(&dwChunkId), sizeof(dwChunkId));
		if(dwChunkId != 'FFIR')
		{
			inFile.close();
			return false;
		}
		inFile.seekg(4, std::ios::beg); //get file size
		inFile.read(reinterpret_cast<char*>(&dwFileSize), sizeof(dwFileSize));
		if(dwFileSize <= 16)
		{
			inFile.close();
			return false;
		}
		inFile.seekg(8, std::ios::beg); //get file format
		inFile.read(reinterpret_cast<char*>(&dwExtra), sizeof(dwExtra));
		if(dwExtra != 'EVAW')
		{
			inFile.close();
			return false;
		}

		//look for 'fmt ' chunk id
		bool bFilledFormat = false;
		for(unsigned int i = 12; i < dwFileSize; )
		{
			inFile.seekg(i, std::ios::beg);
			inFile.read(reinterpret_cast<char*>(&dwChunkId), sizeof(dwChunkId));
			inFile.seekg(i + 4, std::ios::beg);
			inFile.read(reinterpret_cast<char*>(&dwChunkSize), sizeof(dwChunkSize));
			if(dwChunkId == ' tmf')
			{
				//I don't know what I was thinking with the following code, but I
				//never did solve it back 6 months, and didn't touch it since; oh well... :S

				//switch(dwChunkSize)
				//{
				//case sizeof(WAVEFORMATEX):
				//	{
				//		inFile.seekg(i + 8, std::ios::beg);
				//		inFile.read(reinterpret_cast<char*>(&m_wf), sizeof(m_wf));
				//	}
				//	break;
				//case sizeof(WAVEFORMATEXTENSIBLE):
				//	{
				//		WAVEFORMATEXTENSIBLE wfe;
				//		inFile.seekg(i + 8, std::ios::beg);
				//		inFile.read(reinterpret_cast<char*>(&wfe), sizeof(wfe));
				//		m_wf = wfe.Format;
				//	}
				//	break;
				//default:
				//	inFile.close();
				//	return;
				//}
				inFile.seekg(i + 8, std::ios::beg);
				inFile.read(reinterpret_cast<char*>(&m_wf), sizeof(m_wf));
				bFilledFormat = true;
				break;
			}
			dwChunkSize += 8; //add offsets of the chunk id, and chunk size data entries
			dwChunkSize += 1;
			dwChunkSize &= 0xfffffffe; //guarantees WORD padding alignment
			i += dwChunkSize;
		}
		if(!bFilledFormat)
		{
			inFile.close();
			return false;
		}

		//look for 'data' chunk id
		bool bFilledData = false;
		for(unsigned int i = 12; i < dwFileSize; )
		{
			inFile.seekg(i, std::ios::beg);
			inFile.read(reinterpret_cast<char*>(&dwChunkId), sizeof(dwChunkId));
			inFile.seekg(i + 4, std::ios::beg);
			inFile.read(reinterpret_cast<char*>(&dwChunkSize), sizeof(dwChunkSize));
			if(dwChunkId == 'atad')
			{
				m_waveData = new BYTE[dwChunkSize];
				inFile.seekg(i + 8, std::ios::beg);
				inFile.read(reinterpret_cast<char*>(m_waveData), dwChunkSize);
				m_xa.AudioBytes = dwChunkSize;
				m_xa.pAudioData = m_waveData;
				m_xa.PlayBegin = 0;
				m_xa.PlayLength = 0;
				bFilledData = true;
				break;
			}
			dwChunkSize += 8; //add offsets of the chunk id, and chunk size data entries
			dwChunkSize += 1;
			dwChunkSize &= 0xfffffffe; //guarantees WORD padding alignment
			i += dwChunkSize;
		}
		if(!bFilledData)
		{
			inFile.close();
			return false;
		}

		inFile.close();
		return true;
	}
};

#endif

The Analysis, Mr. Spock!

IXAudio2* g_engine;
IXAudio2SourceVoice* g_source;
IXAudio2MasteringVoice* g_master;

These three object types represent almost the entire XAudio2 framework. Details on these can be found on MSDN. In short, the engine is simply a filter graph manager. That is, the engine allows the attachment of any number of audio source voices, and pushes the buffers through the graph until it reaches the end: the mastering voice.

The Source voice is a pin into which sound buffers (either static or streaming) are fed. Some processing can be done on the source voice, and then its output is passed on either to a submix voice (discussed in a later tutorial), or to the mastering voice.

The Mastering voice is the conclusion of all audio processing streams, outputting to a speaker configuration. Minimal processing is done at this stage.

IMPORTANT! Though IXAudio2, IXAudio2SourceVoice, and IXAudio2MasteringVoice all have the familiar COM interface nomenclature, only IXAudio2 is actually a COM object. Don't try to Release(), AddRef(), or QueryInterface() on an IXAudio2SourceVoice or IXAudio2MasteringVoice!

CoInitializeEx( NULL, COINIT_MULTITHREADED );

This simply starts up the COM library on this thread. It's required for the XAudio2 engine.

XAudio2Create( &g_engine )

There are many more parameters to XAudio2Create, but in short, this creates an XAudio2 engine instance. You can technically have multiple instances running in the same application, though why would you want to?

g_engine->CreateMasteringVoice( &g_master )

This function creates the Mastering voice. Again, there are many more parameters to this, but we care not for its flexibility... yet...

g_engine->CreateSourceVoice( &g_source, buffer.wf() )

Creates a source voice based on the wave format of a sound buffer that was loaded into memory. The WAVEFORMATEX structure is defined as:

//describes the format of a block of audio data
typedef struct tWAVEFORMATEX {
	WORD wFormatTag;       //the audio format; PCM for files loaded with the helper
	WORD nChannels;        //number of channels: mono 1, stereo 2, etc.
	DWORD nSamplesPerSec;  //samples per second, e.g. 22050, 44100, 48000 for PCM
	DWORD nAvgBytesPerSec; //format dependent; nSamplesPerSec * nBlockAlign for PCM
	WORD nBlockAlign;      //format dependent block size
	WORD wBitsPerSample;   //format dependent; 8 or 16 for PCM
	WORD cbSize;           //number of extra bytes appended after this structure
} WAVEFORMATEX, *PWAVEFORMATEX, *LPWAVEFORMATEX;

The wFormatTag specifies the format (which should be PCM if loaded using the helper). nChannels is the number of channels: i.e. mono 1, stereo 2, etc. nSamplesPerSec is the number of samples per second, for PCM usually 22050, 44100, 48000, etc. nAvgBytesPerSec is dependent on the format, but for PCM is the product of nSamplesPerSec and nBlockAlign. nBlockAlign is also dependent on format (sheesh, so many format dependencies!), but for PCM it is the product of nChannels and wBitsPerSample divided by 8, i.e. the size in bytes of one sample across all channels (16-bit stereo gives 2 * 16 / 8 = 4). wBitsPerSample is, yes, dependent on format, but for PCM it is either 8 or 16. And finally, cbSize describes the number of extra bytes that are appended to the WAVEFORMATEX structure, useful for referencing WAVEFORMATEXTENSIBLE structure members.

g_source->Start();

Starts consuming audio on the audio processing graph. Failing to call this is akin to putting a bucket under a spigot, but not turning the water on. Nice if you want a dry bucket.

g_source->SubmitSourceBuffer( buffer.xaBuffer() );

Submits a sound buffer to the source voice. The parameter is a XAUDIO2_BUFFER pointer, which is defined in MSDN as:

//describes one sound buffer handed to a source voice
typedef struct XAUDIO2_BUFFER {
	UINT32 Flags;      //XAUDIO2_END_OF_STREAM when no audio is planned after this buffer
	UINT32 AudioBytes; //number of bytes in the sound buffer
	BYTE *pAudioData;  //pointer to the actual data
	UINT32 PlayBegin;  //offset to the beginning of the audio to play; 0 in this tutorial
	UINT32 PlayLength; //0 indicates the entire buffer should play
	UINT32 LoopBegin;  //looping; not used in this tutorial, set to 0
	UINT32 LoopLength; //looping; not used in this tutorial, set to 0
	UINT32 LoopCount;  //looping; not used in this tutorial, set to 0
	void *pContext;    //more advanced; not used in this tutorial, set to 0
} XAUDIO2_BUFFER;

The Flags member has been set to XAUDIO2_END_OF_STREAM because there isn't any planned audio after the end of this stream. AudioBytes contains the number of bytes of the sound buffer. pAudioData is a pointer to the actual data. PlayBegin is an offset to the beginning of the audio stream, which we set to 0. We also set PlayLength to 0 to indicate the entire buffer should play. We're not interested in looping, or the pContext (which is much more advanced), but there's information on that at MSDN. So we set it all to 0.

g_engine->Release();

When it's time to close, the engine should be released. It will automatically destroy all voices.

CoUninitialize();

To unload the COM library from the thread.

If you've tried to load one yourself, you've probably found there is an incredible and irritating amount of work that goes into loading simple .wav files! The "Wave" class helper library simplifies the task of loading simple PCM waves. Feel free to use and modify it. I know I will. :)

You may have noticed that if you press the "Yes" button fast enough, the playing sound will build a queue of submissions. Try modifying the while loop to:

//simple message loop
while( MessageBox( 0, TEXT("Do you want to play the sound?"), TEXT("ABLAX: PAS"), MB_YESNO ) == IDYES )
{
	//stop the voice, discard whatever it still has queued, and start it again,
	//so that pressing "Yes" quickly no longer builds up a queue of submissions
	g_source->Stop();
	g_source->FlushSourceBuffers();
	g_source->Start();

	//play the sound
	g_source->SubmitSourceBuffer( buffer.xaBuffer() );
}

Additional information

For additional information we have provided the following links.

Sound effect used for program: sfx.zip
Cool 8bit sound generator: http://www.superflashbros.net/as3sfxr/
MSDN entry on XAudio2: http://msdn.microsoft.com/en-us/library/hh405049(VS.85).aspx


Next tutorial

Tutorial 2 - XAudio2: One buffer, multiple voices