/******************************************************************************** Copyright (C) 2012 Hugh Bailey This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. ********************************************************************************/ #include "Main.h" #include /* NOTE(review): the header name of this second include is absent from the text as received, and template argument lists are absent throughout the file (every List declaration, the template keyword before GetChunkInfo). Several loop bodies are cut off as well. Restore them from the original source before building. */ /* GetMacTime: returns time(0) plus 2082844800 seconds, i.e. 24107 days of 86400 s, the distance from 1904-01-01 to 1970-01-01. */ time_t GetMacTime() { return time(0)+2082844800; } /* One sample-to-chunk table entry: a chunk id and a samples-per-chunk count. Presumably produced by GetChunkInfo - its body is not visible here. */ struct SampleToChunk { UINT firstChunkID; UINT samplesPerChunk; }; /* A (count, value) pair. NOTE(review): presumably the element type of the decode-time and composition-offset lists; the declarations that would confirm this have lost their template arguments. */ struct OffsetVal { UINT count; UINT val; }; /* Per-video-frame record: byte offset in the file, size in bytes, timestamp, and composition offset (stored already byte-swapped by the packet-adding code at the end of the class). */ struct MP4VideoFrameInfo { UINT64 fileOffset; UINT size; UINT timestamp; INT compositionOffset; }; /* Per-audio-frame record: byte offset in the file and size in bytes. */ struct MP4AudioFrameInfo { UINT64 fileOffset; UINT size; }; /* When defined, Init reserves an extra 8-byte field right after the mdat box header. */ #define USE_64BIT_MP4 1 //code annoyance rating: fairly nightmarish /* MP4FileStream: VideoFileStream implementation that writes an MP4 file. Init writes the ftyp and free boxes and opens the mdat box; the moov box tree is built with PushBox and PopBox, which track box start offsets inside endBuffer. */ class MP4FileStream : public VideoFileStream { XFileOutputSerializer fileOut; String strFile; /* frame bookkeeping collected while packets are added */ List videoFrames; List audioFrames; List IFrameIDs; DWORD lastVideoTimestamp; bool bStreamOpened; bool bMP3; /* endBuffer holds the box data being assembled; boxOffsets is used as a stack (front = innermost open box) of box start positions within endBuffer */ List endBuffer; List boxOffsets; UINT64 mdatStart, mdatStop; /* PushBox: opens a box. Records the current length of endBuffer as the box start, then writes a 4-byte zero size placeholder and the box name through output. boxName is written unchanged, so callers pass it already wrapped in DWORD_BE. NOTE(review): this only works if output appends to endBuffer - confirm at the construction site of output, which is not visible here. */ void PushBox(BufferOutputSerializer &output, DWORD boxName) { boxOffsets.Insert(0, endBuffer.Num()); output.OutputDword(0); output.OutputDword(boxName); } /* PopBox: closes the innermost open box. The box size is the current endBuffer length minus the recorded start; it is passed through fastHtonl and stored over the placeholder written by PushBox, then the offset is popped. */ void PopBox() { DWORD boxSize = endBuffer.Num()-boxOffsets[0]; *(DWORD*)(endBuffer.Array()+boxOffsets[0]) = fastHtonl(boxSize); boxOffsets.Remove(0); } public: /* Init: creates (or overwrites) the output file and writes the fixed file prologue. Returns false if the file cannot be opened, true otherwise. */ bool Init(CTSTR lpFile) { strFile = lpFile; if(!fileOut.Open(lpFile, XFILE_CREATEALWAYS, 1024*1024)) return false; 
/* ftyp box, 0x20 bytes in total (eight dwords): major brand isom, minor version 0x200, compatible brands isom, iso2, avc1, mp41 */ fileOut.OutputDword(DWORD_BE(0x20)); fileOut.OutputDword(DWORD_BE('ftyp')); fileOut.OutputDword(DWORD_BE('isom')); fileOut.OutputDword(DWORD_BE(0x200)); fileOut.OutputDword(DWORD_BE('isom')); fileOut.OutputDword(DWORD_BE('iso2')); fileOut.OutputDword(DWORD_BE('avc1')); fileOut.OutputDword(DWORD_BE('mp41')); /* empty 8-byte free box */ fileOut.OutputDword(DWORD_BE(0x8)); fileOut.OutputDword(DWORD_BE('free')); /* remember where the mdat box header begins */ mdatStart = fileOut.GetPos(); /* mdat header: size field 1, the box name, then (only when USE_64BIT_MP4 is defined) an 8-byte zero placeholder. NOTE(review): the size field of 1 is written even when the macro is undefined - confirm that is intended. */ fileOut.OutputDword(DWORD_BE(0x1)); fileOut.OutputDword(DWORD_BE('mdat')); #ifdef USE_64BIT_MP4 fileOut.OutputQword(0); #endif /* audio codec is classified once, by comparing the encoder codec name with MP3 */ bMP3 = scmp(App->GetAudioEncoder()->GetCodec(), TEXT("MP3")) == 0; bStreamOpened = true; return true; } /* GetChunkInfo: takes a frame list (data) and two output lists (chunks, sampleToChunks), all by reference. NOTE(review): the template parameter list, the List element types and nearly all of the body are missing from the text as received. The text resumes inside what appears to be a later member that builds the moov box (it uses lastVideoTimestamp, the frame lists and PushBox). Do not infer the missing logic from this copy. */ template inline void GetChunkInfo(List &data, List &chunks, List &sampleToChunks) { UINT64 curChunkOffset; UINT64 connectedSampleOffset; UINT numSamples = 0; for(UINT i=0; iGetFrameTime()); /* audio duration: last video timestamp plus GetFrameSize() divided by 44.1, stored byte-swapped. NOTE(review): 44.1 is presumably samples per millisecond at 44100 Hz - confirm. */ UINT audioDuration = fastHtonl(lastVideoTimestamp + DWORD(double(App->GetAudioEncoder()->GetFrameSize())/44.1)); UINT width, height; App->GetOutputSize(width, height); /* handler names written into the hdlr boxes of the two tracks */ LPCSTR lpVideoTrack = "videoTrack"; LPCSTR lpAudioTrack = "audioTrack"; //------------------------------------------- // get video headers /* Skips the first 11 bytes of the header packet, reads a 16-bit length (byte-swapped with fastHtons) and copies that many SPS bytes. Then advances by the SPS size plus 3 bytes (the 2 length bytes plus 1 more byte, presumably a PPS count - confirm) and reads the PPS the same way. No bounds checks are made against the packet size. */ DataPacket videoHeaders; App->GetVideoHeaders(videoHeaders); List SPS, PPS; LPBYTE lpHeaderData = videoHeaders.lpPacket+11; SPS.CopyArray(lpHeaderData+2, fastHtons(*(WORD*)lpHeaderData)); lpHeaderData += SPS.Num()+3; PPS.CopyArray(lpHeaderData+2, fastHtons(*(WORD*)lpHeaderData)); //------------------------------------------- // get AAC headers if using AAC /* the first 2 bytes of the audio header packet are skipped; the remainder is kept as the AAC header */ List AACHeader; if(!bMP3) { DataPacket data; App->GetAudioHeaders(data); AACHeader.CopyArray(data.lpPacket+2, data.size-2); } //------------------------------------------- // get chunk info List videoChunks, audioChunks; List videoSampleToChunk, audioSampleToChunk; GetChunkInfo(videoFrames, videoChunks, videoSampleToChunk); GetChunkInfo(audioFrames, audioChunks, audioSampleToChunk); //------------------------------------------- // build decode time list and composition offset list List 
decodeTimes; List compositionOffsets; /* NOTE(review): text is missing after the loop header below; it resumes in the middle of an expression involving the audio encoder bit rate. The definition of maxBitRate used further down is in the missing part. */ for(UINT i=0; iGetAudioEncoder()->GetBitRate()*1024); /* esDecoderDescriptor collects the raw bytes of the decoder configuration descriptor; it is nested inside esDescriptor further down */ List esDecoderDescriptor; BufferOutputSerializer esDecoderOut(esDecoderDescriptor); /* first byte: 107 when the audio is MP3, 64 otherwise */ esDecoderOut.OutputByte(bMP3 ? 107 : 64); esDecoderOut.OutputByte(0x15); //stream/type flags. always 0x15 for my purposes. esDecoderOut.OutputWord(0); //buffer size (seems ignorable from my testing, so 0) esDecoderOut.OutputByte(0); esDecoderOut.OutputDword(maxBitRate); //max bit rate (cue bill 'o reily meme for these two) esDecoderOut.OutputDword(maxBitRate); //avg bit rate if(!bMP3) //if AAC, put in headers { esDecoderOut.OutputByte(0x5); //decoder specific descriptor type esDecoderOut.OutputByte(0x80); //some stuff that no one should probably care about esDecoderOut.OutputByte(0x80); esDecoderOut.OutputByte(0x80); /* NOTE(review): the three 0x80 bytes plus the byte below are presumably a padded variable-length size prefix - confirm against the MPEG-4 descriptor syntax. The length goes out as a single byte, so a large AAC header would not fit. */ esDecoderOut.OutputByte(AACHeader.Num()); esDecoderOut.Serialize((LPVOID)AACHeader.Array(), AACHeader.Num()); } esDecoderOut.OutputByte(0x6); //config descriptor type esDecoderOut.OutputByte(0x80); //some stuff that no one should probably care about esDecoderOut.OutputByte(0x80); esDecoderOut.OutputByte(0x80); esDecoderOut.OutputByte(1); //len esDecoderOut.OutputByte(2); //SL value(? 
always 2) /* esDescriptor: ES id, priority byte, tag 4, three 0x80 bytes, a one-byte length, then the bytes of esDecoderDescriptor built above. It is written into the esds box of the audio track. */ List esDescriptor; BufferOutputSerializer esOut(esDescriptor); esOut.OutputWord(0); //es id esOut.OutputByte(0); //stream priority esOut.OutputByte(4); //descriptor type esOut.OutputByte(0x80); //some stuff that no one should probably care about esOut.OutputByte(0x80); esOut.OutputByte(0x80); esOut.OutputByte(esDecoderDescriptor.Num()); esOut.Serialize((LPVOID)esDecoderDescriptor.Array(), esDecoderDescriptor.Num()); //------------------------------------------- /* moov: outermost box; holds the movie header (mvhd) and the video and audio trak boxes written below. Every PushBox is matched by a PopBox that patches the box size. */ PushBox(output, DWORD_BE('moov')); //------------------------------------------------------ // header PushBox(output, DWORD_BE('mvhd')); output.OutputDword(0); //version and flags (none) output.OutputDword(macTime); //creation time output.OutputDword(macTime); //modified time output.OutputDword(DWORD_BE(1000)); //time base (milliseconds, so 1000) output.OutputDword(videoDuration); //duration (in time base units) output.OutputDword(DWORD_BE(0x00010000)); //fixed point playback speed 1.0 output.OutputWord(WORD_BE(0x0100)); //fixed point vol 1.0 output.OutputQword(0); //reserved (10 bytes) output.OutputWord(0); output.OutputDword(DWORD_BE(0x00010000)); output.OutputDword(DWORD_BE(0x00000000)); output.OutputDword(DWORD_BE(0x00000000)); //window matrix row 1 (1.0, 0.0, 0.0) output.OutputDword(DWORD_BE(0x00000000)); output.OutputDword(DWORD_BE(0x00010000)); output.OutputDword(DWORD_BE(0x00000000)); //window matrix row 2 (0.0, 1.0, 0.0) output.OutputDword(DWORD_BE(0x00000000)); output.OutputDword(DWORD_BE(0x00000000)); output.OutputDword(DWORD_BE(0x40000000)); //window matrix row 3 (0.0, 0.0, 16384.0) output.OutputDword(0); //preview start time (time base units) output.OutputDword(0); //preview duration (time base units) output.OutputDword(0); //still poster frame (timestamp of frame) output.OutputDword(0); //selection(?) start time (time base units) output.OutputDword(0); //selection(?) 
duration (time base units) output.OutputDword(0); //current time (0, time base units) output.OutputDword(DWORD_BE(3)); //next free track id (1-based rather than 0-based) PopBox(); //mvhd //------------------------------------------------------ // video track /* trak for the video stream; this track uses track ID 1 */ PushBox(output, DWORD_BE('trak')); PushBox(output, DWORD_BE('tkhd')); //track header output.OutputDword(DWORD_BE(0x0000000F)); //version (0) and flags (0xF) output.OutputDword(macTime); //creation time output.OutputDword(macTime); //modified time output.OutputDword(DWORD_BE(1)); //track ID output.OutputDword(0); //reserved output.OutputDword(videoDuration); //duration (in time base units) output.OutputQword(0); //reserved output.OutputWord(0); //video layer (0) output.OutputWord(0); //quicktime alternate track id (0) output.OutputWord(0); //track audio volume (this is video, so 0) output.OutputWord(0); //reserved output.OutputDword(DWORD_BE(0x00010000)); output.OutputDword(DWORD_BE(0x00000000)); output.OutputDword(DWORD_BE(0x00000000)); //window matrix row 1 (1.0, 0.0, 0.0) output.OutputDword(DWORD_BE(0x00000000)); output.OutputDword(DWORD_BE(0x00010000)); output.OutputDword(DWORD_BE(0x00000000)); //window matrix row 2 (0.0, 1.0, 0.0) output.OutputDword(DWORD_BE(0x00000000)); output.OutputDword(DWORD_BE(0x00000000)); output.OutputDword(DWORD_BE(0x40000000)); //window matrix row 3 (0.0, 0.0, 16384.0) output.OutputDword(fastHtonl(width<<16)); //width (fixed point) output.OutputDword(fastHtonl(height<<16)); //height (fixed point) PopBox(); //tkhd /* edit list with a single entry covering the whole video duration */ PushBox(output, DWORD_BE('edts')); PushBox(output, DWORD_BE('elst')); output.OutputDword(0); //version and flags (none) output.OutputDword(DWORD_BE(1)); //count output.OutputDword(videoDuration); //duration output.OutputDword(0); //start time output.OutputDword(DWORD_BE(0x00010000)); //playback speed (1.0) PopBox(); //elst PopBox(); //edts PushBox(output, DWORD_BE('mdia')); PushBox(output, DWORD_BE('mdhd')); output.OutputDword(0); //version and flags (none) 
output.OutputDword(macTime); //creation time output.OutputDword(macTime); //modified time output.OutputDword(DWORD_BE(1000)); //time scale output.OutputDword(videoDuration); /* NOTE(review): 0x55c4 is presumably the packed three-letter language code und (21, 14, 4 in 5-bit fields), followed by a zero low word - confirm against the mdhd definition */ output.OutputDword(DWORD_BE(0x55c40000)); PopBox(); //mdhd PushBox(output, DWORD_BE('hdlr')); output.OutputDword(0); //version and flags (none) output.OutputDword(0); //quicktime type (none) output.OutputDword(DWORD_BE('vide')); //media type output.OutputDword(0); //manufacturer reserved output.OutputDword(0); //quicktime component reserved flags output.OutputDword(0); //quicktime component reserved mask /* the name is written including its terminating zero byte */ output.Serialize((LPVOID)lpVideoTrack, (DWORD)strlen(lpVideoTrack)+1); //track name PopBox(); //hdlr PushBox(output, DWORD_BE('minf')); PushBox(output, DWORD_BE('vmhd')); output.OutputDword(DWORD_BE(0x00000001)); //version (0) and flags (1) output.OutputWord(0); //quickdraw graphic mode (copy = 0) output.OutputWord(0); //quickdraw red value output.OutputWord(0); //quickdraw green value output.OutputWord(0); //quickdraw blue value PopBox(); //vmhd PushBox(output, DWORD_BE('dinf')); PushBox(output, DWORD_BE('dref')); output.OutputDword(0); //version and flags (none) output.OutputDword(DWORD_BE(1)); //count PushBox(output, DWORD_BE('url ')); output.OutputDword(DWORD_BE(0x00000001)); //version (0) and flags (1) PopBox(); //url PopBox(); //dref PopBox(); //dinf /* stbl: sample tables for the video track, starting with the sample description (stsd) that carries the avc1 entry */ PushBox(output, DWORD_BE('stbl')); PushBox(output, DWORD_BE('stsd')); output.OutputDword(0); //version and flags (none) output.OutputDword(DWORD_BE(1)); //count PushBox(output, DWORD_BE('avc1')); output.OutputDword(0); //reserved 6 bytes output.OutputWord(0); output.OutputWord(WORD_BE(1)); //index output.OutputWord(0); //encoding version output.OutputWord(0); //encoding revision level output.OutputDword(0); //encoding vendor output.OutputDword(0); //temporal quality output.OutputDword(0); //spatial quality output.OutputWord(fastHtons(width)); //width output.OutputWord(fastHtons(height)); //height output.OutputDword(DWORD_BE(0x00480000)); 
//fixed point width pixel resolution (72.0) output.OutputDword(DWORD_BE(0x00480000)); //fixed point height pixel resolution (72.0) output.OutputDword(0); //quicktime video data size output.OutputWord(WORD_BE(1)); //frame count(?) for(UINT i=0; i<4; i++) //encoding name (byte 1 = string length, 31 bytes = string (whats the point of having a size here?) output.OutputQword(0); output.OutputWord(WORD_BE(24)); //bit depth output.OutputWord(0xFFFF); //quicktime video color table id (none = -1) /* avcC: decoder configuration carrying the SPS and PPS extracted earlier. NOTE(review): the profile (100) and level (0x1f) bytes are constants here rather than values read from the SPS - confirm they match what the encoder produces. */ PushBox(output, DWORD_BE('avcC')); output.OutputByte(1); //version output.OutputByte(100); //h264 profile ID output.OutputByte(0); //h264 compatible profiles output.OutputByte(0x1f); //h264 level output.OutputByte(0xff); //reserved output.OutputByte(0xe1); //first half-byte = no clue. second half = sps count output.OutputWord(fastHtons(SPS.Num())); //sps size output.Serialize(SPS.Array(), SPS.Num()); //sps data output.OutputByte(1); //pps count output.OutputWord(fastHtons(PPS.Num())); //pps size output.Serialize(PPS.Array(), PPS.Num()); //pps data PopBox(); //avcC PopBox(); //avc1 PopBox(); //stsd PushBox(output, DWORD_BE('stts')); //frame times output.OutputDword(0); //version and flags (none) output.OutputDword(fastHtonl(decodeTimes.Num())); /* NOTE(review): text is missing after the loop header below. The rest of the video sample tables, the choice between 32-bit and 64-bit chunk offsets, and the start of the audio track are only partly visible; the text resumes inside the audio mdhd box. */ for(UINT i=0; i 0xFFFFFFFFLL) { PushBox(output, DWORD_BE('co64')); //chunk offsets output.OutputDword(0); //version and flags (none) output.OutputDword(fastHtonl(videoChunks.Num())); for(UINT i=0; iGetAudioEncoder()->GetFrameSize())); output.OutputDword(DWORD_BE(0x55c40000)); PopBox(); //mdhd PushBox(output, DWORD_BE('hdlr')); output.OutputDword(0); //version and flags (none) output.OutputDword(0); //quicktime type (none) output.OutputDword(DWORD_BE('soun')); //media type output.OutputDword(0); //manufacturer reserved output.OutputDword(0); //quicktime component reserved flags output.OutputDword(0); //quicktime component reserved mask output.Serialize((LPVOID)lpAudioTrack, (DWORD)strlen(lpAudioTrack)+1); //track name PopBox(); //hdlr 
PushBox(output, DWORD_BE('minf')); PushBox(output, DWORD_BE('smhd')); output.OutputDword(0); //version and flags (none) output.OutputDword(0); //balance (fixed point) PopBox(); //smhd PushBox(output, DWORD_BE('dinf')); PushBox(output, DWORD_BE('dref')); output.OutputDword(0); //version and flags (none) output.OutputDword(DWORD_BE(1)); //count PushBox(output, DWORD_BE('url ')); output.OutputDword(DWORD_BE(0x00000001)); //version (0) and flags (1) PopBox(); //url PopBox(); //dref PopBox(); //dinf PushBox(output, DWORD_BE('stbl')); PushBox(output, DWORD_BE('stsd')); output.OutputDword(0); //version and flags (none) output.OutputDword(DWORD_BE(1)); //count /* mp4a sample entry. NOTE(review): channel count (2), sample size (16) and sample rate (44100) are constants here, not values queried from the audio encoder. */ PushBox(output, DWORD_BE('mp4a')); output.OutputDword(0); //reserved (6 bytes) output.OutputWord(0); output.OutputWord(WORD_BE(1)); //dref index output.OutputWord(0); //quicktime encoding version output.OutputWord(0); //quicktime encoding revision output.OutputDword(0); //quicktime audio encoding vendor output.OutputWord(WORD_BE(2)); //channels output.OutputWord(WORD_BE(16)); //sample size output.OutputWord(0); //quicktime audio compression id output.OutputWord(0); //quicktime audio packet size output.OutputDword(DWORD_BE(44100<<16)); //sample rate (fixed point) PushBox(output, DWORD_BE('esds')); output.OutputDword(0); //version and flags (none) output.OutputByte(3); //ES descriptor type output.OutputByte(0x80); output.OutputByte(0x80); output.OutputByte(0x80); output.OutputByte(esDescriptor.Num()); output.Serialize((LPVOID)esDescriptor.Array(), esDescriptor.Num()); PopBox(); /* esds */ PopBox(); /* mp4a */ PopBox(); //stsd PushBox(output, DWORD_BE('stts')); //time-to-sample table: one entry giving every audio frame the same duration output.OutputDword(0); //version and flags (none) output.OutputDword(DWORD_BE(1)); /* entry count */ output.OutputDword(fastHtonl(audioFrames.Num())); /* number of frames covered by the entry */ output.OutputDword(fastHtonl(App->GetAudioEncoder()->GetFrameSize())); /* duration per frame, taken from the encoder frame size */ PopBox(); //stts PushBox(output, DWORD_BE('stsc')); //sample to chunk list output.OutputDword(0); //version and flags (none) 
output.OutputDword(fastHtonl(audioSampleToChunk.Num())); /* NOTE(review): text is missing after both loop headers below. The remaining audio sample tables, the end of the moov-writing code and the head of the packet-adding member are not visible; the text resumes inside that member, in the branch that handles video data. */ for(UINT i=0; i 0xFFFFFFFFLL) { PushBox(output, DWORD_BE('co64')); //chunk offsets output.OutputDword(0); //version and flags (none) output.OutputDword(fastHtonl(audioChunks.Num())); for(UINT i=0; i= 0x80) timeOffset |= 0xFF; /* the composition offset is byte-swapped here, so MP4VideoFrameInfo.compositionOffset holds it in swapped form */ timeOffset = (INT)fastHtonl(DWORD(timeOffset)); /* a first data byte of 0x17 marks an i-frame; its 1-based frame number is recorded byte-swapped */ if(data[0] == 0x17) //i-frame IFrameIDs << fastHtonl(videoFrames.Num()+1); /* new frame: record where it was written, how many bytes were copied, and its timing */ MP4VideoFrameInfo frameInfo; frameInfo.fileOffset = offset; frameInfo.size = totalCopied; frameInfo.timestamp = timestamp; frameInfo.compositionOffset = timeOffset; videoFrames << frameInfo; } else /* otherwise the bytes are added to the size of the most recent frame */ videoFrames.Last().size += totalCopied; lastVideoTimestamp = timestamp; } } }; /* CreateMP4FileStream: allocates an MP4FileStream and calls Init on lpFile. Returns the stream on success; on failure the object is deleted and NULL is returned. The caller owns the returned pointer. */ VideoFileStream* CreateMP4FileStream(CTSTR lpFile) { MP4FileStream *fileStream = new MP4FileStream; if(fileStream->Init(lpFile)) return fileStream; delete fileStream; return NULL; }