/********************************************************************************
 Copyright (C) 2012 Hugh Bailey

 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.

 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program; if not, write to the Free Software
 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
********************************************************************************/


#include "Main.h"
#include <time.h>

// NOTE(review): the copy of this file that was reviewed had every "<...>" span removed
// (template argument lists, the system #include, loop conditions). <time.h>, the
// template header on GetChunkInfo and the List<> element types below were re-derived
// from how the values are used in this file -- confirm against version control.
// Three spans inside ~MP4FileStream could not be re-derived; they are left exactly as
// found and are marked with NOTE(review) where they occur.


// Current time plus 2082844800 seconds (the distance between 1904-01-01 and
// 1970-01-01). Used for the creation/modification time fields written below.
time_t GetMacTime()
{
    return time(0)+2082844800;
}

// One sample-to-chunk entry: starting at chunk number firstChunkID (1-based: it is
// taken from chunks.Num() right after the chunk was appended), every chunk holds
// samplesPerChunk samples.
struct SampleToChunk
{
    UINT firstChunkID;
    UINT samplesPerChunk;
};

// Run-length pair: 'count' consecutive samples share the value 'val'.
struct OffsetVal
{
    UINT count;
    UINT val;
};

// Bookkeeping for one stored video sample.
struct MP4VideoFrameInfo
{
    UINT64 fileOffset;          // position in the output file where the sample starts
    UINT size;                  // number of bytes written for the sample
    UINT timestamp;             // timestamp handed to AddPacket
    INT compositionOffset;      // signed 24-bit value decoded from the packet header
};

// Bookkeeping for one stored audio sample.
struct MP4AudioFrameInfo
{
    UINT64 fileOffset;
    UINT size;
    UINT timestamp;
};

#define USE_64BIT_MP4 1

// Scales 'timestamp' by 44100/1000 (presumably milliseconds -> samples at 44100 Hz;
// the audio 'mdhd' below uses a time scale of 44100) and returns whichever is larger,
// the scaled value or minVal.
//
// The intermediate is kept as UINT64: the product no longer fits in 32 bits once
// timestamp exceeds roughly 97.39 million, and the previous UINT local silently
// wrapped in that case.
inline UINT64 ConvertToAudioTime(DWORD timestamp, UINT64 minVal)
{
    UINT64 val = UINT64(timestamp)*44100/1000;
    return MAX(val, minVal);
}

//code annoyance rating: nightmarish

// Writes an MP4 file. Init() emits the 'ftyp', 'free' and 'mdat' headers, AddPacket()
// appends sample data while recording per-sample bookkeeping, and the destructor
// serializes the 'moov' box into endBuffer from that bookkeeping.
class MP4FileStream : public VideoFileStream
{
    XFileOutputSerializer fileOut;
    String strFile;

    List<MP4VideoFrameInfo> videoFrames;
    List<MP4AudioFrameInfo> audioFrames;

    List<UINT> IFrameIDs;               // 1-based sample numbers of i-frames, already byte-swapped

    DWORD lastVideoTimestamp;

    bool bStreamOpened;                 // set by Init() on success; the destructor does nothing otherwise
    bool bMP3;

    List<BYTE> endBuffer;               // backing store for the 'moov' box built in the destructor
    List<UINT> boxOffsets;              // stack of open box start offsets into endBuffer (index 0 = innermost)

    //chunk stuff
    UINT64 connectedAudioSampleOffset, connectedVideoSampleOffset;
    UINT64 curVideoChunkOffset, curAudioChunkOffset;
    UINT numVideoSamples, numAudioSamples;
    List<UINT64> videoChunks, audioChunks;
    List<SampleToChunk> videoSampleToChunk, audioSampleToChunk;

    //decode times and composition offsets
    UINT64 lastAudioTimeVal;
    UINT64 audioFrameSize;
    List<OffsetVal> videoDecodeTimes, audioDecodeTimes;
    List<OffsetVal> compositionOffsets;

    UINT64 mdatStart, mdatStop;

    bool bCancelMP4Build;

    bool bSentSEI;

    // Opens a box: remembers where it starts in endBuffer, then writes a zero size
    // placeholder followed by boxName. Callers pass boxName already wrapped in DWORD_BE.
    void PushBox(BufferOutputSerializer &output, DWORD boxName)
    {
        boxOffsets.Insert(0, endBuffer.Num());

        output.OutputDword(0);
        output.OutputDword(boxName);
    }

    // Closes the innermost open box: patches its size placeholder with the number of
    // bytes written since the matching PushBox (byte-swapped with fastHtonl).
    void PopBox()
    {
        DWORD boxSize = endBuffer.Num()-boxOffsets[0];
        *(DWORD*)(endBuffer.Array()+boxOffsets[0]) = fastHtonl(boxSize);

        boxOffsets.Remove(0);
    }

    // Dialog procedure for the "building MP4" progress dialog. WM_INITDIALOG stores
    // lParam (the owning MP4FileStream, or 0) in DWLP_USER; confirming IDCANCEL sets
    // bCancelMP4Build on that object.
    static INT_PTR CALLBACK MP4ProgressDialogProc(HWND hwnd, UINT message, WPARAM wParam, LPARAM lParam)
    {
        switch(message)
        {
            case WM_INITDIALOG:
                LocalizeWindow(hwnd);
                SetWindowLongPtr(hwnd, DWLP_USER, (LONG_PTR)lParam);
                return TRUE;

            case WM_COMMAND:
                switch(LOWORD(wParam))
                {
                    case IDCANCEL:
                        if(MessageBox(hwnd, Str("MP4ProgressDialog.ConfirmStop"), Str("MP4ProgressDialog.ConfirmStopTitle"), MB_YESNO) == IDYES)
                        {
                            MP4FileStream *fileStream = (MP4FileStream*)GetWindowLongPtr(hwnd, DWLP_USER);

                            // DWLP_USER is 0 if the dialog was created without an
                            // init parameter; never write through a null pointer.
                            if(fileStream)
                                fileStream->bCancelMP4Build = true;

                            EndDialog(hwnd, IDCANCEL);
                        }
                        break;
                }
        }

        return 0;
    }

public:
    // Creates lpFile and writes the fixed file prologue. Returns false if the file
    // cannot be opened; on success marks the stream as opened.
    bool Init(CTSTR lpFile)
    {
        strFile = lpFile;

        if(!fileOut.Open(lpFile, XFILE_CREATEALWAYS, 1024*1024))
            return false;

        // 'ftyp' box: 0x20 bytes = the eight dwords written here
        fileOut.OutputDword(DWORD_BE(0x20));
        fileOut.OutputDword(DWORD_BE('ftyp'));
        fileOut.OutputDword(DWORD_BE('isom'));
        fileOut.OutputDword(DWORD_BE(0x200));
        fileOut.OutputDword(DWORD_BE('isom'));
        fileOut.OutputDword(DWORD_BE('iso2'));
        fileOut.OutputDword(DWORD_BE('avc1'));
        fileOut.OutputDword(DWORD_BE('mp41'));

        // empty 'free' box (header only)
        fileOut.OutputDword(DWORD_BE(0x8));
        fileOut.OutputDword(DWORD_BE('free'));

        // 'mdat' header; its position is remembered in mdatStart. The size field is
        // written as 1 and, in the 64-bit build, followed by an 8-byte placeholder
        // (presumably the extended box size, filled in later -- that code is not
        // visible in this copy of the file).
        mdatStart = fileOut.GetPos();
        fileOut.OutputDword(DWORD_BE(0x1));
        fileOut.OutputDword(DWORD_BE('mdat'));
#ifdef USE_64BIT_MP4
        fileOut.OutputQword(0);
#endif

        bMP3 = scmp(App->GetAudioEncoder()->GetCodec(), TEXT("MP3")) == 0;

        audioFrameSize = App->GetAudioEncoder()->GetFrameSize();

        bStreamOpened = true;

        return true;
    }

    // Chunk bookkeeping for one newly written sample ('data' needs fileOffset and size).
    // Samples that directly follow each other in the file belong to the same chunk.
    // When this sample does not start where the previous one ended, the running chunk
    // is closed: its offset is appended to 'chunks' and, if its sample count differs
    // from the last sampleToChunks entry, a new entry is appended.
    //   index                  number of samples of this kind stored before this one
    //   curChunkOffset         file offset of the chunk currently being accumulated
    //   connectedSampleOffset  file offset just past the previous sample
    //   numSamples             samples accumulated in the current chunk
    template<typename T>
    void GetChunkInfo(const T &data, UINT index, List<UINT64> &chunks, List<SampleToChunk> &sampleToChunks,
                      UINT64 &curChunkOffset, UINT64 &connectedSampleOffset, UINT &numSamples)
    {
        UINT64 curOffset = data.fileOffset;
        if(index == 0)
            curChunkOffset = curOffset;
        else
        {
            if(curOffset != connectedSampleOffset)
            {
                chunks << curChunkOffset;
                if(!sampleToChunks.Num() || sampleToChunks.Last().samplesPerChunk != numSamples)
                {
                    SampleToChunk stc;
                    stc.firstChunkID    = chunks.Num();
                    stc.samplesPerChunk = numSamples;
                    sampleToChunks << stc;
                }

                curChunkOffset = curOffset;
                numSamples = 0;
            }
        }

        numSamples++;
        connectedSampleOffset = curOffset+data.size;
    }

    // Closes the chunk that is still being accumulated once no more samples will
    // arrive; same bookkeeping as the "close" branch of GetChunkInfo.
    inline void EndChunkInfo(List<UINT64> &chunks, List<SampleToChunk> &sampleToChunks, UINT64 &curChunkOffset, UINT &numSamples)
    {
        chunks << curChunkOffset;
        if(!sampleToChunks.Num() || sampleToChunks.Last().samplesPerChunk != numSamples)
        {
            SampleToChunk stc;
            stc.firstChunkID    = chunks.Num();
            stc.samplesPerChunk = numSamples;
            sampleToChunks << stc;
        }
    }

    // Records the duration and composition offset of the most recently stored video
    // frame (videoFrames.Last()), both run-length encoded.
    //   bLast == false: duration = videoFrame.timestamp - videoFrames.Last().timestamp
    //   bLast == true : the previously recorded duration is repeated
    // Requires videoFrames to be non-empty, and videoDecodeTimes to be non-empty when
    // bLast is true.
    void GetVideoDecodeTime(MP4VideoFrameInfo &videoFrame, bool bLast)
    {
        UINT frameTime;
        if(bLast)
            frameTime = videoDecodeTimes.Last().val;
        else
            frameTime = videoFrame.timestamp-videoFrames.Last().timestamp;

        if(!videoDecodeTimes.Num() || videoDecodeTimes.Last().val != (UINT)frameTime)
        {
            OffsetVal newVal;
            newVal.count = 1;
            newVal.val   = (UINT)frameTime;
            videoDecodeTimes << newVal;
        }
        else
            videoDecodeTimes.Last().count++;

        INT compositionOffset = videoFrames.Last().compositionOffset;
        if(!compositionOffsets.Num() || compositionOffsets.Last().val != (UINT)compositionOffset)
        {
            OffsetVal newVal;
            newVal.count = 1;
            newVal.val   = (UINT)compositionOffset;
            compositionOffsets << newVal;
        }
        else
            compositionOffsets.Last().count++;
    }

    // Records the duration of one audio frame, run-length encoded.
    //   bLast == false: the running audio time advances by audioFrameSize, or up to
    //                   the converted timestamp of audioFrame if that is later (only
    //                   once more than one frame is stored); the advance is recorded.
    //   bLast == true : the previously recorded duration is repeated; requires
    //                   audioDecodeTimes to be non-empty.
    void GetAudioDecodeTime(MP4AudioFrameInfo &audioFrame, bool bLast)
    {
        UINT frameTime;
        if(bLast)
            frameTime = audioDecodeTimes.Last().val;
        else
        {
            UINT64 newTimeVal = lastAudioTimeVal+audioFrameSize;
            if(audioFrames.Num() > 1)
            {
                UINT64 convertedTime = ConvertToAudioTime(audioFrame.timestamp, audioFrameSize*audioFrames.Num());
                if(convertedTime > newTimeVal)
                    newTimeVal = convertedTime;
            }

            frameTime = UINT(newTimeVal - lastAudioTimeVal);

            lastAudioTimeVal = newTimeVal;
        }

        if(!audioDecodeTimes.Num() || audioDecodeTimes.Last().val != (UINT)frameTime)
        {
            OffsetVal newVal;
            newVal.count = 1;
            newVal.val   = (UINT)frameTime;
            audioDecodeTimes << newVal;
        }
        else
            audioDecodeTimes.Last().count++;
    }

    // Finalizes the file: shows the progress dialog and serializes the 'moov' box
    // into endBuffer from the bookkeeping gathered by AddPacket. Does nothing if
    // Init() did not succeed.
    //
    // NOTE(review): videoFrames.Last() / audioFrames.Last() below are evaluated
    // unconditionally, and the bLast paths read videoDecodeTimes.Last() /
    // audioDecodeTimes.Last(); a stream closed with fewer than two video or audio
    // packets looks unsafe here -- confirm how List::Last() behaves on an empty list.
    ~MP4FileStream()
    {
        if(!bStreamOpened)
            return;

        App->EnableSceneSwitching(false);

        //---------------------------------------------------

        // Pass 'this' as the init parameter: plain CreateDialog hands WM_INITDIALOG an
        // lParam of 0, which left the dialog procedure with a null MP4FileStream*.
        HWND hwndProgressDialog = CreateDialogParam(hinstMain, MAKEINTRESOURCE(IDD_BUILDINGMP4), hwndMain, (DLGPROC)MP4ProgressDialogProc, (LPARAM)this);
        SendMessage(GetDlgItem(hwndProgressDialog, IDC_PROGRESS1), PBM_SETRANGE32, 0, 100);

        mdatStop = fileOut.GetPos();

        BufferOutputSerializer output(endBuffer);

        // note: this local shadows the member of the same name
        UINT64 audioFrameSize = App->GetAudioEncoder()->GetFrameSize();

        DWORD macTime = fastHtonl(DWORD(GetMacTime()));
        UINT videoDuration = fastHtonl(lastVideoTimestamp + App->GetFrameTime());
        UINT audioDuration = fastHtonl(lastVideoTimestamp + DWORD(double(audioFrameSize)/44.1));
        UINT width, height;
        App->GetOutputSize(width, height);

        LPCSTR lpVideoTrack = "Video Media Handler";
        LPCSTR lpAudioTrack = "Sound Media Handler";

        char videoCompressionName[31] = "AVC Coding";

        //-------------------------------------------
        // get video headers
        DataPacket videoHeaders;
        App->GetVideoHeaders(videoHeaders);
        List<BYTE> SPS, PPS;

        // each parameter set is a 2-byte length (byte-swapped with fastHtons) followed by its payload
        LPBYTE lpHeaderData = videoHeaders.lpPacket+11;
        SPS.CopyArray(lpHeaderData+2, fastHtons(*(WORD*)lpHeaderData));

        lpHeaderData += SPS.Num()+3;
        PPS.CopyArray(lpHeaderData+2, fastHtons(*(WORD*)lpHeaderData));

        //-------------------------------------------
        // get AAC headers if using AAC
        List<BYTE> AACHeader;
        if(!bMP3)
        {
            DataPacket data;
            App->GetAudioHeaders(data);
            AACHeader.CopyArray(data.lpPacket+2, data.size-2);
        }

        //-------------------------------------------

        EndChunkInfo(videoChunks, videoSampleToChunk, curVideoChunkOffset, numVideoSamples);
        EndChunkInfo(audioChunks, audioSampleToChunk, curAudioChunkOffset, numAudioSamples);

        GetVideoDecodeTime(videoFrames.Last(), true);
        GetAudioDecodeTime(audioFrames.Last(), true);

        UINT audioUnitDuration = fastHtonl(UINT(lastAudioTimeVal));

        SendMessage(GetDlgItem(hwndProgressDialog, IDC_PROGRESS1), PBM_SETPOS, 25, 0);

        //-------------------------------------------
        // sound descriptor thingy.  this part made me die a little inside admittedly.
        UINT maxBitRate = fastHtonl(App->GetAudioEncoder()->GetBitRate()*1000);

        List<BYTE> esDecoderDescriptor;
        BufferOutputSerializer esDecoderOut(esDecoderDescriptor);
        esDecoderOut.OutputByte(bMP3 ? 107 : 64);
        esDecoderOut.OutputByte(0x15); //stream/type flags.  always 0x15 for my purposes.
        esDecoderOut.OutputByte(0); //buffer size, just set it to 1536 for both mp3 and aac
        esDecoderOut.OutputWord(WORD_BE(0x600));
        esDecoderOut.OutputDword(maxBitRate); //max bit rate (cue bill 'o reily meme for these two)
        esDecoderOut.OutputDword(maxBitRate); //avg bit rate

        if(!bMP3) //if AAC, put in headers
        {
            esDecoderOut.OutputByte(0x5);  //decoder specific descriptor type
            /*esDecoderOut.OutputByte(0x80); //some stuff that no one should probably care about
            esDecoderOut.OutputByte(0x80);
            esDecoderOut.OutputByte(0x80);*/
            esDecoderOut.OutputByte(AACHeader.Num());
            esDecoderOut.Serialize((LPVOID)AACHeader.Array(), AACHeader.Num());
        }

        List<BYTE> esDescriptor;
        BufferOutputSerializer esOut(esDescriptor);
        esOut.OutputWord(0); //es id
        esOut.OutputByte(0); //stream priority
        esOut.OutputByte(4); //descriptor type
        /*esOut.OutputByte(0x80); //some stuff that no one should probably care about
        esOut.OutputByte(0x80);
        esOut.OutputByte(0x80);*/
        esOut.OutputByte(esDecoderDescriptor.Num());
        esOut.Serialize((LPVOID)esDecoderDescriptor.Array(), esDecoderDescriptor.Num());
        esOut.OutputByte(0x6);  //config descriptor type
        /*esOut.OutputByte(0x80); //some stuff that no one should probably care about
        esOut.OutputByte(0x80);
        esOut.OutputByte(0x80);*/
        esOut.OutputByte(1); //len
        esOut.OutputByte(2); //SL value(? always 2)

        //-------------------------------------------

        PushBox(output, DWORD_BE('moov'));

          //------------------------------------------------------
          // header
          PushBox(output, DWORD_BE('mvhd'));
            output.OutputDword(0); //version and flags (none)
            output.OutputDword(macTime); //creation time
            output.OutputDword(macTime); //modified time
            output.OutputDword(DWORD_BE(1000)); //time base (milliseconds, so 1000)
            output.OutputDword(videoDuration); //duration (in time base units)
            output.OutputDword(DWORD_BE(0x00010000)); //fixed point playback speed 1.0
            output.OutputWord(WORD_BE(0x0100)); //fixed point vol 1.0
            output.OutputQword(0); //reserved (10 bytes)
            output.OutputWord(0);
            output.OutputDword(DWORD_BE(0x00010000)); output.OutputDword(DWORD_BE(0x00000000)); output.OutputDword(DWORD_BE(0x00000000)); //window matrix row 1 (1.0, 0.0, 0.0)
            output.OutputDword(DWORD_BE(0x00000000)); output.OutputDword(DWORD_BE(0x00010000)); output.OutputDword(DWORD_BE(0x00000000)); //window matrix row 2 (0.0, 1.0, 0.0)
            output.OutputDword(DWORD_BE(0x00000000)); output.OutputDword(DWORD_BE(0x00000000)); output.OutputDword(DWORD_BE(0x40000000)); //window matrix row 3 (0.0, 0.0, 16384.0)
            output.OutputDword(0); //preview start time (time base units)
            output.OutputDword(0); //preview duration (time base units)
            output.OutputDword(0); //still poster frame (timestamp of frame)
            output.OutputDword(0); //selection(?) start time (time base units)
            output.OutputDword(0); //selection(?) duration (time base units)
            output.OutputDword(0); //current time (0, time base units)
            output.OutputDword(DWORD_BE(3)); //next free track id (1-based rather than 0-based)
          PopBox(); //mvhd

          //------------------------------------------------------
          // audio track
          PushBox(output, DWORD_BE('trak'));
            PushBox(output, DWORD_BE('tkhd')); //track header
              output.OutputDword(DWORD_BE(0x00000007)); //version (0) and flags (0x7)
              output.OutputDword(macTime); //creation time
              output.OutputDword(macTime); //modified time
              output.OutputDword(DWORD_BE(1)); //track ID
              output.OutputDword(0); //reserved
              output.OutputDword(audioDuration); //duration (in time base units)
              output.OutputQword(0); //reserved
              output.OutputWord(0); //video layer (0)
              output.OutputWord(WORD_BE(0)); //quicktime alternate track id
              output.OutputWord(WORD_BE(0x0100)); //volume
              output.OutputWord(0); //reserved
              output.OutputDword(DWORD_BE(0x00010000)); output.OutputDword(DWORD_BE(0x00000000)); output.OutputDword(DWORD_BE(0x00000000)); //window matrix row 1 (1.0, 0.0, 0.0)
              output.OutputDword(DWORD_BE(0x00000000)); output.OutputDword(DWORD_BE(0x00010000)); output.OutputDword(DWORD_BE(0x00000000)); //window matrix row 2 (0.0, 1.0, 0.0)
              output.OutputDword(DWORD_BE(0x00000000)); output.OutputDword(DWORD_BE(0x00000000)); output.OutputDword(DWORD_BE(0x40000000)); //window matrix row 3 (0.0, 0.0, 16384.0)
              output.OutputDword(0); //width (fixed point)
              output.OutputDword(0); //height (fixed point)
            PopBox(); //tkhd
            /*PushBox(output, DWORD_BE('edts'));
              PushBox(output, DWORD_BE('elst'));
                output.OutputDword(0); //version and flags (none)
                output.OutputDword(DWORD_BE(1)); //count
                output.OutputDword(audioDuration); //duration
                output.OutputDword(0); //start time
                output.OutputDword(DWORD_BE(0x00010000)); //playback speed (1.0)
              PopBox(); //elst
            PopBox(); //edts*/
            PushBox(output, DWORD_BE('mdia'));
              PushBox(output, DWORD_BE('mdhd'));
                output.OutputDword(0); //version and flags (none)
                output.OutputDword(macTime); //creation time
                output.OutputDword(macTime); //modified time
                output.OutputDword(DWORD_BE(44100)); //time scale
                output.OutputDword(audioUnitDuration);
                output.OutputDword(bMP3 ? DWORD_BE(0x55c40000) : DWORD_BE(0x15c70000));
              PopBox(); //mdhd
              PushBox(output, DWORD_BE('hdlr'));
                output.OutputDword(0); //version and flags (none)
                output.OutputDword(0); //quicktime type (none)
                output.OutputDword(DWORD_BE('soun')); //media type
                output.OutputDword(0); //manufacturer reserved
                output.OutputDword(0); //quicktime component reserved flags
                output.OutputDword(0); //quicktime component reserved mask
                output.Serialize((LPVOID)lpAudioTrack, (DWORD)strlen(lpAudioTrack)+1); //track name
              PopBox(); //hdlr
              PushBox(output, DWORD_BE('minf'));
                PushBox(output, DWORD_BE('smhd'));
                  output.OutputDword(0); //version and flags (none)
                  output.OutputDword(0); //balance (fixed point)
                PopBox(); //smhd
                PushBox(output, DWORD_BE('dinf'));
                  PushBox(output, DWORD_BE('dref'));
                    output.OutputDword(0); //version and flags (none)
                    output.OutputDword(DWORD_BE(1)); //count
                    PushBox(output, DWORD_BE('url '));
                      output.OutputDword(DWORD_BE(0x00000001)); //version (0) and flags (1)
                    PopBox(); //url
                  PopBox(); //dref
                PopBox(); //dinf
                PushBox(output, DWORD_BE('stbl'));
                  PushBox(output, DWORD_BE('stsd'));
                    output.OutputDword(0); //version and flags (none)
                    output.OutputDword(DWORD_BE(1)); //count
                    PushBox(output, DWORD_BE('mp4a'));
                      output.OutputDword(0); //reserved (6 bytes)
                      output.OutputWord(0);
                      output.OutputWord(WORD_BE(1)); //dref index
                      output.OutputWord(0); //quicktime encoding version
                      output.OutputWord(0); //quicktime encoding revision
                      output.OutputDword(0); //quicktime audio encoding vendor
                      output.OutputWord(0); //channels (ignored)
                      output.OutputWord(WORD_BE(16)); //sample size
                      output.OutputWord(0); //quicktime audio compression id
                      output.OutputWord(0); //quicktime audio packet size
                      output.OutputDword(DWORD_BE(44100<<16)); //sample rate (fixed point)
                      PushBox(output, DWORD_BE('esds'));
                        output.OutputDword(0); //version and flags (none)
                        output.OutputByte(3); //ES descriptor type
                        /*output.OutputByte(0x80);
                        output.OutputByte(0x80);
                        output.OutputByte(0x80);*/
                        output.OutputByte(esDescriptor.Num());
                        output.Serialize((LPVOID)esDescriptor.Array(), esDescriptor.Num());
                      PopBox();
                    PopBox();
                  PopBox(); //stsd
                  PushBox(output, DWORD_BE('stts')); //run-length list of audio decode durations (audioDecodeTimes)
                    output.OutputDword(0); //version and flags (none)
                    output.OutputDword(fastHtonl(audioDecodeTimes.Num()));
                    // NOTE(review): source text is missing between "i" and " 0xFFFFFFFFLL)" on
                    // the next line (everything from a '<' to the following '>' was stripped).
                    // The fragment is left exactly as found; restore it from version control.
                    for(UINT i=0; i 0xFFFFFFFFLL)
                  {
                      PushBox(output, DWORD_BE('co64')); //chunk offsets
                        output.OutputDword(0); //version and flags (none)
                        output.OutputDword(fastHtonl(audioChunks.Num()));
                        // NOTE(review): second stripped span -- same situation as above. The text
                        // lost here runs from this loop condition to a comparison against
                        // 0xFFFFFFFFLL that precedes the video 'co64' box below.
                        for(UINT i=0; i 0xFFFFFFFFLL)
                  {
                      PushBox(output, DWORD_BE('co64')); //chunk offsets
                        output.OutputDword(0); //version and flags (none)
                        output.OutputDword(fastHtonl(videoChunks.Num()));
                        // NOTE(review): third stripped span -- everything from this loop condition
                        // up to the final EnableSceneSwitching(true) call is missing. The fragment
                        // is left exactly as found; restore it from version control.
                        for(UINT i=0; iEnableSceneSwitching(true);

        DestroyWindow(hwndProgressDialog);
    }

    // Appends one encoded packet to the file and records its bookkeeping.
    //   Audio: the first byte (MP3) or first two bytes (otherwise) are skipped and the
    //          rest is written as one sample.
    //   Video: a packet starting 0x17 0x00 is treated as the SPS/PPS header packet;
    //          any other packet is written without its first five bytes, preceded once
    //          per stream by the encoder's SEI data. Packets that share the previous
    //          packet's timestamp are folded into the previous sample.
    virtual void AddPacket(BYTE *data, UINT size, DWORD timestamp, PacketType type)
    {
        UINT64 offset = fileOut.GetPos();

        if(type == PacketType_Audio)
        {
            UINT copySize;

            if(bMP3)
            {
                copySize = size-1;
                fileOut.Serialize(data+1, copySize);
            }
            else
            {
                copySize = size-2;
                fileOut.Serialize(data+2, copySize);
            }

            MP4AudioFrameInfo audioFrame;
            audioFrame.fileOffset   = offset;
            audioFrame.size         = copySize;
            audioFrame.timestamp    = timestamp;

            GetChunkInfo(audioFrame, audioFrames.Num(), audioChunks, audioSampleToChunk,
                         curAudioChunkOffset, connectedAudioSampleOffset, numAudioSamples);

            // the duration of the previous frame is only known once the next one arrives
            if(audioFrames.Num())
                GetAudioDecodeTime(audioFrames.Last(), false);

            audioFrames << audioFrame;
        }
        else
        {
            UINT totalCopied = 0;

            if(data[0] == 0x17 && data[1] == 0) //if SPS/PPS
            {
                LPBYTE lpData = data+11;

                // two zero bytes + the stored 2-byte length + payload = 4-byte length prefix
                UINT spsSize = fastHtons(*(WORD*)lpData);
                fileOut.OutputWord(0);
                fileOut.Serialize(lpData, spsSize+2);

                lpData += spsSize+3;

                UINT ppsSize = fastHtons(*(WORD*)lpData);
                fileOut.OutputWord(0);
                fileOut.Serialize(lpData, ppsSize+2);

                totalCopied = spsSize+ppsSize+8;
            }
            else
            {
                if (!bSentSEI)
                {
                    DataPacket sei;
                    App->GetVideoEncoder()->GetSEI(sei);

                    fileOut.Serialize(sei.lpPacket, sei.size);
                    totalCopied += sei.size;

                    bSentSEI = true;
                }

                totalCopied += size-5;
                fileOut.Serialize(data+5, size-5);
            }

            if(!videoFrames.Num() || timestamp != lastVideoTimestamp)
            {
                // data[2..4] is read as a 24-bit value and sign-extended to 32 bits:
                // the three bytes go into the upper-address bytes of timeOffset, the
                // remaining byte is filled with 0xFF when data[2] has its top bit set,
                // then the whole dword is byte-swapped.
                // (assumes a little-endian host -- TODO confirm)
                INT timeOffset = 0;
                mcpy(((BYTE*)&timeOffset)+1, data+2, 3);
                if(data[2] >= 0x80)
                    timeOffset |= 0xFF;
                timeOffset = (INT)fastHtonl(DWORD(timeOffset));

                if(data[0] == 0x17) //i-frame
                    IFrameIDs << fastHtonl(videoFrames.Num()+1);

                MP4VideoFrameInfo frameInfo;
                frameInfo.fileOffset        = offset;
                frameInfo.size              = totalCopied;
                frameInfo.timestamp         = timestamp;
                frameInfo.compositionOffset = timeOffset;

                GetChunkInfo(frameInfo, videoFrames.Num(), videoChunks, videoSampleToChunk,
                             curVideoChunkOffset, connectedVideoSampleOffset, numVideoSamples);

                // the duration of the previous frame is only known once the next one arrives
                if(videoFrames.Num())
                    GetVideoDecodeTime(frameInfo, false);

                videoFrames << frameInfo;
            }
            else
                videoFrames.Last().size += totalCopied;

            lastVideoTimestamp = timestamp;
        }
    }
};


// Creates an MP4FileStream writing to lpFile. Returns NULL (after deleting the
// object) if Init() fails.
VideoFileStream* CreateMP4FileStream(CTSTR lpFile)
{
    MP4FileStream *fileStream = new MP4FileStream;
    if(fileStream->Init(lpFile))
        return fileStream;

    delete fileStream;
    return NULL;
}