2013-09-30 19:37:13 -07:00
|
|
|
/******************************************************************************
|
2014-02-13 08:58:31 -07:00
|
|
|
Copyright (C) 2013-2014 by Hugh Bailey <obs.jim@gmail.com>
|
2013-09-30 19:37:13 -07:00
|
|
|
|
|
|
|
This program is free software: you can redistribute it and/or modify
|
|
|
|
it under the terms of the GNU General Public License as published by
|
2013-12-02 22:24:38 -07:00
|
|
|
the Free Software Foundation, either version 2 of the License, or
|
2013-09-30 19:37:13 -07:00
|
|
|
(at your option) any later version.
|
|
|
|
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
GNU General Public License for more details.
|
|
|
|
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
|
|
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
******************************************************************************/
|
|
|
|
|
2017-10-03 18:48:12 -07:00
|
|
|
#include <time.h>
|
|
|
|
#include <stdlib.h>
|
|
|
|
|
2013-09-30 19:37:13 -07:00
|
|
|
#include "obs.h"
|
2014-01-26 18:48:14 -07:00
|
|
|
#include "obs-internal.h"
|
2013-09-30 19:37:13 -07:00
|
|
|
#include "graphics/vec4.h"
|
2014-02-09 05:51:06 -07:00
|
|
|
#include "media-io/format-conversion.h"
|
libobs: Redesign/optimize frame encoding handling
Previously, the design for the interaction between the encoder thread
and the graphics thread was that the encoder thread would signal to the
graphics thread when to start drawing each frame. The original idea
behind this was to prevent mutually cascading stalls of encoding or
graphics rendering (i.e., if rendering took too long, then encoding
would have to catch up, then rendering would have to catch up again, and
so on, cascading upon each other). The ultimate goal was to prevent
encoding from impacting graphics and vice versa.
However, eventually it was realized that there were some fundamental
flaws with this design.
1. Stray frame duplication. You could not guarantee that a frame would
render on time, so sometimes frames would unintentionally be lost if
there was any sort of minor hiccup or if the thread took too long to
be scheduled I'm guessing.
2. Frame timing in the rendering thread was less accurate. The only
place where frame timing was accurate was in the encoder thread, and
the graphics thread was at the whim of thread scheduling. On higher
end computers it was typically fine, but it was just generally not
guaranteed that a frame would be rendered when it was supposed to be
rendered.
So the solution (originally proposed by r1ch and paibox) is to instead
keep the encoding and graphics threads separate as usual, but instead of
the encoder thread controlling the graphics thread, the graphics thread
now controls the encoder thread. The encoder thread keeps a limited
cache of frames, then the graphics thread copies frames in to the cache
and increments a semaphore to schedule the encoder thread to encode that
data.
In the cache, each frame has an encode counter. If the frame cache is
full (e.g., the encoder taking too long to return frames), it will not
cache a new frame, but instead will just increment the counter on the
last frame in the cache to schedule that frame to encode again, ensuring
that frames are on time and reducing CPU usage by lowering video
complexity. If the graphics thread takes too long to render a frame,
then it will add that frame with the count value set to the total amount
of frames that were missed (actual legitimately duplicated frames).
Because the cache gives many frames of breathing room for the encoder to
encode frames, this design helps improve results especially when using
encoding presets that have higher complexity and CPU usage, minimizing
the risk of needlessly skipped or duplicated frames.
I also managed to sneak in what should be a bit of an optimization to
reduce copying of frame data, though how much of an optimization it
ultimately ends up being is debatable.
So to sum it up, this commit increases accuracy of frame timing,
completely removes stray frame duplication, gives better results for
higher complexity encoding presets, and potentially optimizes the frame
pipeline a tiny bit.
2014-12-31 01:53:13 -08:00
|
|
|
#include "media-io/video-frame.h"
|
2013-09-30 19:37:13 -07:00
|
|
|
|
2014-04-19 06:33:11 -07:00
|
|
|
static uint64_t tick_sources(uint64_t cur_time, uint64_t last_time)
|
2013-09-30 19:37:13 -07:00
|
|
|
{
|
libobs: Add services API, reduce repeated code
Add API for streaming services. The services API simplifies the
creation of custom service features and user interface.
Custom streaming services later on will be able to do things such as:
- Be able to use service-specific APIs via modules, allowing a more
direct means of communicating with the service and requesting or
setting service-specific information
- Get URL/stream key via other means of authentication such as OAuth,
or be able to build custom URLs for services that require that sort
of thing.
- Query information (such as viewer count, chat, follower
notifications, and other information)
- Set channel information (such as current game, current channel title,
activating commercials)
Also, I reduce some repeated code that was used for all libobs objects.
This includes the name of the object, the private data, settings, as
well as the signal and procedure handlers.
I also switched to using linked lists for the global object lists,
rather than using an array of pointers (you could say it was..
pointless.) ..Anyway, the linked list info is also stored in the shared
context data structure.
2014-04-19 20:38:53 -07:00
|
|
|
struct obs_core_data *data = &obs->data;
|
|
|
|
struct obs_source *source;
|
|
|
|
uint64_t delta_time;
|
|
|
|
float seconds;
|
2013-09-30 19:37:13 -07:00
|
|
|
|
|
|
|
if (!last_time)
|
2014-08-05 15:07:54 -07:00
|
|
|
last_time = cur_time -
|
|
|
|
video_output_get_frame_time(obs->video.video);
|
|
|
|
|
2014-04-19 06:33:11 -07:00
|
|
|
delta_time = cur_time - last_time;
|
2013-09-30 19:37:13 -07:00
|
|
|
seconds = (float)((double)delta_time / 1000000000.0);
|
|
|
|
|
libobs: Add services API, reduce repeated code
Add API for streaming services. The services API simplifies the
creation of custom service features and user interface.
Custom streaming services later on will be able to do things such as:
- Be able to use service-specific APIs via modules, allowing a more
direct means of communicating with the service and requesting or
setting service-specific information
- Get URL/stream key via other means of authentication such as OAuth,
or be able to build custom URLs for services that require that sort
of thing.
- Query information (such as viewer count, chat, follower
notifications, and other information)
- Set channel information (such as current game, current channel title,
activating commercials)
Also, I reduce some repeated code that was used for all libobs objects.
This includes the name of the object, the private data, settings, as
well as the signal and procedure handlers.
I also switched to using linked lists for the global object lists,
rather than using an array of pointers (you could say it was..
pointless.) ..Anyway, the linked list info is also stored in the shared
context data structure.
2014-04-19 20:38:53 -07:00
|
|
|
pthread_mutex_lock(&data->sources_mutex);
|
2013-09-30 19:37:13 -07:00
|
|
|
|
libobs: Refactor source volume transition design
This changes the way source volume handles transitioning between being
active and inactive states.
The previous way that transitioning handled volume was that it set the
presentation volume of the source and all of its sub-sources to 0.0 if
the source was inactive, and 1.0 if active. Transition sources would
then also set the presentation volume for sub-sources to whatever their
transitioning volume was. However, the problem with this is that the
design didn't take in to account if the source or its sub-sources were
active anywhere else, so because of that it would break if that ever
happened, and I didn't realize that when I was designing it.
So instead, this completely overhauls the design of handling
transitioning volume. Each frame, it'll go through all sources and
check whether they're active or inactive and set the base volume
accordingly. If transitions are currently active, it will actually walk
the active source tree and check whether the source is in a
transitioning state somewhere.
- If the source is a sub-source of a transition, and it's not active
outside of the transition, then the transition will control the
volume of the source.
- If the source is a sub-source of a transition, but it's also active
outside of the transition, it'll defer to whichever is louder.
This also adds a new callback to the obs_source_info structure for
transition sources, get_transition_volume, which is called to get the
transitioning volume of a sub-source.
2014-12-27 22:16:10 -08:00
|
|
|
/* call the tick function of each source */
|
libobs: Add services API, reduce repeated code
Add API for streaming services. The services API simplifies the
creation of custom service features and user interface.
Custom streaming services later on will be able to do things such as:
- Be able to use service-specific APIs via modules, allowing a more
direct means of communicating with the service and requesting or
setting service-specific information
- Get URL/stream key via other means of authentication such as OAuth,
or be able to build custom URLs for services that require that sort
of thing.
- Query information (such as viewer count, chat, follower
notifications, and other information)
- Set channel information (such as current game, current channel title,
activating commercials)
Also, I reduce some repeated code that was used for all libobs objects.
This includes the name of the object, the private data, settings, as
well as the signal and procedure handlers.
I also switched to using linked lists for the global object lists,
rather than using an array of pointers (you could say it was..
pointless.) ..Anyway, the linked list info is also stored in the shared
context data structure.
2014-04-19 20:38:53 -07:00
|
|
|
source = data->first_source;
|
|
|
|
while (source) {
|
2015-05-03 20:45:41 +02:00
|
|
|
obs_source_video_tick(source, seconds);
|
libobs: Add services API, reduce repeated code
Add API for streaming services. The services API simplifies the
creation of custom service features and user interface.
Custom streaming services later on will be able to do things such as:
- Be able to use service-specific APIs via modules, allowing a more
direct means of communicating with the service and requesting or
setting service-specific information
- Get URL/stream key via other means of authentication such as OAuth,
or be able to build custom URLs for services that require that sort
of thing.
- Query information (such as viewer count, chat, follower
notifications, and other information)
- Set channel information (such as current game, current channel title,
activating commercials)
Also, I reduce some repeated code that was used for all libobs objects.
This includes the name of the object, the private data, settings, as
well as the signal and procedure handlers.
I also switched to using linked lists for the global object lists,
rather than using an array of pointers (you could say it was..
pointless.) ..Anyway, the linked list info is also stored in the shared
context data structure.
2014-04-19 20:38:53 -07:00
|
|
|
source = (struct obs_source*)source->context.next;
|
|
|
|
}
|
|
|
|
|
|
|
|
pthread_mutex_unlock(&data->sources_mutex);
|
|
|
|
|
|
|
|
return cur_time;
|
2013-09-30 19:37:13 -07:00
|
|
|
}
|
|
|
|
|
2014-02-13 08:58:31 -07:00
|
|
|
/* in obs-display.c */
|
|
|
|
extern void render_display(struct obs_display *display);
|
2013-09-30 19:37:13 -07:00
|
|
|
|
2013-10-18 20:25:13 -07:00
|
|
|
static inline void render_displays(void)
|
|
|
|
{
|
libobs: Add services API, reduce repeated code
Add API for streaming services. The services API simplifies the
creation of custom service features and user interface.
Custom streaming services later on will be able to do things such as:
- Be able to use service-specific APIs via modules, allowing a more
direct means of communicating with the service and requesting or
setting service-specific information
- Get URL/stream key via other means of authentication such as OAuth,
or be able to build custom URLs for services that require that sort
of thing.
- Query information (such as viewer count, chat, follower
notifications, and other information)
- Set channel information (such as current game, current channel title,
activating commercials)
Also, I reduce some repeated code that was used for all libobs objects.
This includes the name of the object, the private data, settings, as
well as the signal and procedure handlers.
I also switched to using linked lists for the global object lists,
rather than using an array of pointers (you could say it was..
pointless.) ..Anyway, the linked list info is also stored in the shared
context data structure.
2014-04-19 20:38:53 -07:00
|
|
|
struct obs_display *display;
|
|
|
|
|
2014-01-23 17:00:42 -07:00
|
|
|
if (!obs->data.valid)
|
|
|
|
return;
|
|
|
|
|
2014-08-07 23:42:07 -07:00
|
|
|
gs_enter_context(obs->video.graphics);
|
2014-02-09 05:51:06 -07:00
|
|
|
|
2013-11-20 15:00:16 -07:00
|
|
|
/* render extra displays/swaps */
|
|
|
|
pthread_mutex_lock(&obs->data.displays_mutex);
|
2013-10-18 20:25:13 -07:00
|
|
|
|
libobs: Add services API, reduce repeated code
Add API for streaming services. The services API simplifies the
creation of custom service features and user interface.
Custom streaming services later on will be able to do things such as:
- Be able to use service-specific APIs via modules, allowing a more
direct means of communicating with the service and requesting or
setting service-specific information
- Get URL/stream key via other means of authentication such as OAuth,
or be able to build custom URLs for services that require that sort
of thing.
- Query information (such as viewer count, chat, follower
notifications, and other information)
- Set channel information (such as current game, current channel title,
activating commercials)
Also, I reduce some repeated code that was used for all libobs objects.
This includes the name of the object, the private data, settings, as
well as the signal and procedure handlers.
I also switched to using linked lists for the global object lists,
rather than using an array of pointers (you could say it was..
pointless.) ..Anyway, the linked list info is also stored in the shared
context data structure.
2014-04-19 20:38:53 -07:00
|
|
|
display = obs->data.first_display;
|
|
|
|
while (display) {
|
|
|
|
render_display(display);
|
|
|
|
display = display->next;
|
|
|
|
}
|
2013-10-18 20:25:13 -07:00
|
|
|
|
2013-11-20 15:00:16 -07:00
|
|
|
pthread_mutex_unlock(&obs->data.displays_mutex);
|
2013-10-18 20:25:13 -07:00
|
|
|
|
2014-08-07 23:42:07 -07:00
|
|
|
gs_leave_context();
|
2013-10-18 20:25:13 -07:00
|
|
|
}
|
|
|
|
|
2014-02-05 20:36:21 -07:00
|
|
|
static inline void set_render_size(uint32_t width, uint32_t height)
|
2013-09-30 19:37:13 -07:00
|
|
|
{
|
2014-08-07 23:42:07 -07:00
|
|
|
gs_enable_depth_test(false);
|
|
|
|
gs_set_cull_mode(GS_NEITHER);
|
2014-02-09 05:51:06 -07:00
|
|
|
|
2014-02-05 20:36:21 -07:00
|
|
|
gs_ortho(0.0f, (float)width, 0.0f, (float)height, -100.0f, 100.0f);
|
2014-08-07 23:42:07 -07:00
|
|
|
gs_set_viewport(0, 0, width, height);
|
2014-02-05 20:36:21 -07:00
|
|
|
}
|
|
|
|
|
2014-02-05 21:03:06 -07:00
|
|
|
static inline void unmap_last_surface(struct obs_core_video *video)
|
2014-02-05 20:36:21 -07:00
|
|
|
{
|
|
|
|
if (video->mapped_surface) {
|
2014-08-07 23:42:07 -07:00
|
|
|
gs_stagesurface_unmap(video->mapped_surface);
|
2014-02-05 20:36:21 -07:00
|
|
|
video->mapped_surface = NULL;
|
2013-09-30 19:37:13 -07:00
|
|
|
}
|
2014-02-05 20:36:21 -07:00
|
|
|
}
|
2013-09-30 19:37:13 -07:00
|
|
|
|
2015-07-11 08:04:46 +02:00
|
|
|
static const char *render_main_texture_name = "render_main_texture";
|
2014-02-05 21:03:06 -07:00
|
|
|
static inline void render_main_texture(struct obs_core_video *video,
|
2014-02-14 15:13:36 -07:00
|
|
|
int cur_texture)
|
2014-02-05 20:36:21 -07:00
|
|
|
{
|
2015-07-11 08:04:46 +02:00
|
|
|
profile_start(render_main_texture_name);
|
|
|
|
|
2014-02-05 20:36:21 -07:00
|
|
|
struct vec4 clear_color;
|
2014-03-07 10:19:03 -07:00
|
|
|
vec4_set(&clear_color, 0.0f, 0.0f, 0.0f, 1.0f);
|
2013-09-30 19:37:13 -07:00
|
|
|
|
2014-08-07 23:42:07 -07:00
|
|
|
gs_set_render_target(video->render_textures[cur_texture], NULL);
|
2014-02-05 20:36:21 -07:00
|
|
|
gs_clear(GS_CLEAR_COLOR, &clear_color, 1.0f, 0);
|
2013-09-30 19:37:13 -07:00
|
|
|
|
2014-02-05 20:36:21 -07:00
|
|
|
set_render_size(video->base_width, video->base_height);
|
2017-04-24 03:22:19 -07:00
|
|
|
|
|
|
|
pthread_mutex_lock(&obs->data.draw_callbacks_mutex);
|
|
|
|
|
|
|
|
for (size_t i = 0; i < obs->data.draw_callbacks.num; i++) {
|
|
|
|
struct draw_callback *callback;
|
|
|
|
callback = obs->data.draw_callbacks.array+i;
|
|
|
|
|
|
|
|
callback->draw(callback->param,
|
|
|
|
video->base_width, video->base_height);
|
|
|
|
}
|
|
|
|
|
|
|
|
pthread_mutex_unlock(&obs->data.draw_callbacks_mutex);
|
|
|
|
|
2014-02-13 10:21:16 -07:00
|
|
|
obs_view_render(&obs->data.main_view);
|
2013-09-30 19:37:13 -07:00
|
|
|
|
2014-02-05 20:36:21 -07:00
|
|
|
video->textures_rendered[cur_texture] = true;
|
2015-07-11 08:04:46 +02:00
|
|
|
|
|
|
|
profile_end(render_main_texture_name);
|
2014-02-05 20:36:21 -07:00
|
|
|
}
|
|
|
|
|
2014-12-14 23:45:44 -08:00
|
|
|
static inline gs_effect_t *get_scale_effect_internal(
|
|
|
|
struct obs_core_video *video)
|
|
|
|
{
|
2015-04-06 07:35:09 -07:00
|
|
|
/* if the dimension is under half the size of the original image,
|
|
|
|
* bicubic/lanczos can't sample enough pixels to create an accurate
|
|
|
|
* image, so use the bilinear low resolution effect instead */
|
|
|
|
if (video->output_width < (video->base_width / 2) &&
|
|
|
|
video->output_height < (video->base_height / 2)) {
|
|
|
|
return video->bilinear_lowres_effect;
|
|
|
|
}
|
|
|
|
|
2014-12-14 23:45:44 -08:00
|
|
|
switch (video->scale_type) {
|
|
|
|
case OBS_SCALE_BILINEAR: return video->default_effect;
|
|
|
|
case OBS_SCALE_LANCZOS: return video->lanczos_effect;
|
2016-06-29 06:08:54 -07:00
|
|
|
case OBS_SCALE_BICUBIC:
|
|
|
|
default:;
|
2014-12-14 23:45:44 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
return video->bicubic_effect;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline bool resolution_close(struct obs_core_video *video,
|
|
|
|
uint32_t width, uint32_t height)
|
|
|
|
{
|
|
|
|
long width_cmp = (long)video->base_width - (long)width;
|
|
|
|
long height_cmp = (long)video->base_height - (long)height;
|
|
|
|
|
|
|
|
return labs(width_cmp) <= 16 && labs(height_cmp) <= 16;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline gs_effect_t *get_scale_effect(struct obs_core_video *video,
|
|
|
|
uint32_t width, uint32_t height)
|
|
|
|
{
|
|
|
|
if (resolution_close(video, width, height)) {
|
|
|
|
return video->default_effect;
|
|
|
|
} else {
|
|
|
|
/* if the scale method couldn't be loaded, use either bicubic
|
|
|
|
* or bilinear by default */
|
|
|
|
gs_effect_t *effect = get_scale_effect_internal(video);
|
|
|
|
if (!effect)
|
|
|
|
effect = !!video->bicubic_effect ?
|
|
|
|
video->bicubic_effect :
|
|
|
|
video->default_effect;
|
|
|
|
return effect;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-07-11 08:04:46 +02:00
|
|
|
static const char *render_output_texture_name = "render_output_texture";
|
2014-02-05 21:03:06 -07:00
|
|
|
static inline void render_output_texture(struct obs_core_video *video,
|
2014-02-05 20:36:21 -07:00
|
|
|
int cur_texture, int prev_texture)
|
|
|
|
{
|
2015-07-11 08:04:46 +02:00
|
|
|
profile_start(render_output_texture_name);
|
|
|
|
|
2014-09-25 17:44:05 -07:00
|
|
|
gs_texture_t *texture = video->render_textures[prev_texture];
|
|
|
|
gs_texture_t *target = video->output_textures[cur_texture];
|
2014-08-07 23:42:07 -07:00
|
|
|
uint32_t width = gs_texture_get_width(target);
|
|
|
|
uint32_t height = gs_texture_get_height(target);
|
2014-12-14 23:45:44 -08:00
|
|
|
struct vec2 base_i;
|
|
|
|
|
|
|
|
vec2_set(&base_i,
|
|
|
|
1.0f / (float)video->base_width,
|
|
|
|
1.0f / (float)video->base_height);
|
2014-02-05 20:36:21 -07:00
|
|
|
|
2014-12-14 23:45:44 -08:00
|
|
|
gs_effect_t *effect = get_scale_effect(video, width, height);
|
2014-09-25 17:44:05 -07:00
|
|
|
gs_technique_t *tech = gs_effect_get_technique(effect, "DrawMatrix");
|
|
|
|
gs_eparam_t *image = gs_effect_get_param_by_name(effect, "image");
|
|
|
|
gs_eparam_t *matrix = gs_effect_get_param_by_name(effect,
|
2014-08-07 23:42:07 -07:00
|
|
|
"color_matrix");
|
2014-12-14 23:45:44 -08:00
|
|
|
gs_eparam_t *bres_i = gs_effect_get_param_by_name(effect,
|
|
|
|
"base_dimension_i");
|
2014-02-05 20:36:21 -07:00
|
|
|
size_t passes, i;
|
|
|
|
|
|
|
|
if (!video->textures_rendered[prev_texture])
|
2015-07-11 08:04:46 +02:00
|
|
|
goto end;
|
2014-02-05 20:36:21 -07:00
|
|
|
|
2014-08-07 23:42:07 -07:00
|
|
|
gs_set_render_target(target, NULL);
|
2014-02-05 20:36:21 -07:00
|
|
|
set_render_size(width, height);
|
|
|
|
|
2014-12-14 23:45:44 -08:00
|
|
|
if (bres_i)
|
|
|
|
gs_effect_set_vec2(bres_i, &base_i);
|
|
|
|
|
2014-12-11 19:51:30 -08:00
|
|
|
gs_effect_set_val(matrix, video->color_matrix, sizeof(float) * 16);
|
2014-08-07 23:42:07 -07:00
|
|
|
gs_effect_set_texture(image, texture);
|
2014-02-05 20:36:21 -07:00
|
|
|
|
2014-10-14 17:40:34 +02:00
|
|
|
gs_enable_blending(false);
|
2014-08-07 23:42:07 -07:00
|
|
|
passes = gs_technique_begin(tech);
|
2014-02-05 20:36:21 -07:00
|
|
|
for (i = 0; i < passes; i++) {
|
2014-08-07 23:42:07 -07:00
|
|
|
gs_technique_begin_pass(tech, i);
|
2014-02-05 20:36:21 -07:00
|
|
|
gs_draw_sprite(texture, 0, width, height);
|
2014-08-07 23:42:07 -07:00
|
|
|
gs_technique_end_pass(tech);
|
2014-02-05 20:36:21 -07:00
|
|
|
}
|
2014-08-07 23:42:07 -07:00
|
|
|
gs_technique_end(tech);
|
2014-10-14 17:40:34 +02:00
|
|
|
gs_enable_blending(true);
|
2014-02-05 20:36:21 -07:00
|
|
|
|
|
|
|
video->textures_output[cur_texture] = true;
|
2015-07-11 08:04:46 +02:00
|
|
|
|
|
|
|
end:
|
|
|
|
profile_end(render_output_texture_name);
|
2014-02-05 20:36:21 -07:00
|
|
|
}
|
|
|
|
|
2014-09-25 17:44:05 -07:00
|
|
|
/*
 * Looks up the effect parameter called `name` and assigns `val` to it as a
 * float. The lookup result is passed straight to gs_effect_set_float
 * without being checked here.
 */
static inline void set_eparam(gs_effect_t *effect, const char *name, float val)
{
	gs_effect_set_float(gs_effect_get_param_by_name(effect, name), val);
}
|
|
|
|
|
2015-07-11 08:04:46 +02:00
|
|
|
static const char *render_convert_texture_name = "render_convert_texture";
|
2014-02-16 19:28:21 -07:00
|
|
|
static void render_convert_texture(struct obs_core_video *video,
|
2014-02-05 20:36:21 -07:00
|
|
|
int cur_texture, int prev_texture)
|
|
|
|
{
|
2015-07-11 08:04:46 +02:00
|
|
|
profile_start(render_convert_texture_name);
|
|
|
|
|
2014-09-25 17:44:05 -07:00
|
|
|
gs_texture_t *texture = video->output_textures[prev_texture];
|
|
|
|
gs_texture_t *target = video->convert_textures[cur_texture];
|
2014-08-07 23:42:07 -07:00
|
|
|
float fwidth = (float)video->output_width;
|
|
|
|
float fheight = (float)video->output_height;
|
|
|
|
size_t passes, i;
|
|
|
|
|
2014-09-25 17:44:05 -07:00
|
|
|
gs_effect_t *effect = video->conversion_effect;
|
|
|
|
gs_eparam_t *image = gs_effect_get_param_by_name(effect, "image");
|
|
|
|
gs_technique_t *tech = gs_effect_get_technique(effect,
|
2014-02-16 19:28:21 -07:00
|
|
|
video->conversion_tech);
|
2014-02-05 20:36:21 -07:00
|
|
|
|
|
|
|
if (!video->textures_output[prev_texture])
|
2015-07-11 08:04:46 +02:00
|
|
|
goto end;
|
2014-02-05 20:36:21 -07:00
|
|
|
|
2014-02-16 19:28:21 -07:00
|
|
|
set_eparam(effect, "u_plane_offset", (float)video->plane_offsets[1]);
|
|
|
|
set_eparam(effect, "v_plane_offset", (float)video->plane_offsets[2]);
|
|
|
|
set_eparam(effect, "width", fwidth);
|
|
|
|
set_eparam(effect, "height", fheight);
|
|
|
|
set_eparam(effect, "width_i", 1.0f / fwidth);
|
|
|
|
set_eparam(effect, "height_i", 1.0f / fheight);
|
|
|
|
set_eparam(effect, "width_d2", fwidth * 0.5f);
|
|
|
|
set_eparam(effect, "height_d2", fheight * 0.5f);
|
|
|
|
set_eparam(effect, "width_d2_i", 1.0f / (fwidth * 0.5f));
|
|
|
|
set_eparam(effect, "height_d2_i", 1.0f / (fheight * 0.5f));
|
|
|
|
set_eparam(effect, "input_height", (float)video->conversion_height);
|
|
|
|
|
2014-08-07 23:42:07 -07:00
|
|
|
gs_effect_set_texture(image, texture);
|
2014-02-16 19:28:21 -07:00
|
|
|
|
2014-08-07 23:42:07 -07:00
|
|
|
gs_set_render_target(target, NULL);
|
2014-02-16 19:28:21 -07:00
|
|
|
set_render_size(video->output_width, video->conversion_height);
|
|
|
|
|
2014-10-14 17:40:34 +02:00
|
|
|
gs_enable_blending(false);
|
2014-08-07 23:42:07 -07:00
|
|
|
passes = gs_technique_begin(tech);
|
2014-02-16 19:28:21 -07:00
|
|
|
for (i = 0; i < passes; i++) {
|
2014-08-07 23:42:07 -07:00
|
|
|
gs_technique_begin_pass(tech, i);
|
2014-02-16 19:28:21 -07:00
|
|
|
gs_draw_sprite(texture, 0, video->output_width,
|
|
|
|
video->conversion_height);
|
2014-08-07 23:42:07 -07:00
|
|
|
gs_technique_end_pass(tech);
|
2014-02-16 19:28:21 -07:00
|
|
|
}
|
2014-08-07 23:42:07 -07:00
|
|
|
gs_technique_end(tech);
|
2014-10-14 17:40:34 +02:00
|
|
|
gs_enable_blending(true);
|
2014-02-16 19:28:21 -07:00
|
|
|
|
|
|
|
video->textures_converted[cur_texture] = true;
|
2015-07-11 08:04:46 +02:00
|
|
|
|
|
|
|
end:
|
|
|
|
profile_end(render_convert_texture_name);
|
2014-02-16 19:28:21 -07:00
|
|
|
}
|
|
|
|
|
2015-07-11 08:04:46 +02:00
|
|
|
static const char *stage_output_texture_name = "stage_output_texture";
|
2014-02-16 19:28:21 -07:00
|
|
|
static inline void stage_output_texture(struct obs_core_video *video,
|
|
|
|
int cur_texture, int prev_texture)
|
|
|
|
{
|
2015-07-11 08:04:46 +02:00
|
|
|
profile_start(stage_output_texture_name);
|
|
|
|
|
2014-09-25 17:44:05 -07:00
|
|
|
gs_texture_t *texture;
|
2014-02-16 19:28:21 -07:00
|
|
|
bool texture_ready;
|
2014-09-25 17:44:05 -07:00
|
|
|
gs_stagesurf_t *copy = video->copy_surfaces[cur_texture];
|
2014-02-16 19:28:21 -07:00
|
|
|
|
|
|
|
if (video->gpu_conversion) {
|
|
|
|
texture = video->convert_textures[prev_texture];
|
|
|
|
texture_ready = video->textures_converted[prev_texture];
|
|
|
|
} else {
|
|
|
|
texture = video->output_textures[prev_texture];
|
2017-09-13 16:24:06 +02:00
|
|
|
texture_ready = video->textures_output[prev_texture];
|
2014-02-16 19:28:21 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
unmap_last_surface(video);
|
|
|
|
|
|
|
|
if (!texture_ready)
|
2015-07-11 08:04:46 +02:00
|
|
|
goto end;
|
2014-02-16 19:28:21 -07:00
|
|
|
|
2014-02-05 20:36:21 -07:00
|
|
|
gs_stage_texture(copy, texture);
|
|
|
|
|
|
|
|
video->textures_copied[cur_texture] = true;
|
2015-07-11 08:04:46 +02:00
|
|
|
|
|
|
|
end:
|
|
|
|
profile_end(stage_output_texture_name);
|
2014-02-05 20:36:21 -07:00
|
|
|
}
|
|
|
|
|
2014-02-05 21:03:06 -07:00
|
|
|
static inline void render_video(struct obs_core_video *video, int cur_texture,
|
libobs: Redesign/optimize frame encoding handling
Previously, the design for the interaction between the encoder thread
and the graphics thread was that the encoder thread would signal to the
graphics thread when to start drawing each frame. The original idea
behind this was to prevent mutually cascading stalls of encoding or
graphics rendering (i.e., if rendering took too long, then encoding
would have to catch up, then rendering would have to catch up again, and
so on, cascading upon each other). The ultimate goal was to prevent
encoding from impacting graphics and vise versa.
However, eventually it was realized that there were some fundamental
flaws with this design.
1. Stray frame duplication. You could not guarantee that a frame would
render on time, so sometimes frames would unintentionally be lost if
there was any sort of minor hiccup or if the thread took too long to
be scheduled I'm guessing.
2. Frame timing in the rendering thread was less accurate. The only
place where frame timing was accurate was in the encoder thread, and
the graphics thread was at the whim of thread scheduling. On higher
end computers it was typically fine, but it was just generally not
guaranteed that a frame would be rendered when it was supposed to be
rendered.
So the solution (originally proposed by r1ch and paibox) is to instead
keep the encoding and graphics threads separate as usual, but instead of
the encoder thread controlling the graphics thread, the graphics thread
now controls the encoder thread. The encoder thread keeps a limited
cache of frames, then the graphics thread copies frames in to the cache
and increments a semaphore to schedule the encoder thread to encode that
data.
In the cache, each frame has an encode counter. If the frame cache is
full (e.g., the encoder taking too long to return frames), it will not
cache a new frame, but instead will just increment the counter on the
last frame in the cache to schedule that frame to encode again, ensuring
that frames are on time and reducing CPU usage by lowering video
complexity. If the graphics thread takes too long to render a frame,
then it will add that frame with the count value set to the total amount
of frames that were missed (actual legitimately duplicated frames).
Because the cache gives many frames of breathing room for the encoder to
encode frames, this design helps improve results especially when using
encoding presets that have higher complexity and CPU usage, minimizing
the risk of needlessly skipped or duplicated frames.
I also managed to sneak in what should be a bit of an optimization to
reduce copying of frame data, though how much of an optimization it
ultimately ends up being is debatable.
So to sum it up, this commit increases accuracy of frame timing,
completely removes stray frame duplication, gives better results for
higher complexity encoding presets, and potentially optimizes the frame
pipeline a tiny bit.
2014-12-31 01:53:13 -08:00
|
|
|
int prev_texture)
|
2014-02-05 20:36:21 -07:00
|
|
|
{
|
2014-08-07 23:42:07 -07:00
|
|
|
gs_begin_scene();
|
2014-02-05 20:36:21 -07:00
|
|
|
|
2014-08-07 23:42:07 -07:00
|
|
|
gs_enable_depth_test(false);
|
|
|
|
gs_set_cull_mode(GS_NEITHER);
|
2014-02-05 20:36:21 -07:00
|
|
|
|
2014-02-14 15:13:36 -07:00
|
|
|
render_main_texture(video, cur_texture);
|
2014-02-05 20:36:21 -07:00
|
|
|
render_output_texture(video, cur_texture, prev_texture);
|
2014-02-16 19:28:21 -07:00
|
|
|
if (video->gpu_conversion)
|
|
|
|
render_convert_texture(video, cur_texture, prev_texture);
|
|
|
|
|
2014-02-05 20:36:21 -07:00
|
|
|
stage_output_texture(video, cur_texture, prev_texture);
|
|
|
|
|
2014-08-07 23:42:07 -07:00
|
|
|
gs_set_render_target(NULL, NULL);
|
2014-02-16 19:28:21 -07:00
|
|
|
gs_enable_blending(true);
|
2014-02-05 20:36:21 -07:00
|
|
|
|
2014-08-07 23:42:07 -07:00
|
|
|
gs_end_scene();
|
2014-02-05 20:36:21 -07:00
|
|
|
}
|
|
|
|
|
2014-02-14 15:13:36 -07:00
|
|
|
static inline bool download_frame(struct obs_core_video *video,
|
2014-02-18 13:37:56 -07:00
|
|
|
int prev_texture, struct video_data *frame)
|
2014-02-05 20:36:21 -07:00
|
|
|
{
|
2014-09-25 17:44:05 -07:00
|
|
|
gs_stagesurf_t *surface = video->copy_surfaces[prev_texture];
|
2014-02-05 20:36:21 -07:00
|
|
|
|
|
|
|
if (!video->textures_copied[prev_texture])
|
2014-02-09 05:51:06 -07:00
|
|
|
return false;
|
2014-02-05 20:36:21 -07:00
|
|
|
|
2014-08-07 23:42:07 -07:00
|
|
|
if (!gs_stagesurface_map(surface, &frame->data[0], &frame->linesize[0]))
|
2014-02-09 05:51:06 -07:00
|
|
|
return false;
|
|
|
|
|
|
|
|
video->mapped_surface = surface;
|
|
|
|
return true;
|
|
|
|
}
|
2014-02-05 20:36:21 -07:00
|
|
|
|
2014-02-16 19:28:21 -07:00
|
|
|
/*
 * Returns how far `pos` is into its line, where lines are `linesize` bytes
 * long.  A position that falls exactly on a line boundary is reported as a
 * full line (`linesize`) rather than zero.
 *
 * A `linesize` of zero yields zero instead of evaluating `pos % 0`, which is
 * undefined behavior.
 */
static inline uint32_t calc_linesize(uint32_t pos, uint32_t linesize)
{
	if (linesize == 0)
		return 0;

	uint32_t size = pos % linesize;
	return size ? size : linesize;
}
|
|
|
|
|
|
|
|
/*
 * Copies `remaining` bytes of row data from a source buffer whose rows are
 * `src_linesize` bytes apart into `dst`, writing contiguously from `dst_pos`.
 * Only the first `dst_linesize` bytes of each source row are copied; the rest
 * of each source row is skipped.
 *
 * dst/dst_pos     destination buffer and the byte offset to start writing at
 * dst_linesize    number of payload bytes per row
 * src/src_pos     source buffer and the byte offset to start reading at
 * src_linesize    distance in bytes between the starts of two source rows
 * remaining       total number of payload bytes to copy
 *
 * Nothing is copied when either line size is zero: `src_pos % 0` is undefined
 * behavior, and a zero-width destination row can never consume `remaining`.
 * If `src_pos` starts beyond the payload part of a row, reading resumes at the
 * beginning of the next row.
 */
static void copy_dealign(
		uint8_t *dst, uint32_t dst_pos, uint32_t dst_linesize,
		const uint8_t *src, uint32_t src_pos, uint32_t src_linesize,
		uint32_t remaining)
{
	if (dst_linesize == 0 || src_linesize == 0)
		return;

	while (remaining) {
		uint32_t src_remainder = src_pos % src_linesize;
		/* bytes from src_pos to the start of the next source row */
		uint32_t src_offset = src_linesize - src_remainder;
		/* payload bytes left in this source row; clamped to zero so
		 * the unsigned subtraction cannot wrap around */
		uint32_t dst_offset = src_remainder < dst_linesize ?
			dst_linesize - src_remainder : 0;

		if (remaining < dst_offset) {
			/* final, partial row */
			memcpy(dst + dst_pos, src + src_pos, remaining);
			break;
		}

		memcpy(dst + dst_pos, src + src_pos, dst_offset);
		src_pos += src_offset;
		dst_pos += dst_offset;
		remaining -= dst_offset;
	}
}
|
|
|
|
|
|
|
|
/*
 * Translates a byte offset expressed in rows of `dst_linesize` bytes into the
 * matching byte offset in a buffer whose rows are `src_linesize` bytes apart:
 * the row index is rescaled to the source stride and the position within the
 * row is kept.
 *
 * When `dst_linesize` is zero there is no row structure to translate, so
 * `offset` is returned unchanged instead of dividing by zero (undefined
 * behavior).
 */
static inline uint32_t make_aligned_linesize_offset(uint32_t offset,
		uint32_t dst_linesize, uint32_t src_linesize)
{
	if (dst_linesize == 0)
		return offset;

	uint32_t row = offset / dst_linesize;
	uint32_t column = offset % dst_linesize;
	return row * src_linesize + column;
}
|
|
|
|
|
|
|
|
static void fix_gpu_converted_alignment(struct obs_core_video *video,
|
libobs: Redesign/optimize frame encoding handling
Previously, the design for the interaction between the encoder thread
and the graphics thread was that the encoder thread would signal to the
graphics thread when to start drawing each frame. The original idea
behind this was to prevent mutually cascading stalls of encoding or
graphics rendering (i.e., if rendering took too long, then encoding
would have to catch up, then rendering would have to catch up again, and
so on, cascading upon each other). The ultimate goal was to prevent
encoding from impacting graphics and vise versa.
However, eventually it was realized that there were some fundamental
flaws with this design.
1. Stray frame duplication. You could not guarantee that a frame would
render on time, so sometimes frames would unintentionally be lost if
there was any sort of minor hiccup or if the thread took too long to
be scheduled I'm guessing.
2. Frame timing in the rendering thread was less accurate. The only
place where frame timing was accurate was in the encoder thread, and
the graphics thread was at the whim of thread scheduling. On higher
end computers it was typically fine, but it was just generally not
guaranteed that a frame would be rendered when it was supposed to be
rendered.
So the solution (originally proposed by r1ch and paibox) is to instead
keep the encoding and graphics threads separate as usual, but instead of
the encoder thread controlling the graphics thread, the graphics thread
now controls the encoder thread. The encoder thread keeps a limited
cache of frames, then the graphics thread copies frames in to the cache
and increments a semaphore to schedule the encoder thread to encode that
data.
In the cache, each frame has an encode counter. If the frame cache is
full (e.g., the encoder taking too long to return frames), it will not
cache a new frame, but instead will just increment the counter on the
last frame in the cache to schedule that frame to encode again, ensuring
that frames are on time and reducing CPU usage by lowering video
complexity. If the graphics thread takes too long to render a frame,
then it will add that frame with the count value set to the total amount
of frames that were missed (actual legitimately duplicated frames).
Because the cache gives many frames of breathing room for the encoder to
encode frames, this design helps improve results especially when using
encoding presets that have higher complexity and CPU usage, minimizing
the risk of needlessly skipped or duplicated frames.
I also managed to sneak in what should be a bit of an optimization to
reduce copying of frame data, though how much of an optimization it
ultimately ends up being is debatable.
So to sum it up, this commit increases accuracy of frame timing,
completely removes stray frame duplication, gives better results for
higher complexity encoding presets, and potentially optimizes the frame
pipeline a tiny bit.
2014-12-31 01:53:13 -08:00
|
|
|
struct video_frame *output, const struct video_data *input)
|
2014-02-16 19:28:21 -07:00
|
|
|
{
|
libobs: Redesign/optimize frame encoding handling
Previously, the design for the interaction between the encoder thread
and the graphics thread was that the encoder thread would signal to the
graphics thread when to start drawing each frame. The original idea
behind this was to prevent mutually cascading stalls of encoding or
graphics rendering (i.e., if rendering took too long, then encoding
would have to catch up, then rendering would have to catch up again, and
so on, cascading upon each other). The ultimate goal was to prevent
encoding from impacting graphics and vise versa.
However, eventually it was realized that there were some fundamental
flaws with this design.
1. Stray frame duplication. You could not guarantee that a frame would
render on time, so sometimes frames would unintentionally be lost if
there was any sort of minor hiccup or if the thread took too long to
be scheduled I'm guessing.
2. Frame timing in the rendering thread was less accurate. The only
place where frame timing was accurate was in the encoder thread, and
the graphics thread was at the whim of thread scheduling. On higher
end computers it was typically fine, but it was just generally not
guaranteed that a frame would be rendered when it was supposed to be
rendered.
So the solution (originally proposed by r1ch and paibox) is to instead
keep the encoding and graphics threads separate as usual, but instead of
the encoder thread controlling the graphics thread, the graphics thread
now controls the encoder thread. The encoder thread keeps a limited
cache of frames, then the graphics thread copies frames in to the cache
and increments a semaphore to schedule the encoder thread to encode that
data.
In the cache, each frame has an encode counter. If the frame cache is
full (e.g., the encoder taking too long to return frames), it will not
cache a new frame, but instead will just increment the counter on the
last frame in the cache to schedule that frame to encode again, ensuring
that frames are on time and reducing CPU usage by lowering video
complexity. If the graphics thread takes too long to render a frame,
then it will add that frame with the count value set to the total amount
of frames that were missed (actual legitimately duplicated frames).
Because the cache gives many frames of breathing room for the encoder to
encode frames, this design helps improve results especially when using
encoding presets that have higher complexity and CPU usage, minimizing
the risk of needlessly skipped or duplicated frames.
I also managed to sneak in what should be a bit of an optimization to
reduce copying of frame data, though how much of an optimization it
ultimately ends up being is debatable.
So to sum it up, this commit increases accuracy of frame timing,
completely removes stray frame duplication, gives better results for
higher complexity encoding presets, and potentially optimizes the frame
pipeline a tiny bit.
2014-12-31 01:53:13 -08:00
|
|
|
uint32_t src_linesize = input->linesize[0];
|
2015-01-14 14:57:27 -08:00
|
|
|
uint32_t dst_linesize = output->linesize[0] * 4;
|
2014-02-16 19:28:21 -07:00
|
|
|
uint32_t src_pos = 0;
|
|
|
|
|
|
|
|
for (size_t i = 0; i < 3; i++) {
|
|
|
|
if (video->plane_linewidth[i] == 0)
|
|
|
|
break;
|
|
|
|
|
|
|
|
src_pos = make_aligned_linesize_offset(video->plane_offsets[i],
|
|
|
|
dst_linesize, src_linesize);
|
|
|
|
|
libobs: Redesign/optimize frame encoding handling
Previously, the design for the interaction between the encoder thread
and the graphics thread was that the encoder thread would signal to the
graphics thread when to start drawing each frame. The original idea
behind this was to prevent mutually cascading stalls of encoding or
graphics rendering (i.e., if rendering took too long, then encoding
would have to catch up, then rendering would have to catch up again, and
so on, cascading upon each other). The ultimate goal was to prevent
encoding from impacting graphics and vise versa.
However, eventually it was realized that there were some fundamental
flaws with this design.
1. Stray frame duplication. You could not guarantee that a frame would
render on time, so sometimes frames would unintentionally be lost if
there was any sort of minor hiccup or if the thread took too long to
be scheduled I'm guessing.
2. Frame timing in the rendering thread was less accurate. The only
place where frame timing was accurate was in the encoder thread, and
the graphics thread was at the whim of thread scheduling. On higher
end computers it was typically fine, but it was just generally not
guaranteed that a frame would be rendered when it was supposed to be
rendered.
So the solution (originally proposed by r1ch and paibox) is to instead
keep the encoding and graphics threads separate as usual, but instead of
the encoder thread controlling the graphics thread, the graphics thread
now controls the encoder thread. The encoder thread keeps a limited
cache of frames, then the graphics thread copies frames in to the cache
and increments a semaphore to schedule the encoder thread to encode that
data.
In the cache, each frame has an encode counter. If the frame cache is
full (e.g., the encoder taking too long to return frames), it will not
cache a new frame, but instead will just increment the counter on the
last frame in the cache to schedule that frame to encode again, ensuring
that frames are on time and reducing CPU usage by lowering video
complexity. If the graphics thread takes too long to render a frame,
then it will add that frame with the count value set to the total amount
of frames that were missed (actual legitimately duplicated frames).
Because the cache gives many frames of breathing room for the encoder to
encode frames, this design helps improve results especially when using
encoding presets that have higher complexity and CPU usage, minimizing
the risk of needlessly skipped or duplicated frames.
I also managed to sneak in what should be a bit of an optimization to
reduce copying of frame data, though how much of an optimization it
ultimately ends up being is debatable.
So to sum it up, this commit increases accuracy of frame timing,
completely removes stray frame duplication, gives better results for
higher complexity encoding presets, and potentially optimizes the frame
pipeline a tiny bit.
2014-12-31 01:53:13 -08:00
|
|
|
copy_dealign(output->data[i], 0, dst_linesize,
|
|
|
|
input->data[0], src_pos, src_linesize,
|
2014-02-16 19:28:21 -07:00
|
|
|
video->plane_sizes[i]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
libobs: Redesign/optimize frame encoding handling
Previously, the design for the interaction between the encoder thread
and the graphics thread was that the encoder thread would signal to the
graphics thread when to start drawing each frame. The original idea
behind this was to prevent mutually cascading stalls of encoding or
graphics rendering (i.e., if rendering took too long, then encoding
would have to catch up, then rendering would have to catch up again, and
so on, cascading upon each other). The ultimate goal was to prevent
encoding from impacting graphics and vice versa.
However, eventually it was realized that there were some fundamental
flaws with this design.
1. Stray frame duplication. You could not guarantee that a frame would
render on time, so sometimes frames would unintentionally be lost if
there was any sort of minor hiccup or if the thread took too long to
be scheduled I'm guessing.
2. Frame timing in the rendering thread was less accurate. The only
place where frame timing was accurate was in the encoder thread, and
the graphics thread was at the whim of thread scheduling. On higher
end computers it was typically fine, but it was just generally not
guaranteed that a frame would be rendered when it was supposed to be
rendered.
So the solution (originally proposed by r1ch and paibox) is to instead
keep the encoding and graphics threads separate as usual, but instead of
the encoder thread controlling the graphics thread, the graphics thread
now controls the encoder thread. The encoder thread keeps a limited
cache of frames, then the graphics thread copies frames in to the cache
and increments a semaphore to schedule the encoder thread to encode that
data.
In the cache, each frame has an encode counter. If the frame cache is
full (e.g., the encoder taking too long to return frames), it will not
cache a new frame, but instead will just increment the counter on the
last frame in the cache to schedule that frame to encode again, ensuring
that frames are on time and reducing CPU usage by lowering video
complexity. If the graphics thread takes too long to render a frame,
then it will add that frame with the count value set to the total amount
of frames that were missed (actual legitimately duplicated frames).
Because the cache gives many frames of breathing room for the encoder to
encode frames, this design helps improve results especially when using
encoding presets that have higher complexity and CPU usage, minimizing
the risk of needlessly skipped or duplicated frames.
I also managed to sneak in what should be a bit of an optimization to
reduce copying of frame data, though how much of an optimization it
ultimately ends up being is debatable.
So to sum it up, this commit increases accuracy of frame timing,
completely removes stray frame duplication, gives better results for
higher complexity encoding presets, and potentially optimizes the frame
pipeline a tiny bit.
2014-12-31 01:53:13 -08:00
|
|
|
static void set_gpu_converted_data(struct obs_core_video *video,
|
|
|
|
struct video_frame *output, const struct video_data *input,
|
|
|
|
const struct video_output_info *info)
|
2014-02-16 19:28:21 -07:00
|
|
|
{
|
libobs: Redesign/optimize frame encoding handling
Previously, the design for the interaction between the encoder thread
and the graphics thread was that the encoder thread would signal to the
graphics thread when to start drawing each frame. The original idea
behind this was to prevent mutually cascading stalls of encoding or
graphics rendering (i.e., if rendering took too long, then encoding
would have to catch up, then rendering would have to catch up again, and
so on, cascading upon each other). The ultimate goal was to prevent
encoding from impacting graphics and vise versa.
However, eventually it was realized that there were some fundamental
flaws with this design.
1. Stray frame duplication. You could not guarantee that a frame would
render on time, so sometimes frames would unintentionally be lost if
there was any sort of minor hiccup or if the thread took too long to
be scheduled I'm guessing.
2. Frame timing in the rendering thread was less accurate. The only
place where frame timing was accurate was in the encoder thread, and
the graphics thread was at the whim of thread scheduling. On higher
end computers it was typically fine, but it was just generally not
guaranteed that a frame would be rendered when it was supposed to be
rendered.
So the solution (originally proposed by r1ch and paibox) is to instead
keep the encoding and graphics threads separate as usual, but instead of
the encoder thread controlling the graphics thread, the graphics thread
now controls the encoder thread. The encoder thread keeps a limited
cache of frames, then the graphics thread copies frames in to the cache
and increments a semaphore to schedule the encoder thread to encode that
data.
In the cache, each frame has an encode counter. If the frame cache is
full (e.g., the encoder taking too long to return frames), it will not
cache a new frame, but instead will just increment the counter on the
last frame in the cache to schedule that frame to encode again, ensuring
that frames are on time and reducing CPU usage by lowering video
complexity. If the graphics thread takes too long to render a frame,
then it will add that frame with the count value set to the total amount
of frames that were missed (actual legitimately duplicated frames).
Because the cache gives many frames of breathing room for the encoder to
encode frames, this design helps improve results especially when using
encoding presets that have higher complexity and CPU usage, minimizing
the risk of needlessly skipped or duplicated frames.
I also managed to sneak in what should be a bit of an optimization to
reduce copying of frame data, though how much of an optimization it
ultimately ends up being is debatable.
So to sum it up, this commit increases accuracy of frame timing,
completely removes stray frame duplication, gives better results for
higher complexity encoding presets, and potentially optimizes the frame
pipeline a tiny bit.
2014-12-31 01:53:13 -08:00
|
|
|
if (input->linesize[0] == video->output_width*4) {
|
|
|
|
struct video_frame frame;
|
|
|
|
|
2014-02-16 19:28:21 -07:00
|
|
|
for (size_t i = 0; i < 3; i++) {
|
|
|
|
if (video->plane_linewidth[i] == 0)
|
|
|
|
break;
|
|
|
|
|
libobs: Redesign/optimize frame encoding handling
Previously, the design for the interaction between the encoder thread
and the graphics thread was that the encoder thread would signal to the
graphics thread when to start drawing each frame. The original idea
behind this was to prevent mutually cascading stalls of encoding or
graphics rendering (i.e., if rendering took too long, then encoding
would have to catch up, then rendering would have to catch up again, and
so on, cascading upon each other). The ultimate goal was to prevent
encoding from impacting graphics and vise versa.
However, eventually it was realized that there were some fundamental
flaws with this design.
1. Stray frame duplication. You could not guarantee that a frame would
render on time, so sometimes frames would unintentionally be lost if
there was any sort of minor hiccup or if the thread took too long to
be scheduled I'm guessing.
2. Frame timing in the rendering thread was less accurate. The only
place where frame timing was accurate was in the encoder thread, and
the graphics thread was at the whim of thread scheduling. On higher
end computers it was typically fine, but it was just generally not
guaranteed that a frame would be rendered when it was supposed to be
rendered.
So the solution (originally proposed by r1ch and paibox) is to instead
keep the encoding and graphics threads separate as usual, but instead of
the encoder thread controlling the graphics thread, the graphics thread
now controls the encoder thread. The encoder thread keeps a limited
cache of frames, then the graphics thread copies frames in to the cache
and increments a semaphore to schedule the encoder thread to encode that
data.
In the cache, each frame has an encode counter. If the frame cache is
full (e.g., the encoder taking too long to return frames), it will not
cache a new frame, but instead will just increment the counter on the
last frame in the cache to schedule that frame to encode again, ensuring
that frames are on time and reducing CPU usage by lowering video
complexity. If the graphics thread takes too long to render a frame,
then it will add that frame with the count value set to the total amount
of frames that were missed (actual legitimately duplicated frames).
Because the cache gives many frames of breathing room for the encoder to
encode frames, this design helps improve results especially when using
encoding presets that have higher complexity and CPU usage, minimizing
the risk of needlessly skipped or duplicated frames.
I also managed to sneak in what should be a bit of an optimization to
reduce copying of frame data, though how much of an optimization it
ultimately ends up being is debatable.
So to sum it up, this commit increases accuracy of frame timing,
completely removes stray frame duplication, gives better results for
higher complexity encoding presets, and potentially optimizes the frame
pipeline a tiny bit.
2014-12-31 01:53:13 -08:00
|
|
|
frame.linesize[i] = video->plane_linewidth[i];
|
|
|
|
frame.data[i] =
|
|
|
|
input->data[0] + video->plane_offsets[i];
|
2014-02-16 19:28:21 -07:00
|
|
|
}
|
|
|
|
|
libobs: Redesign/optimize frame encoding handling
Previously, the design for the interaction between the encoder thread
and the graphics thread was that the encoder thread would signal to the
graphics thread when to start drawing each frame. The original idea
behind this was to prevent mutually cascading stalls of encoding or
graphics rendering (i.e., if rendering took too long, then encoding
would have to catch up, then rendering would have to catch up again, and
so on, cascading upon each other). The ultimate goal was to prevent
encoding from impacting graphics and vise versa.
However, eventually it was realized that there were some fundamental
flaws with this design.
1. Stray frame duplication. You could not guarantee that a frame would
render on time, so sometimes frames would unintentionally be lost if
there was any sort of minor hiccup or if the thread took too long to
be scheduled I'm guessing.
2. Frame timing in the rendering thread was less accurate. The only
place where frame timing was accurate was in the encoder thread, and
the graphics thread was at the whim of thread scheduling. On higher
end computers it was typically fine, but it was just generally not
guaranteed that a frame would be rendered when it was supposed to be
rendered.
So the solution (originally proposed by r1ch and paibox) is to instead
keep the encoding and graphics threads separate as usual, but instead of
the encoder thread controlling the graphics thread, the graphics thread
now controls the encoder thread. The encoder thread keeps a limited
cache of frames, then the graphics thread copies frames in to the cache
and increments a semaphore to schedule the encoder thread to encode that
data.
In the cache, each frame has an encode counter. If the frame cache is
full (e.g., the encoder taking too long to return frames), it will not
cache a new frame, but instead will just increment the counter on the
last frame in the cache to schedule that frame to encode again, ensuring
that frames are on time and reducing CPU usage by lowering video
complexity. If the graphics thread takes too long to render a frame,
then it will add that frame with the count value set to the total amount
of frames that were missed (actual legitimately duplicated frames).
Because the cache gives many frames of breathing room for the encoder to
encode frames, this design helps improve results especially when using
encoding presets that have higher complexity and CPU usage, minimizing
the risk of needlessly skipped or duplicated frames.
I also managed to sneak in what should be a bit of an optimization to
reduce copying of frame data, though how much of an optimization it
ultimately ends up being is debatable.
So to sum it up, this commit increases accuracy of frame timing,
completely removes stray frame duplication, gives better results for
higher complexity encoding presets, and potentially optimizes the frame
pipeline a tiny bit.
2014-12-31 01:53:13 -08:00
|
|
|
video_frame_copy(output, &frame, info->format, info->height);
|
|
|
|
|
2014-02-16 19:28:21 -07:00
|
|
|
} else {
|
libobs: Redesign/optimize frame encoding handling
Previously, the design for the interaction between the encoder thread
and the graphics thread was that the encoder thread would signal to the
graphics thread when to start drawing each frame. The original idea
behind this was to prevent mutually cascading stalls of encoding or
graphics rendering (i.e., if rendering took too long, then encoding
would have to catch up, then rendering would have to catch up again, and
so on, cascading upon each other). The ultimate goal was to prevent
encoding from impacting graphics and vise versa.
However, eventually it was realized that there were some fundamental
flaws with this design.
1. Stray frame duplication. You could not guarantee that a frame would
render on time, so sometimes frames would unintentionally be lost if
there was any sort of minor hiccup or if the thread took too long to
be scheduled I'm guessing.
2. Frame timing in the rendering thread was less accurate. The only
place where frame timing was accurate was in the encoder thread, and
the graphics thread was at the whim of thread scheduling. On higher
end computers it was typically fine, but it was just generally not
guaranteed that a frame would be rendered when it was supposed to be
rendered.
So the solution (originally proposed by r1ch and paibox) is to instead
keep the encoding and graphics threads separate as usual, but instead of
the encoder thread controlling the graphics thread, the graphics thread
now controls the encoder thread. The encoder thread keeps a limited
cache of frames, then the graphics thread copies frames in to the cache
and increments a semaphore to schedule the encoder thread to encode that
data.
In the cache, each frame has an encode counter. If the frame cache is
full (e.g., the encoder taking too long to return frames), it will not
cache a new frame, but instead will just increment the counter on the
last frame in the cache to schedule that frame to encode again, ensuring
that frames are on time and reducing CPU usage by lowering video
complexity. If the graphics thread takes too long to render a frame,
then it will add that frame with the count value set to the total amount
of frames that were missed (actual legitimately duplicated frames).
Because the cache gives many frames of breathing room for the encoder to
encode frames, this design helps improve results especially when using
encoding presets that have higher complexity and CPU usage, minimizing
the risk of needlessly skipped or duplicated frames.
I also managed to sneak in what should be a bit of an optimization to
reduce copying of frame data, though how much of an optimization it
ultimately ends up being is debatable.
So to sum it up, this commit increases accuracy of frame timing,
completely removes stray frame duplication, gives better results for
higher complexity encoding presets, and potentially optimizes the frame
pipeline a tiny bit.
2014-12-31 01:53:13 -08:00
|
|
|
fix_gpu_converted_alignment(video, output, input);
|
2014-02-16 19:28:21 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
libobs: Redesign/optimize frame encoding handling
Previously, the design for the interaction between the encoder thread
and the graphics thread was that the encoder thread would signal to the
graphics thread when to start drawing each frame. The original idea
behind this was to prevent mutually cascading stalls of encoding or
graphics rendering (i.e., if rendering took too long, then encoding
would have to catch up, then rendering would have to catch up again, and
so on, cascading upon each other). The ultimate goal was to prevent
encoding from impacting graphics and vice versa.
However, eventually it was realized that there were some fundamental
flaws with this design.
1. Stray frame duplication. You could not guarantee that a frame would
render on time, so sometimes frames would unintentionally be lost if
there was any sort of minor hiccup or if the thread took too long to
be scheduled I'm guessing.
2. Frame timing in the rendering thread was less accurate. The only
place where frame timing was accurate was in the encoder thread, and
the graphics thread was at the whim of thread scheduling. On higher
end computers it was typically fine, but it was just generally not
guaranteed that a frame would be rendered when it was supposed to be
rendered.
So the solution (originally proposed by r1ch and paibox) is to instead
keep the encoding and graphics threads separate as usual, but instead of
the encoder thread controlling the graphics thread, the graphics thread
now controls the encoder thread. The encoder thread keeps a limited
cache of frames, then the graphics thread copies frames in to the cache
and increments a semaphore to schedule the encoder thread to encode that
data.
In the cache, each frame has an encode counter. If the frame cache is
full (e.g., the encoder taking too long to return frames), it will not
cache a new frame, but instead will just increment the counter on the
last frame in the cache to schedule that frame to encode again, ensuring
that frames are on time and reducing CPU usage by lowering video
complexity. If the graphics thread takes too long to render a frame,
then it will add that frame with the count value set to the total amount
of frames that were missed (actual legitimately duplicated frames).
Because the cache gives many frames of breathing room for the encoder to
encode frames, this design helps improve results especially when using
encoding presets that have higher complexity and CPU usage, minimizing
the risk of needlessly skipped or duplicated frames.
I also managed to sneak in what should be a bit of an optimization to
reduce copying of frame data, though how much of an optimization it
ultimately ends up being is debatable.
So to sum it up, this commit increases accuracy of frame timing,
completely removes stray frame duplication, gives better results for
higher complexity encoding presets, and potentially optimizes the frame
pipeline a tiny bit.
2014-12-31 01:53:13 -08:00
|
|
|
static void convert_frame(
|
|
|
|
struct video_frame *output, const struct video_data *input,
|
|
|
|
const struct video_output_info *info)
|
2014-02-09 05:51:06 -07:00
|
|
|
{
|
|
|
|
if (info->format == VIDEO_FORMAT_I420) {
|
|
|
|
compress_uyvx_to_i420(
|
libobs: Redesign/optimize frame encoding handling
Previously, the design for the interaction between the encoder thread
and the graphics thread was that the encoder thread would signal to the
graphics thread when to start drawing each frame. The original idea
behind this was to prevent mutually cascading stalls of encoding or
graphics rendering (i.e., if rendering took too long, then encoding
would have to catch up, then rendering would have to catch up again, and
so on, cascading upon each other). The ultimate goal was to prevent
encoding from impacting graphics and vise versa.
However, eventually it was realized that there were some fundamental
flaws with this design.
1. Stray frame duplication. You could not guarantee that a frame would
render on time, so sometimes frames would unintentionally be lost if
there was any sort of minor hiccup or if the thread took too long to
be scheduled I'm guessing.
2. Frame timing in the rendering thread was less accurate. The only
place where frame timing was accurate was in the encoder thread, and
the graphics thread was at the whim of thread scheduling. On higher
end computers it was typically fine, but it was just generally not
guaranteed that a frame would be rendered when it was supposed to be
rendered.
So the solution (originally proposed by r1ch and paibox) is to instead
keep the encoding and graphics threads separate as usual, but instead of
the encoder thread controlling the graphics thread, the graphics thread
now controls the encoder thread. The encoder thread keeps a limited
cache of frames, then the graphics thread copies frames in to the cache
and increments a semaphore to schedule the encoder thread to encode that
data.
In the cache, each frame has an encode counter. If the frame cache is
full (e.g., the encoder taking too long to return frames), it will not
cache a new frame, but instead will just increment the counter on the
last frame in the cache to schedule that frame to encode again, ensuring
that frames are on time and reducing CPU usage by lowering video
complexity. If the graphics thread takes too long to render a frame,
then it will add that frame with the count value set to the total amount
of frames that were missed (actual legitimately duplicated frames).
Because the cache gives many frames of breathing room for the encoder to
encode frames, this design helps improve results especially when using
encoding presets that have higher complexity and CPU usage, minimizing
the risk of needlessly skipped or duplicated frames.
I also managed to sneak in what should be a bit of an optimization to
reduce copying of frame data, though how much of an optimization it
ultimately ends up being is debatable.
So to sum it up, this commit increases accuracy of frame timing,
completely removes stray frame duplication, gives better results for
higher complexity encoding presets, and potentially optimizes the frame
pipeline a tiny bit.
2014-12-31 01:53:13 -08:00
|
|
|
input->data[0], input->linesize[0],
|
2014-02-09 05:51:06 -07:00
|
|
|
0, info->height,
|
libobs: Redesign/optimize frame encoding handling
Previously, the design for the interaction between the encoder thread
and the graphics thread was that the encoder thread would signal to the
graphics thread when to start drawing each frame. The original idea
behind this was to prevent mutually cascading stalls of encoding or
graphics rendering (i.e., if rendering took too long, then encoding
would have to catch up, then rendering would have to catch up again, and
so on, cascading upon each other). The ultimate goal was to prevent
encoding from impacting graphics and vise versa.
However, eventually it was realized that there were some fundamental
flaws with this design.
1. Stray frame duplication. You could not guarantee that a frame would
render on time, so sometimes frames would unintentionally be lost if
there was any sort of minor hiccup or if the thread took too long to
be scheduled I'm guessing.
2. Frame timing in the rendering thread was less accurate. The only
place where frame timing was accurate was in the encoder thread, and
the graphics thread was at the whim of thread scheduling. On higher
end computers it was typically fine, but it was just generally not
guaranteed that a frame would be rendered when it was supposed to be
rendered.
So the solution (originally proposed by r1ch and paibox) is to instead
keep the encoding and graphics threads separate as usual, but instead of
the encoder thread controlling the graphics thread, the graphics thread
now controls the encoder thread. The encoder thread keeps a limited
cache of frames, then the graphics thread copies frames in to the cache
and increments a semaphore to schedule the encoder thread to encode that
data.
In the cache, each frame has an encode counter. If the frame cache is
full (e.g., the encoder taking too long to return frames), it will not
cache a new frame, but instead will just increment the counter on the
last frame in the cache to schedule that frame to encode again, ensuring
that frames are on time and reducing CPU usage by lowering video
complexity. If the graphics thread takes too long to render a frame,
then it will add that frame with the count value set to the total amount
of frames that were missed (actual legitimately duplicated frames).
Because the cache gives many frames of breathing room for the encoder to
encode frames, this design helps improve results especially when using
encoding presets that have higher complexity and CPU usage, minimizing
the risk of needlessly skipped or duplicated frames.
I also managed to sneak in what should be a bit of an optimization to
reduce copying of frame data, though how much of an optimization it
ultimately ends up being is debatable.
So to sum it up, this commit increases accuracy of frame timing,
completely removes stray frame duplication, gives better results for
higher complexity encoding presets, and potentially optimizes the frame
pipeline a tiny bit.
2014-12-31 01:53:13 -08:00
|
|
|
output->data, output->linesize);
|
2014-02-09 05:51:06 -07:00
|
|
|
|
|
|
|
} else if (info->format == VIDEO_FORMAT_NV12) {
|
|
|
|
compress_uyvx_to_nv12(
|
libobs: Redesign/optimize frame encoding handling
Previously, the design for the interaction between the encoder thread
and the graphics thread was that the encoder thread would signal to the
graphics thread when to start drawing each frame. The original idea
behind this was to prevent mutually cascading stalls of encoding or
graphics rendering (i.e., if rendering took too long, then encoding
would have to catch up, then rendering would have to catch up again, and
so on, cascading upon each other). The ultimate goal was to prevent
encoding from impacting graphics and vise versa.
However, eventually it was realized that there were some fundamental
flaws with this design.
1. Stray frame duplication. You could not guarantee that a frame would
render on time, so sometimes frames would unintentionally be lost if
there was any sort of minor hiccup or if the thread took too long to
be scheduled I'm guessing.
2. Frame timing in the rendering thread was less accurate. The only
place where frame timing was accurate was in the encoder thread, and
the graphics thread was at the whim of thread scheduling. On higher
end computers it was typically fine, but it was just generally not
guaranteed that a frame would be rendered when it was supposed to be
rendered.
So the solution (originally proposed by r1ch and paibox) is to instead
keep the encoding and graphics threads separate as usual, but instead of
the encoder thread controlling the graphics thread, the graphics thread
now controls the encoder thread. The encoder thread keeps a limited
cache of frames, then the graphics thread copies frames in to the cache
and increments a semaphore to schedule the encoder thread to encode that
data.
In the cache, each frame has an encode counter. If the frame cache is
full (e.g., the encoder taking too long to return frames), it will not
cache a new frame, but instead will just increment the counter on the
last frame in the cache to schedule that frame to encode again, ensuring
that frames are on time and reducing CPU usage by lowering video
complexity. If the graphics thread takes too long to render a frame,
then it will add that frame with the count value set to the total amount
of frames that were missed (actual legitimately duplicated frames).
Because the cache gives many frames of breathing room for the encoder to
encode frames, this design helps improve results especially when using
encoding presets that have higher complexity and CPU usage, minimizing
the risk of needlessly skipped or duplicated frames.
I also managed to sneak in what should be a bit of an optimization to
reduce copying of frame data, though how much of an optimization it
ultimately ends up being is debatable.
So to sum it up, this commit increases accuracy of frame timing,
completely removes stray frame duplication, gives better results for
higher complexity encoding presets, and potentially optimizes the frame
pipeline a tiny bit.
2014-12-31 01:53:13 -08:00
|
|
|
input->data[0], input->linesize[0],
|
2015-04-16 22:52:44 -07:00
|
|
|
0, info->height,
|
|
|
|
output->data, output->linesize);
|
|
|
|
|
|
|
|
} else if (info->format == VIDEO_FORMAT_I444) {
|
|
|
|
convert_uyvx_to_i444(
|
|
|
|
input->data[0], input->linesize[0],
|
2014-02-09 05:51:06 -07:00
|
|
|
0, info->height,
|
libobs: Redesign/optimize frame encoding handling
Previously, the design for the interaction between the encoder thread
and the graphics thread was that the encoder thread would signal to the
graphics thread when to start drawing each frame. The original idea
behind this was to prevent mutually cascading stalls of encoding or
graphics rendering (i.e., if rendering took too long, then encoding
would have to catch up, then rendering would have to catch up again, and
so on, cascading upon each other). The ultimate goal was to prevent
encoding from impacting graphics and vise versa.
However, eventually it was realized that there were some fundamental
flaws with this design.
1. Stray frame duplication. You could not guarantee that a frame would
render on time, so sometimes frames would unintentionally be lost if
there was any sort of minor hiccup or if the thread took too long to
be scheduled I'm guessing.
2. Frame timing in the rendering thread was less accurate. The only
place where frame timing was accurate was in the encoder thread, and
the graphics thread was at the whim of thread scheduling. On higher
end computers it was typically fine, but it was just generally not
guaranteed that a frame would be rendered when it was supposed to be
rendered.
So the solution (originally proposed by r1ch and paibox) is to instead
keep the encoding and graphics threads separate as usual, but instead of
the encoder thread controlling the graphics thread, the graphics thread
now controls the encoder thread. The encoder thread keeps a limited
cache of frames, then the graphics thread copies frames in to the cache
and increments a semaphore to schedule the encoder thread to encode that
data.
In the cache, each frame has an encode counter. If the frame cache is
full (e.g., the encoder taking too long to return frames), it will not
cache a new frame, but instead will just increment the counter on the
last frame in the cache to schedule that frame to encode again, ensuring
that frames are on time and reducing CPU usage by lowering video
complexity. If the graphics thread takes too long to render a frame,
then it will add that frame with the count value set to the total amount
of frames that were missed (actual legitimately duplicated frames).
Because the cache gives many frames of breathing room for the encoder to
encode frames, this design helps improve results especially when using
encoding presets that have higher complexity and CPU usage, minimizing
the risk of needlessly skipped or duplicated frames.
I also managed to sneak in what should be a bit of an optimization to
reduce copying of frame data, though how much of an optimization it
ultimately ends up being is debatable.
So to sum it up, this commit increases accuracy of frame timing,
completely removes stray frame duplication, gives better results for
higher complexity encoding presets, and potentially optimizes the frame
pipeline a tiny bit.
2014-12-31 01:53:13 -08:00
|
|
|
output->data, output->linesize);
|
2014-02-09 05:51:06 -07:00
|
|
|
|
|
|
|
} else {
|
2014-02-28 20:02:29 -07:00
|
|
|
blog(LOG_ERROR, "convert_frame: unsupported texture format");
|
2014-02-09 05:51:06 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-04-15 18:41:09 -07:00
|
|
|
static inline void copy_rgbx_frame(
|
|
|
|
struct video_frame *output, const struct video_data *input,
|
|
|
|
const struct video_output_info *info)
|
|
|
|
{
|
|
|
|
uint8_t *in_ptr = input->data[0];
|
|
|
|
uint8_t *out_ptr = output->data[0];
|
|
|
|
|
2015-07-21 17:58:37 +02:00
|
|
|
/* if the line sizes match, do a single copy */
|
|
|
|
if (input->linesize[0] == output->linesize[0]) {
|
|
|
|
memcpy(out_ptr, in_ptr, input->linesize[0] * info->height);
|
|
|
|
} else {
|
|
|
|
for (size_t y = 0; y < info->height; y++) {
|
|
|
|
memcpy(out_ptr, in_ptr, info->width * 4);
|
|
|
|
in_ptr += input->linesize[0];
|
|
|
|
out_ptr += output->linesize[0];
|
|
|
|
}
|
2015-04-15 18:41:09 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-02-09 05:51:06 -07:00
|
|
|
static inline void output_video_data(struct obs_core_video *video,
|
libobs: Redesign/optimize frame encoding handling
Previously, the design for the interaction between the encoder thread
and the graphics thread was that the encoder thread would signal to the
graphics thread when to start drawing each frame. The original idea
behind this was to prevent mutually cascading stalls of encoding or
graphics rendering (i.e., if rendering took too long, then encoding
would have to catch up, then rendering would have to catch up again, and
so on, cascading upon each other). The ultimate goal was to prevent
encoding from impacting graphics and vise versa.
However, eventually it was realized that there were some fundamental
flaws with this design.
1. Stray frame duplication. You could not guarantee that a frame would
render on time, so sometimes frames would unintentionally be lost if
there was any sort of minor hiccup or if the thread took too long to
be scheduled I'm guessing.
2. Frame timing in the rendering thread was less accurate. The only
place where frame timing was accurate was in the encoder thread, and
the graphics thread was at the whim of thread scheduling. On higher
end computers it was typically fine, but it was just generally not
guaranteed that a frame would be rendered when it was supposed to be
rendered.
So the solution (originally proposed by r1ch and paibox) is to instead
keep the encoding and graphics threads separate as usual, but instead of
the encoder thread controlling the graphics thread, the graphics thread
now controls the encoder thread. The encoder thread keeps a limited
cache of frames, then the graphics thread copies frames in to the cache
and increments a semaphore to schedule the encoder thread to encode that
data.
In the cache, each frame has an encode counter. If the frame cache is
full (e.g., the encoder taking too long to return frames), it will not
cache a new frame, but instead will just increment the counter on the
last frame in the cache to schedule that frame to encode again, ensuring
that frames are on time and reducing CPU usage by lowering video
complexity. If the graphics thread takes too long to render a frame,
then it will add that frame with the count value set to the total amount
of frames that were missed (actual legitimately duplicated frames).
Because the cache gives many frames of breathing room for the encoder to
encode frames, this design helps improve results especially when using
encoding presets that have higher complexity and CPU usage, minimizing
the risk of needlessly skipped or duplicated frames.
I also managed to sneak in what should be a bit of an optimization to
reduce copying of frame data, though how much of an optimization it
ultimately ends up being is debatable.
So to sum it up, this commit increases accuracy of frame timing,
completely removes stray frame duplication, gives better results for
higher complexity encoding presets, and potentially optimizes the frame
pipeline a tiny bit.
2014-12-31 01:53:13 -08:00
|
|
|
struct video_data *input_frame, int count)
|
2014-02-09 05:51:06 -07:00
|
|
|
{
|
|
|
|
const struct video_output_info *info;
|
libobs: Redesign/optimize frame encoding handling
Previously, the design for the interaction between the encoder thread
and the graphics thread was that the encoder thread would signal to the
graphics thread when to start drawing each frame. The original idea
behind this was to prevent mutually cascading stalls of encoding or
graphics rendering (i.e., if rendering took too long, then encoding
would have to catch up, then rendering would have to catch up again, and
so on, cascading upon each other). The ultimate goal was to prevent
encoding from impacting graphics and vise versa.
However, eventually it was realized that there were some fundamental
flaws with this design.
1. Stray frame duplication. You could not guarantee that a frame would
render on time, so sometimes frames would unintentionally be lost if
there was any sort of minor hiccup or if the thread took too long to
be scheduled I'm guessing.
2. Frame timing in the rendering thread was less accurate. The only
place where frame timing was accurate was in the encoder thread, and
the graphics thread was at the whim of thread scheduling. On higher
end computers it was typically fine, but it was just generally not
guaranteed that a frame would be rendered when it was supposed to be
rendered.
So the solution (originally proposed by r1ch and paibox) is to instead
keep the encoding and graphics threads separate as usual, but instead of
the encoder thread controlling the graphics thread, the graphics thread
now controls the encoder thread. The encoder thread keeps a limited
cache of frames, then the graphics thread copies frames in to the cache
and increments a semaphore to schedule the encoder thread to encode that
data.
In the cache, each frame has an encode counter. If the frame cache is
full (e.g., the encoder taking too long to return frames), it will not
cache a new frame, but instead will just increment the counter on the
last frame in the cache to schedule that frame to encode again, ensuring
that frames are on time and reducing CPU usage by lowering video
complexity. If the graphics thread takes too long to render a frame,
then it will add that frame with the count value set to the total amount
of frames that were missed (actual legitimately duplicated frames).
Because the cache gives many frames of breathing room for the encoder to
encode frames, this design helps improve results especially when using
encoding presets that have higher complexity and CPU usage, minimizing
the risk of needlessly skipped or duplicated frames.
I also managed to sneak in what should be a bit of an optimization to
reduce copying of frame data, though how much of an optimization it
ultimately ends up being is debatable.
So to sum it up, this commit increases accuracy of frame timing,
completely removes stray frame duplication, gives better results for
higher complexity encoding presets, and potentially optimizes the frame
pipeline a tiny bit.
2014-12-31 01:53:13 -08:00
|
|
|
struct video_frame output_frame;
|
|
|
|
bool locked;
|
|
|
|
|
2014-08-05 15:07:54 -07:00
|
|
|
info = video_output_get_info(video->video);
|
2014-02-09 05:51:06 -07:00
|
|
|
|
libobs: Redesign/optimize frame encoding handling
Previously, the design for the interaction between the encoder thread
and the graphics thread was that the encoder thread would signal to the
graphics thread when to start drawing each frame. The original idea
behind this was to prevent mutually cascading stalls of encoding or
graphics rendering (i.e., if rendering took too long, then encoding
would have to catch up, then rendering would have to catch up again, and
so on, cascading upon each other). The ultimate goal was to prevent
encoding from impacting graphics and vise versa.
However, eventually it was realized that there were some fundamental
flaws with this design.
1. Stray frame duplication. You could not guarantee that a frame would
render on time, so sometimes frames would unintentionally be lost if
there was any sort of minor hiccup or if the thread took too long to
be scheduled I'm guessing.
2. Frame timing in the rendering thread was less accurate. The only
place where frame timing was accurate was in the encoder thread, and
the graphics thread was at the whim of thread scheduling. On higher
end computers it was typically fine, but it was just generally not
guaranteed that a frame would be rendered when it was supposed to be
rendered.
So the solution (originally proposed by r1ch and paibox) is to instead
keep the encoding and graphics threads separate as usual, but instead of
the encoder thread controlling the graphics thread, the graphics thread
now controls the encoder thread. The encoder thread keeps a limited
cache of frames, then the graphics thread copies frames in to the cache
and increments a semaphore to schedule the encoder thread to encode that
data.
In the cache, each frame has an encode counter. If the frame cache is
full (e.g., the encoder taking too long to return frames), it will not
cache a new frame, but instead will just increment the counter on the
last frame in the cache to schedule that frame to encode again, ensuring
that frames are on time and reducing CPU usage by lowering video
complexity. If the graphics thread takes too long to render a frame,
then it will add that frame with the count value set to the total amount
of frames that were missed (actual legitimately duplicated frames).
Because the cache gives many frames of breathing room for the encoder to
encode frames, this design helps improve results especially when using
encoding presets that have higher complexity and CPU usage, minimizing
the risk of needlessly skipped or duplicated frames.
I also managed to sneak in what should be a bit of an optimization to
reduce copying of frame data, though how much of an optimization it
ultimately ends up being is debatable.
So to sum it up, this commit increases accuracy of frame timing,
completely removes stray frame duplication, gives better results for
higher complexity encoding presets, and potentially optimizes the frame
pipeline a tiny bit.
2014-12-31 01:53:13 -08:00
|
|
|
locked = video_output_lock_frame(video->video, &output_frame, count,
|
|
|
|
input_frame->timestamp);
|
|
|
|
if (locked) {
|
|
|
|
if (video->gpu_conversion) {
|
|
|
|
set_gpu_converted_data(video, &output_frame,
|
|
|
|
input_frame, info);
|
|
|
|
|
|
|
|
} else if (format_is_yuv(info->format)) {
|
|
|
|
convert_frame(&output_frame, input_frame, info);
|
2015-04-15 18:41:09 -07:00
|
|
|
} else {
|
|
|
|
copy_rgbx_frame(&output_frame, input_frame, info);
|
libobs: Redesign/optimize frame encoding handling
Previously, the design for the interaction between the encoder thread
and the graphics thread was that the encoder thread would signal to the
graphics thread when to start drawing each frame. The original idea
behind this was to prevent mutually cascading stalls of encoding or
graphics rendering (i.e., if rendering took too long, then encoding
would have to catch up, then rendering would have to catch up again, and
so on, cascading upon each other). The ultimate goal was to prevent
encoding from impacting graphics and vise versa.
However, eventually it was realized that there were some fundamental
flaws with this design.
1. Stray frame duplication. You could not guarantee that a frame would
render on time, so sometimes frames would unintentionally be lost if
there was any sort of minor hiccup or if the thread took too long to
be scheduled I'm guessing.
2. Frame timing in the rendering thread was less accurate. The only
place where frame timing was accurate was in the encoder thread, and
the graphics thread was at the whim of thread scheduling. On higher
end computers it was typically fine, but it was just generally not
guaranteed that a frame would be rendered when it was supposed to be
rendered.
So the solution (originally proposed by r1ch and paibox) is to instead
keep the encoding and graphics threads separate as usual, but instead of
the encoder thread controlling the graphics thread, the graphics thread
now controls the encoder thread. The encoder thread keeps a limited
cache of frames, then the graphics thread copies frames in to the cache
and increments a semaphore to schedule the encoder thread to encode that
data.
In the cache, each frame has an encode counter. If the frame cache is
full (e.g., the encoder taking too long to return frames), it will not
cache a new frame, but instead will just increment the counter on the
last frame in the cache to schedule that frame to encode again, ensuring
that frames are on time and reducing CPU usage by lowering video
complexity. If the graphics thread takes too long to render a frame,
then it will add that frame with the count value set to the total amount
of frames that were missed (actual legitimately duplicated frames).
Because the cache gives many frames of breathing room for the encoder to
encode frames, this design helps improve results especially when using
encoding presets that have higher complexity and CPU usage, minimizing
the risk of needlessly skipped or duplicated frames.
I also managed to sneak in what should be a bit of an optimization to
reduce copying of frame data, though how much of an optimization it
ultimately ends up being is debatable.
So to sum it up, this commit increases accuracy of frame timing,
completely removes stray frame duplication, gives better results for
higher complexity encoding presets, and potentially optimizes the frame
pipeline a tiny bit.
2014-12-31 01:53:13 -08:00
|
|
|
}
|
2014-02-16 19:28:21 -07:00
|
|
|
|
libobs: Redesign/optimize frame encoding handling
Previously, the design for the interaction between the encoder thread
and the graphics thread was that the encoder thread would signal to the
graphics thread when to start drawing each frame. The original idea
behind this was to prevent mutually cascading stalls of encoding or
graphics rendering (i.e., if rendering took too long, then encoding
would have to catch up, then rendering would have to catch up again, and
so on, cascading upon each other). The ultimate goal was to prevent
encoding from impacting graphics and vise versa.
However, eventually it was realized that there were some fundamental
flaws with this design.
1. Stray frame duplication. You could not guarantee that a frame would
render on time, so sometimes frames would unintentionally be lost if
there was any sort of minor hiccup or if the thread took too long to
be scheduled I'm guessing.
2. Frame timing in the rendering thread was less accurate. The only
place where frame timing was accurate was in the encoder thread, and
the graphics thread was at the whim of thread scheduling. On higher
end computers it was typically fine, but it was just generally not
guaranteed that a frame would be rendered when it was supposed to be
rendered.
So the solution (originally proposed by r1ch and paibox) is to instead
keep the encoding and graphics threads separate as usual, but instead of
the encoder thread controlling the graphics thread, the graphics thread
now controls the encoder thread. The encoder thread keeps a limited
cache of frames, then the graphics thread copies frames in to the cache
and increments a semaphore to schedule the encoder thread to encode that
data.
In the cache, each frame has an encode counter. If the frame cache is
full (e.g., the encoder taking too long to return frames), it will not
cache a new frame, but instead will just increment the counter on the
last frame in the cache to schedule that frame to encode again, ensuring
that frames are on time and reducing CPU usage by lowering video
complexity. If the graphics thread takes too long to render a frame,
then it will add that frame with the count value set to the total amount
of frames that were missed (actual legitimately duplicated frames).
Because the cache gives many frames of breathing room for the encoder to
encode frames, this design helps improve results especially when using
encoding presets that have higher complexity and CPU usage, minimizing
the risk of needlessly skipped or duplicated frames.
I also managed to sneak in what should be a bit of an optimization to
reduce copying of frame data, though how much of an optimization it
ultimately ends up being is debatable.
So to sum it up, this commit increases accuracy of frame timing,
completely removes stray frame duplication, gives better results for
higher complexity encoding presets, and potentially optimizes the frame
pipeline a tiny bit.
2014-12-31 01:53:13 -08:00
|
|
|
video_output_unlock_frame(video->video);
|
2014-02-16 19:28:21 -07:00
|
|
|
}
|
libobs: Redesign/optimize frame encoding handling
Previously, the design for the interaction between the encoder thread
and the graphics thread was that the encoder thread would signal to the
graphics thread when to start drawing each frame. The original idea
behind this was to prevent mutually cascading stalls of encoding or
graphics rendering (i.e., if rendering took too long, then encoding
would have to catch up, then rendering would have to catch up again, and
so on, cascading upon each other). The ultimate goal was to prevent
encoding from impacting graphics and vise versa.
However, eventually it was realized that there were some fundamental
flaws with this design.
1. Stray frame duplication. You could not guarantee that a frame would
render on time, so sometimes frames would unintentionally be lost if
there was any sort of minor hiccup or if the thread took too long to
be scheduled I'm guessing.
2. Frame timing in the rendering thread was less accurate. The only
place where frame timing was accurate was in the encoder thread, and
the graphics thread was at the whim of thread scheduling. On higher
end computers it was typically fine, but it was just generally not
guaranteed that a frame would be rendered when it was supposed to be
rendered.
So the solution (originally proposed by r1ch and paibox) is to instead
keep the encoding and graphics threads separate as usual, but instead of
the encoder thread controlling the graphics thread, the graphics thread
now controls the encoder thread. The encoder thread keeps a limited
cache of frames, then the graphics thread copies frames in to the cache
and increments a semaphore to schedule the encoder thread to encode that
data.
In the cache, each frame has an encode counter. If the frame cache is
full (e.g., the encoder taking too long to return frames), it will not
cache a new frame, but instead will just increment the counter on the
last frame in the cache to schedule that frame to encode again, ensuring
that frames are on time and reducing CPU usage by lowering video
complexity. If the graphics thread takes too long to render a frame,
then it will add that frame with the count value set to the total amount
of frames that were missed (actual legitimately duplicated frames).
Because the cache gives many frames of breathing room for the encoder to
encode frames, this design helps improve results especially when using
encoding presets that have higher complexity and CPU usage, minimizing
the risk of needlessly skipped or duplicated frames.
I also managed to sneak in what should be a bit of an optimization to
reduce copying of frame data, though how much of an optimization it
ultimately ends up being is debatable.
So to sum it up, this commit increases accuracy of frame timing,
completely removes stray frame duplication, gives better results for
higher complexity encoding presets, and potentially optimizes the frame
pipeline a tiny bit.
2014-12-31 01:53:13 -08:00
|
|
|
}
|
2014-02-09 05:51:06 -07:00
|
|
|
|
libobs: Redesign/optimize frame encoding handling
Previously, the design for the interaction between the encoder thread
and the graphics thread was that the encoder thread would signal to the
graphics thread when to start drawing each frame. The original idea
behind this was to prevent mutually cascading stalls of encoding or
graphics rendering (i.e., if rendering took too long, then encoding
would have to catch up, then rendering would have to catch up again, and
so on, cascading upon each other). The ultimate goal was to prevent
encoding from impacting graphics and vice versa.
However, eventually it was realized that there were some fundamental
flaws with this design.
1. Stray frame duplication. You could not guarantee that a frame would
render on time, so sometimes frames would unintentionally be lost if
there was any sort of minor hiccup or if the thread took too long to
be scheduled I'm guessing.
2. Frame timing in the rendering thread was less accurate. The only
place where frame timing was accurate was in the encoder thread, and
the graphics thread was at the whim of thread scheduling. On higher
end computers it was typically fine, but it was just generally not
guaranteed that a frame would be rendered when it was supposed to be
rendered.
So the solution (originally proposed by r1ch and paibox) is to instead
keep the encoding and graphics threads separate as usual, but instead of
the encoder thread controlling the graphics thread, the graphics thread
now controls the encoder thread. The encoder thread keeps a limited
cache of frames, then the graphics thread copies frames into the cache
and increments a semaphore to schedule the encoder thread to encode that
data.
In the cache, each frame has an encode counter. If the frame cache is
full (e.g., the encoder taking too long to return frames), it will not
cache a new frame, but instead will just increment the counter on the
last frame in the cache to schedule that frame to encode again, ensuring
that frames are on time and reducing CPU usage by lowering video
complexity. If the graphics thread takes too long to render a frame,
then it will add that frame with the count value set to the total amount
of frames that were missed (actual legitimately duplicated frames).
Because the cache gives many frames of breathing room for the encoder to
encode frames, this design helps improve results especially when using
encoding presets that have higher complexity and CPU usage, minimizing
the risk of needlessly skipped or duplicated frames.
I also managed to sneak in what should be a bit of an optimization to
reduce copying of frame data, though how much of an optimization it
ultimately ends up being is debatable.
So to sum it up, this commit increases accuracy of frame timing,
completely removes stray frame duplication, gives better results for
higher complexity encoding presets, and potentially optimizes the frame
pipeline a tiny bit.
2014-12-31 01:53:13 -08:00
|
|
|
static inline void video_sleep(struct obs_core_video *video,
|
|
|
|
uint64_t *p_time, uint64_t interval_ns)
|
|
|
|
{
|
|
|
|
struct obs_vframe_info vframe_info;
|
|
|
|
uint64_t cur_time = *p_time;
|
|
|
|
uint64_t t = cur_time + interval_ns;
|
|
|
|
int count;
|
|
|
|
|
2015-01-05 14:07:22 -08:00
|
|
|
if (os_sleepto_ns(t)) {
|
libobs: Redesign/optimize frame encoding handling
Previously, the design for the interaction between the encoder thread
and the graphics thread was that the encoder thread would signal to the
graphics thread when to start drawing each frame. The original idea
behind this was to prevent mutually cascading stalls of encoding or
graphics rendering (i.e., if rendering took too long, then encoding
would have to catch up, then rendering would have to catch up again, and
so on, cascading upon each other). The ultimate goal was to prevent
encoding from impacting graphics and vise versa.
However, eventually it was realized that there were some fundamental
flaws with this design.
1. Stray frame duplication. You could not guarantee that a frame would
render on time, so sometimes frames would unintentionally be lost if
there was any sort of minor hiccup or if the thread took too long to
be scheduled I'm guessing.
2. Frame timing in the rendering thread was less accurate. The only
place where frame timing was accurate was in the encoder thread, and
the graphics thread was at the whim of thread scheduling. On higher
end computers it was typically fine, but it was just generally not
guaranteed that a frame would be rendered when it was supposed to be
rendered.
So the solution (originally proposed by r1ch and paibox) is to instead
keep the encoding and graphics threads separate as usual, but instead of
the encoder thread controlling the graphics thread, the graphics thread
now controls the encoder thread. The encoder thread keeps a limited
cache of frames, then the graphics thread copies frames in to the cache
and increments a semaphore to schedule the encoder thread to encode that
data.
In the cache, each frame has an encode counter. If the frame cache is
full (e.g., the encoder taking too long to return frames), it will not
cache a new frame, but instead will just increment the counter on the
last frame in the cache to schedule that frame to encode again, ensuring
that frames are on time and reducing CPU usage by lowering video
complexity. If the graphics thread takes too long to render a frame,
then it will add that frame with the count value set to the total amount
of frames that were missed (actual legitimately duplicated frames).
Because the cache gives many frames of breathing room for the encoder to
encode frames, this design helps improve results especially when using
encoding presets that have higher complexity and CPU usage, minimizing
the risk of needlessly skipped or duplicated frames.
I also managed to sneak in what should be a bit of an optimization to
reduce copying of frame data, though how much of an optimization it
ultimately ends up being is debatable.
So to sum it up, this commit increases accuracy of frame timing,
completely removes stray frame duplication, gives better results for
higher complexity encoding presets, and potentially optimizes the frame
pipeline a tiny bit.
2014-12-31 01:53:13 -08:00
|
|
|
*p_time = t;
|
|
|
|
count = 1;
|
|
|
|
} else {
|
|
|
|
count = (int)((os_gettime_ns() - cur_time) / interval_ns);
|
|
|
|
*p_time = cur_time + interval_ns * count;
|
|
|
|
}
|
|
|
|
|
2016-01-25 03:58:51 -08:00
|
|
|
video->total_frames += count;
|
|
|
|
video->lagged_frames += count - 1;
|
|
|
|
|
libobs: Redesign/optimize frame encoding handling
Previously, the design for the interaction between the encoder thread
and the graphics thread was that the encoder thread would signal to the
graphics thread when to start drawing each frame. The original idea
behind this was to prevent mutually cascading stalls of encoding or
graphics rendering (i.e., if rendering took too long, then encoding
would have to catch up, then rendering would have to catch up again, and
so on, cascading upon each other). The ultimate goal was to prevent
encoding from impacting graphics and vise versa.
However, eventually it was realized that there were some fundamental
flaws with this design.
1. Stray frame duplication. You could not guarantee that a frame would
render on time, so sometimes frames would unintentionally be lost if
there was any sort of minor hiccup or if the thread took too long to
be scheduled I'm guessing.
2. Frame timing in the rendering thread was less accurate. The only
place where frame timing was accurate was in the encoder thread, and
the graphics thread was at the whim of thread scheduling. On higher
end computers it was typically fine, but it was just generally not
guaranteed that a frame would be rendered when it was supposed to be
rendered.
So the solution (originally proposed by r1ch and paibox) is to instead
keep the encoding and graphics threads separate as usual, but instead of
the encoder thread controlling the graphics thread, the graphics thread
now controls the encoder thread. The encoder thread keeps a limited
cache of frames, then the graphics thread copies frames in to the cache
and increments a semaphore to schedule the encoder thread to encode that
data.
In the cache, each frame has an encode counter. If the frame cache is
full (e.g., the encoder taking too long to return frames), it will not
cache a new frame, but instead will just increment the counter on the
last frame in the cache to schedule that frame to encode again, ensuring
that frames are on time and reducing CPU usage by lowering video
complexity. If the graphics thread takes too long to render a frame,
then it will add that frame with the count value set to the total amount
of frames that were missed (actual legitimately duplicated frames).
Because the cache gives many frames of breathing room for the encoder to
encode frames, this design helps improve results especially when using
encoding presets that have higher complexity and CPU usage, minimizing
the risk of needlessly skipped or duplicated frames.
I also managed to sneak in what should be a bit of an optimization to
reduce copying of frame data, though how much of an optimization it
ultimately ends up being is debatable.
So to sum it up, this commit increases accuracy of frame timing,
completely removes stray frame duplication, gives better results for
higher complexity encoding presets, and potentially optimizes the frame
pipeline a tiny bit.
2014-12-31 01:53:13 -08:00
|
|
|
vframe_info.timestamp = cur_time;
|
|
|
|
vframe_info.count = count;
|
|
|
|
circlebuf_push_back(&video->vframe_info_buffer, &vframe_info,
|
|
|
|
sizeof(vframe_info));
|
2014-02-05 20:36:21 -07:00
|
|
|
}
|
|
|
|
|
2015-07-11 08:04:46 +02:00
|
|
|
/*
 * Profiler labels for the stages timed inside output_frame().  Each label
 * is handed to a matching profile_start()/profile_end() pair.
 */
static const char *output_frame_gs_context_name =
	"gs_context(video->graphics)";
static const char *output_frame_render_video_name =
	"render_video";
static const char *output_frame_download_frame_name =
	"download_frame";
static const char *output_frame_gs_flush_name =
	"gs_flush";
static const char *output_frame_output_video_data_name =
	"output_video_data";
|
2015-07-08 14:25:07 +02:00
|
|
|
static inline void output_frame(void)
|
2014-02-05 20:36:21 -07:00
|
|
|
{
|
2014-02-05 21:03:06 -07:00
|
|
|
struct obs_core_video *video = &obs->video;
|
2014-02-05 20:36:21 -07:00
|
|
|
int cur_texture = video->cur_texture;
|
|
|
|
int prev_texture = cur_texture == 0 ? NUM_TEXTURES-1 : cur_texture-1;
|
2014-02-18 13:37:56 -07:00
|
|
|
struct video_data frame;
|
2014-02-09 05:51:06 -07:00
|
|
|
bool frame_ready;
|
|
|
|
|
2014-02-18 13:37:56 -07:00
|
|
|
memset(&frame, 0, sizeof(struct video_data));
|
2014-02-09 05:51:06 -07:00
|
|
|
|
2015-07-11 08:04:46 +02:00
|
|
|
profile_start(output_frame_gs_context_name);
|
2014-08-07 23:42:07 -07:00
|
|
|
gs_enter_context(video->graphics);
|
2015-07-11 08:04:46 +02:00
|
|
|
|
|
|
|
profile_start(output_frame_render_video_name);
|
libobs: Redesign/optimize frame encoding handling
Previously, the design for the interaction between the encoder thread
and the graphics thread was that the encoder thread would signal to the
graphics thread when to start drawing each frame. The original idea
behind this was to prevent mutually cascading stalls of encoding or
graphics rendering (i.e., if rendering took too long, then encoding
would have to catch up, then rendering would have to catch up again, and
so on, cascading upon each other). The ultimate goal was to prevent
encoding from impacting graphics and vise versa.
However, eventually it was realized that there were some fundamental
flaws with this design.
1. Stray frame duplication. You could not guarantee that a frame would
render on time, so sometimes frames would unintentionally be lost if
there was any sort of minor hiccup or if the thread took too long to
be scheduled I'm guessing.
2. Frame timing in the rendering thread was less accurate. The only
place where frame timing was accurate was in the encoder thread, and
the graphics thread was at the whim of thread scheduling. On higher
end computers it was typically fine, but it was just generally not
guaranteed that a frame would be rendered when it was supposed to be
rendered.
So the solution (originally proposed by r1ch and paibox) is to instead
keep the encoding and graphics threads separate as usual, but instead of
the encoder thread controlling the graphics thread, the graphics thread
now controls the encoder thread. The encoder thread keeps a limited
cache of frames, then the graphics thread copies frames in to the cache
and increments a semaphore to schedule the encoder thread to encode that
data.
In the cache, each frame has an encode counter. If the frame cache is
full (e.g., the encoder taking too long to return frames), it will not
cache a new frame, but instead will just increment the counter on the
last frame in the cache to schedule that frame to encode again, ensuring
that frames are on time and reducing CPU usage by lowering video
complexity. If the graphics thread takes too long to render a frame,
then it will add that frame with the count value set to the total amount
of frames that were missed (actual legitimately duplicated frames).
Because the cache gives many frames of breathing room for the encoder to
encode frames, this design helps improve results especially when using
encoding presets that have higher complexity and CPU usage, minimizing
the risk of needlessly skipped or duplicated frames.
I also managed to sneak in what should be a bit of an optimization to
reduce copying of frame data, though how much of an optimization it
ultimately ends up being is debatable.
So to sum it up, this commit increases accuracy of frame timing,
completely removes stray frame duplication, gives better results for
higher complexity encoding presets, and potentially optimizes the frame
pipeline a tiny bit.
2014-12-31 01:53:13 -08:00
|
|
|
render_video(video, cur_texture, prev_texture);
|
2015-07-11 08:04:46 +02:00
|
|
|
profile_end(output_frame_render_video_name);
|
|
|
|
|
|
|
|
profile_start(output_frame_download_frame_name);
|
2014-02-14 15:13:36 -07:00
|
|
|
frame_ready = download_frame(video, prev_texture, &frame);
|
2015-07-11 08:04:46 +02:00
|
|
|
profile_end(output_frame_download_frame_name);
|
|
|
|
|
|
|
|
profile_start(output_frame_gs_flush_name);
|
2014-12-03 22:14:23 -08:00
|
|
|
gs_flush();
|
2015-07-11 08:04:46 +02:00
|
|
|
profile_end(output_frame_gs_flush_name);
|
|
|
|
|
2014-08-07 23:42:07 -07:00
|
|
|
gs_leave_context();
|
2015-07-11 08:04:46 +02:00
|
|
|
profile_end(output_frame_gs_context_name);
|
2014-02-09 05:51:06 -07:00
|
|
|
|
2014-10-21 20:08:39 -07:00
|
|
|
if (frame_ready) {
|
libobs: Redesign/optimize frame encoding handling
Previously, the design for the interaction between the encoder thread
and the graphics thread was that the encoder thread would signal to the
graphics thread when to start drawing each frame. The original idea
behind this was to prevent mutually cascading stalls of encoding or
graphics rendering (i.e., if rendering took too long, then encoding
would have to catch up, then rendering would have to catch up again, and
so on, cascading upon each other). The ultimate goal was to prevent
encoding from impacting graphics and vise versa.
However, eventually it was realized that there were some fundamental
flaws with this design.
1. Stray frame duplication. You could not guarantee that a frame would
render on time, so sometimes frames would unintentionally be lost if
there was any sort of minor hiccup or if the thread took too long to
be scheduled I'm guessing.
2. Frame timing in the rendering thread was less accurate. The only
place where frame timing was accurate was in the encoder thread, and
the graphics thread was at the whim of thread scheduling. On higher
end computers it was typically fine, but it was just generally not
guaranteed that a frame would be rendered when it was supposed to be
rendered.
So the solution (originally proposed by r1ch and paibox) is to instead
keep the encoding and graphics threads separate as usual, but instead of
the encoder thread controlling the graphics thread, the graphics thread
now controls the encoder thread. The encoder thread keeps a limited
cache of frames, then the graphics thread copies frames in to the cache
and increments a semaphore to schedule the encoder thread to encode that
data.
In the cache, each frame has an encode counter. If the frame cache is
full (e.g., the encoder taking too long to return frames), it will not
cache a new frame, but instead will just increment the counter on the
last frame in the cache to schedule that frame to encode again, ensuring
that frames are on time and reducing CPU usage by lowering video
complexity. If the graphics thread takes too long to render a frame,
then it will add that frame with the count value set to the total amount
of frames that were missed (actual legitimately duplicated frames).
Because the cache gives many frames of breathing room for the encoder to
encode frames, this design helps improve results especially when using
encoding presets that have higher complexity and CPU usage, minimizing
the risk of needlessly skipped or duplicated frames.
I also managed to sneak in what should be a bit of an optimization to
reduce copying of frame data, though how much of an optimization it
ultimately ends up being is debatable.
So to sum it up, this commit increases accuracy of frame timing,
completely removes stray frame duplication, gives better results for
higher complexity encoding presets, and potentially optimizes the frame
pipeline a tiny bit.
2014-12-31 01:53:13 -08:00
|
|
|
struct obs_vframe_info vframe_info;
|
|
|
|
circlebuf_pop_front(&video->vframe_info_buffer, &vframe_info,
|
|
|
|
sizeof(vframe_info));
|
2014-10-21 20:08:39 -07:00
|
|
|
|
libobs: Redesign/optimize frame encoding handling
Previously, the design for the interaction between the encoder thread
and the graphics thread was that the encoder thread would signal to the
graphics thread when to start drawing each frame. The original idea
behind this was to prevent mutually cascading stalls of encoding or
graphics rendering (i.e., if rendering took too long, then encoding
would have to catch up, then rendering would have to catch up again, and
so on, cascading upon each other). The ultimate goal was to prevent
encoding from impacting graphics and vise versa.
However, eventually it was realized that there were some fundamental
flaws with this design.
1. Stray frame duplication. You could not guarantee that a frame would
render on time, so sometimes frames would unintentionally be lost if
there was any sort of minor hiccup or if the thread took too long to
be scheduled I'm guessing.
2. Frame timing in the rendering thread was less accurate. The only
place where frame timing was accurate was in the encoder thread, and
the graphics thread was at the whim of thread scheduling. On higher
end computers it was typically fine, but it was just generally not
guaranteed that a frame would be rendered when it was supposed to be
rendered.
So the solution (originally proposed by r1ch and paibox) is to instead
keep the encoding and graphics threads separate as usual, but instead of
the encoder thread controlling the graphics thread, the graphics thread
now controls the encoder thread. The encoder thread keeps a limited
cache of frames, then the graphics thread copies frames in to the cache
and increments a semaphore to schedule the encoder thread to encode that
data.
In the cache, each frame has an encode counter. If the frame cache is
full (e.g., the encoder taking too long to return frames), it will not
cache a new frame, but instead will just increment the counter on the
last frame in the cache to schedule that frame to encode again, ensuring
that frames are on time and reducing CPU usage by lowering video
complexity. If the graphics thread takes too long to render a frame,
then it will add that frame with the count value set to the total amount
of frames that were missed (actual legitimately duplicated frames).
Because the cache gives many frames of breathing room for the encoder to
encode frames, this design helps improve results especially when using
encoding presets that have higher complexity and CPU usage, minimizing
the risk of needlessly skipped or duplicated frames.
I also managed to sneak in what should be a bit of an optimization to
reduce copying of frame data, though how much of an optimization it
ultimately ends up being is debatable.
So to sum it up, this commit increases accuracy of frame timing,
completely removes stray frame duplication, gives better results for
higher complexity encoding presets, and potentially optimizes the frame
pipeline a tiny bit.
2014-12-31 01:53:13 -08:00
|
|
|
frame.timestamp = vframe_info.timestamp;
|
2015-07-11 08:04:46 +02:00
|
|
|
profile_start(output_frame_output_video_data_name);
|
libobs: Redesign/optimize frame encoding handling
Previously, the design for the interaction between the encoder thread
and the graphics thread was that the encoder thread would signal to the
graphics thread when to start drawing each frame. The original idea
behind this was to prevent mutually cascading stalls of encoding or
graphics rendering (i.e., if rendering took too long, then encoding
would have to catch up, then rendering would have to catch up again, and
so on, cascading upon each other). The ultimate goal was to prevent
encoding from impacting graphics and vise versa.
However, eventually it was realized that there were some fundamental
flaws with this design.
1. Stray frame duplication. You could not guarantee that a frame would
render on time, so sometimes frames would unintentionally be lost if
there was any sort of minor hiccup or if the thread took too long to
be scheduled I'm guessing.
2. Frame timing in the rendering thread was less accurate. The only
place where frame timing was accurate was in the encoder thread, and
the graphics thread was at the whim of thread scheduling. On higher
end computers it was typically fine, but it was just generally not
guaranteed that a frame would be rendered when it was supposed to be
rendered.
So the solution (originally proposed by r1ch and paibox) is to instead
keep the encoding and graphics threads separate as usual, but instead of
the encoder thread controlling the graphics thread, the graphics thread
now controls the encoder thread. The encoder thread keeps a limited
cache of frames, then the graphics thread copies frames in to the cache
and increments a semaphore to schedule the encoder thread to encode that
data.
In the cache, each frame has an encode counter. If the frame cache is
full (e.g., the encoder taking too long to return frames), it will not
cache a new frame, but instead will just increment the counter on the
last frame in the cache to schedule that frame to encode again, ensuring
that frames are on time and reducing CPU usage by lowering video
complexity. If the graphics thread takes too long to render a frame,
then it will add that frame with the count value set to the total amount
of frames that were missed (actual legitimately duplicated frames).
Because the cache gives many frames of breathing room for the encoder to
encode frames, this design helps improve results especially when using
encoding presets that have higher complexity and CPU usage, minimizing
the risk of needlessly skipped or duplicated frames.
I also managed to sneak in what should be a bit of an optimization to
reduce copying of frame data, though how much of an optimization it
ultimately ends up being is debatable.
So to sum it up, this commit increases accuracy of frame timing,
completely removes stray frame duplication, gives better results for
higher complexity encoding presets, and potentially optimizes the frame
pipeline a tiny bit.
2014-12-31 01:53:13 -08:00
|
|
|
output_video_data(video, &frame, vframe_info.count);
|
2015-07-11 08:04:46 +02:00
|
|
|
profile_end(output_frame_output_video_data_name);
|
2014-10-21 20:08:39 -07:00
|
|
|
}
|
2013-09-30 19:37:13 -07:00
|
|
|
|
2014-02-05 20:36:21 -07:00
|
|
|
if (++video->cur_texture == NUM_TEXTURES)
|
|
|
|
video->cur_texture = 0;
|
2013-09-30 19:37:13 -07:00
|
|
|
}
|
|
|
|
|
2015-10-15 01:00:14 -07:00
|
|
|
/* UTF-8 byte sequence for U+00A0 (no-break space). */
#define NBSP "\xC2\xA0"

/* Profiler labels for the top-level stages of one graphics-thread pass. */
static const char *tick_sources_name    = "tick_sources";
static const char *render_displays_name = "render_displays";
static const char *output_frame_name    = "output_frame";
|
2013-09-30 19:37:13 -07:00
|
|
|
void *obs_video_thread(void *param)
|
|
|
|
{
|
|
|
|
uint64_t last_time = 0;
|
libobs: Redesign/optimize frame encoding handling
Previously, the design for the interaction between the encoder thread
and the graphics thread was that the encoder thread would signal to the
graphics thread when to start drawing each frame. The original idea
behind this was to prevent mutually cascading stalls of encoding or
graphics rendering (i.e., if rendering took too long, then encoding
would have to catch up, then rendering would have to catch up again, and
so on, cascading upon each other). The ultimate goal was to prevent
encoding from impacting graphics and vise versa.
However, eventually it was realized that there were some fundamental
flaws with this design.
1. Stray frame duplication. You could not guarantee that a frame would
render on time, so sometimes frames would unintentionally be lost if
there was any sort of minor hiccup or if the thread took too long to
be scheduled I'm guessing.
2. Frame timing in the rendering thread was less accurate. The only
place where frame timing was accurate was in the encoder thread, and
the graphics thread was at the whim of thread scheduling. On higher
end computers it was typically fine, but it was just generally not
guaranteed that a frame would be rendered when it was supposed to be
rendered.
So the solution (originally proposed by r1ch and paibox) is to instead
keep the encoding and graphics threads separate as usual, but instead of
the encoder thread controlling the graphics thread, the graphics thread
now controls the encoder thread. The encoder thread keeps a limited
cache of frames, then the graphics thread copies frames in to the cache
and increments a semaphore to schedule the encoder thread to encode that
data.
In the cache, each frame has an encode counter. If the frame cache is
full (e.g., the encoder taking too long to return frames), it will not
cache a new frame, but instead will just increment the counter on the
last frame in the cache to schedule that frame to encode again, ensuring
that frames are on time and reducing CPU usage by lowering video
complexity. If the graphics thread takes too long to render a frame,
then it will add that frame with the count value set to the total amount
of frames that were missed (actual legitimately duplicated frames).
Because the cache gives many frames of breathing room for the encoder to
encode frames, this design helps improve results especially when using
encoding presets that have higher complexity and CPU usage, minimizing
the risk of needlessly skipped or duplicated frames.
I also managed to sneak in what should be a bit of an optimization to
reduce copying of frame data, though how much of an optimization it
ultimately ends up being is debatable.
So to sum it up, this commit increases accuracy of frame timing,
completely removes stray frame duplication, gives better results for
higher complexity encoding presets, and potentially optimizes the frame
pipeline a tiny bit.
2014-12-31 01:53:13 -08:00
|
|
|
uint64_t interval = video_output_get_frame_time(obs->video.video);
|
2017-05-12 16:21:51 -07:00
|
|
|
uint64_t frame_time_total_ns = 0;
|
2016-08-22 12:04:23 -07:00
|
|
|
uint64_t fps_total_ns = 0;
|
|
|
|
uint32_t fps_total_frames = 0;
|
2013-09-30 19:37:13 -07:00
|
|
|
|
2015-06-04 16:48:56 -07:00
|
|
|
obs->video.video_time = os_gettime_ns();
|
|
|
|
|
2015-01-02 05:36:09 -08:00
|
|
|
os_set_thread_name("libobs: graphics thread");
|
|
|
|
|
2015-07-11 08:04:46 +02:00
|
|
|
const char *video_thread_name =
|
|
|
|
profile_store_name(obs_get_profiler_name_store(),
|
2015-10-15 01:00:14 -07:00
|
|
|
"obs_video_thread(%g"NBSP"ms)", interval / 1000000.);
|
2015-07-11 08:04:46 +02:00
|
|
|
profile_register_root(video_thread_name, interval);
|
|
|
|
|
2017-10-03 18:48:12 -07:00
|
|
|
srand((unsigned int)time(NULL));
|
|
|
|
|
libobs: Redesign/optimize frame encoding handling
Previously, the design for the interaction between the encoder thread
and the graphics thread was that the encoder thread would signal to the
graphics thread when to start drawing each frame. The original idea
behind this was to prevent mutually cascading stalls of encoding or
graphics rendering (i.e., if rendering took too long, then encoding
would have to catch up, then rendering would have to catch up again, and
so on, cascading upon each other). The ultimate goal was to prevent
encoding from impacting graphics and vise versa.
However, eventually it was realized that there were some fundamental
flaws with this design.
1. Stray frame duplication. You could not guarantee that a frame would
render on time, so sometimes frames would unintentionally be lost if
there was any sort of minor hiccup or if the thread took too long to
be scheduled I'm guessing.
2. Frame timing in the rendering thread was less accurate. The only
place where frame timing was accurate was in the encoder thread, and
the graphics thread was at the whim of thread scheduling. On higher
end computers it was typically fine, but it was just generally not
guaranteed that a frame would be rendered when it was supposed to be
rendered.
So the solution (originally proposed by r1ch and paibox) is to instead
keep the encoding and graphics threads separate as usual, but instead of
the encoder thread controlling the graphics thread, the graphics thread
now controls the encoder thread. The encoder thread keeps a limited
cache of frames, then the graphics thread copies frames in to the cache
and increments a semaphore to schedule the encoder thread to encode that
data.
In the cache, each frame has an encode counter. If the frame cache is
full (e.g., the encoder taking too long to return frames), it will not
cache a new frame, but instead will just increment the counter on the
last frame in the cache to schedule that frame to encode again, ensuring
that frames are on time and reducing CPU usage by lowering video
complexity. If the graphics thread takes too long to render a frame,
then it will add that frame with the count value set to the total amount
of frames that were missed (actual legitimately duplicated frames).
Because the cache gives many frames of breathing room for the encoder to
encode frames, this design helps improve results especially when using
encoding presets that have higher complexity and CPU usage, minimizing
the risk of needlessly skipped or duplicated frames.
I also managed to sneak in what should be a bit of an optimization to
reduce copying of frame data, though how much of an optimization it
ultimately ends up being is debatable.
So to sum it up, this commit increases accuracy of frame timing,
completely removes stray frame duplication, gives better results for
higher complexity encoding presets, and potentially optimizes the frame
pipeline a tiny bit.
2014-12-31 01:53:13 -08:00
|
|
|
while (!video_output_stopped(obs->video.video)) {
|
2017-05-12 16:21:51 -07:00
|
|
|
uint64_t frame_start = os_gettime_ns();
|
|
|
|
uint64_t frame_time_ns;
|
|
|
|
|
2015-07-11 08:04:46 +02:00
|
|
|
profile_start(video_thread_name);
|
|
|
|
|
|
|
|
profile_start(tick_sources_name);
|
2015-06-04 16:48:56 -07:00
|
|
|
last_time = tick_sources(obs->video.video_time, last_time);
|
2015-07-11 08:04:46 +02:00
|
|
|
profile_end(tick_sources_name);
|
2014-02-09 05:51:06 -07:00
|
|
|
|
2015-07-11 08:04:46 +02:00
|
|
|
profile_start(render_displays_name);
|
2013-10-14 12:37:52 -07:00
|
|
|
render_displays();
|
2015-07-11 08:04:46 +02:00
|
|
|
profile_end(render_displays_name);
|
2014-02-09 05:51:06 -07:00
|
|
|
|
2015-07-11 08:04:46 +02:00
|
|
|
profile_start(output_frame_name);
|
2015-07-08 14:25:07 +02:00
|
|
|
output_frame();
|
2015-07-11 08:04:46 +02:00
|
|
|
profile_end(output_frame_name);
|
|
|
|
|
2017-05-12 16:21:51 -07:00
|
|
|
frame_time_ns = os_gettime_ns() - frame_start;
|
|
|
|
|
2015-07-11 08:04:46 +02:00
|
|
|
profile_end(video_thread_name);
|
|
|
|
|
|
|
|
profile_reenable_thread();
|
2015-07-08 14:25:07 +02:00
|
|
|
|
|
|
|
video_sleep(&obs->video, &obs->video.video_time, interval);
|
2016-08-22 12:04:23 -07:00
|
|
|
|
2017-05-12 16:21:51 -07:00
|
|
|
frame_time_total_ns += frame_time_ns;
|
2016-08-22 12:04:23 -07:00
|
|
|
fps_total_ns += (obs->video.video_time - last_time);
|
|
|
|
fps_total_frames++;
|
|
|
|
|
|
|
|
if (fps_total_ns >= 1000000000ULL) {
|
|
|
|
obs->video.video_fps = (double)fps_total_frames /
|
|
|
|
((double)fps_total_ns / 1000000000.0);
|
2017-05-12 16:21:51 -07:00
|
|
|
obs->video.video_avg_frame_time_ns =
|
|
|
|
frame_time_total_ns / (uint64_t)fps_total_frames;
|
|
|
|
|
|
|
|
frame_time_total_ns = 0;
|
2016-08-22 12:04:23 -07:00
|
|
|
fps_total_ns = 0;
|
|
|
|
fps_total_frames = 0;
|
|
|
|
}
|
2013-09-30 19:37:13 -07:00
|
|
|
}
|
|
|
|
|
2014-02-14 15:13:36 -07:00
|
|
|
UNUSED_PARAMETER(param);
|
2013-09-30 19:37:13 -07:00
|
|
|
return NULL;
|
|
|
|
}
|