filter-blur: Switch to Kernel Array instead of Kernel Texture

This drastically speeds up Gaussian Blur and Linear Gaussian Blur: it reduces the time spent reading textures by using existing registers for the kernel instead, maximizing the time spent reading the actual image texture.

See Also: #21 Blur Quality
This commit is contained in:
Michael Fabian 'Xaymar' Dirks 2018-12-23 02:00:30 +01:00
parent 92c4b54177
commit a6f9451654
3 changed files with 37 additions and 28 deletions

View file

@ -10,9 +10,7 @@ uniform int u_diameter;
uniform float2 u_texelDelta; uniform float2 u_texelDelta;
// Kernel Settings // Kernel Settings
//uniform float registerkernel[25]; uniform float4 kernel[8]; // max kernel radius 31+center.
uniform texture2d kernel;
uniform float2 kernelTexel;
// Bilateral Settings // Bilateral Settings
uniform float bilateralSmoothing; uniform float bilateralSmoothing;
@ -53,6 +51,11 @@ VertDataOut VSDefault(VertDataIn vtx)
return vert_out; return vert_out;
} }
/// Utility
// Returns the i-th scalar stored in the `kernel` uniform array.
// Four consecutive scalars are packed into each array element, so the
// element index is i/4 and the component within that element is i%4.
// No range check is performed on i.
float GetKernelAt(int i) {
// Reinterpret the selected element as float[4] so that the component can be
// picked with a runtime index.
return ((float[4])(kernel[floor(i/4)]))[i%4];
}
/// Blur: Box /// Blur: Box
float4 PSBoxBlur(VertDataOut vtx) : TARGET { float4 PSBoxBlur(VertDataOut vtx) : TARGET {
float4 origin = u_image.SampleLevel(pointSampler, vtx.uv, 0); float4 origin = u_image.SampleLevel(pointSampler, vtx.uv, 0);
@ -131,14 +134,13 @@ technique BoxLinear
} }
/// Blur: Gaussian /// Blur: Gaussian
// ToDo: Switch to array Kernel instead of Texture kernel.
float4 PSGaussianBlur(VertDataOut vtx) : TARGET { float4 PSGaussianBlur(VertDataOut vtx) : TARGET {
float2 uvOffset = float2(0, 0); float2 uvOffset = float2(0, 0);
float4 final = u_image.SampleLevel(pointSampler, vtx.uv, 0) float4 final = u_image.SampleLevel(pointSampler, vtx.uv, 0)
* kernel.SampleLevel(pointSampler, (float2(0, u_radius - 1) * kernelTexel), 0).r; * GetKernelAt(0);
for (int k = 1; k <= u_radius; k++) { for (int k = 1; k <= u_radius; k++) {
uvOffset += u_texelDelta; uvOffset += u_texelDelta;
float l_g = kernel.SampleLevel(pointSampler, (float2(k, u_radius - 1) * kernelTexel), 0).r; float l_g = GetKernelAt(k);
float4 l_p = u_image.SampleLevel(pointSampler, vtx.uv + uvOffset, 0); float4 l_p = u_image.SampleLevel(pointSampler, vtx.uv + uvOffset, 0);
float4 l_n = u_image.SampleLevel(pointSampler, vtx.uv - uvOffset, 0); float4 l_n = u_image.SampleLevel(pointSampler, vtx.uv - uvOffset, 0);
final += (l_p + l_n) * l_g; final += (l_p + l_n) * l_g;
@ -182,13 +184,13 @@ float4 PSGaussianLinearBlur(VertDataOut vtx) : TARGET {
// Total Samples: 3 (n+1) // Total Samples: 3 (n+1)
float4 origin = u_image.SampleLevel(pointSampler, vtx.uv, 0); float4 origin = u_image.SampleLevel(pointSampler, vtx.uv, 0);
float4 final = origin * kernel.SampleLevel(pointSampler, (float2(0, u_radius - 1) * kernelTexel), 0).r; float4 final = origin * GetKernelAt(0);
float2 halfTexelDelta = u_texelDelta / 2.0; float2 halfTexelDelta = u_texelDelta / 2.0;
for (int k = 1; k < u_radius; k+=2) { for (int k = 1; k < u_radius; k+=2) {
float2 offset = k * u_texelDelta + halfTexelDelta; float2 offset = k * u_texelDelta + halfTexelDelta;
float l_g0 = kernel.SampleLevel(pointSampler, (float2(k, u_radius - 1) * kernelTexel), 0).r; float l_g0 = GetKernelAt(k);
float l_g1 = kernel.SampleLevel(pointSampler, (float2(k + 1, u_radius - 1) * kernelTexel), 0).r; float l_g1 = GetKernelAt(k +1);
float4 l_p = u_image.SampleLevel(linearSampler, vtx.uv + offset, 0); float4 l_p = u_image.SampleLevel(linearSampler, vtx.uv + offset, 0);
float4 l_n = u_image.SampleLevel(linearSampler, vtx.uv - offset, 0); float4 l_n = u_image.SampleLevel(linearSampler, vtx.uv - offset, 0);
final += (l_p + l_n) * l_g0; final += (l_p + l_n) * l_g0;
@ -199,7 +201,7 @@ float4 PSGaussianLinearBlur(VertDataOut vtx) : TARGET {
// Odd numbers require treatment of ends. // Odd numbers require treatment of ends.
float4 left = u_image.SampleLevel(pointSampler, vtx.uv + u_texelDelta * u_radius, 0); float4 left = u_image.SampleLevel(pointSampler, vtx.uv + u_texelDelta * u_radius, 0);
float4 right = u_image.SampleLevel(pointSampler, vtx.uv - u_texelDelta * u_radius, 0); float4 right = u_image.SampleLevel(pointSampler, vtx.uv - u_texelDelta * u_radius, 0);
float krn = kernel.SampleLevel(pointSampler, (float2(u_radius, u_radius - 1) * kernelTexel), 0).r; float krn = GetKernelAt(u_radius);
final += (left + right) * krn; final += (left + right) * krn;
} }

View file

@ -120,18 +120,12 @@ bool filter::blur::blur_instance::apply_bilateral_param()
return true; return true;
} }
bool filter::blur::blur_instance::apply_gaussian_param() bool filter::blur::blur_instance::apply_gaussian_param(uint8_t width)
{ {
std::shared_ptr<gs::texture> kernel = filter::blur::blur_factory::get()->get_kernel(filter::blur::type::Gaussian); auto kernel = filter::blur::blur_factory::get()->get_gaussian_kernel(width);
if (blur_effect->has_parameter("kernel")) { if (blur_effect->has_parameter("kernel")) {
blur_effect->get_parameter("kernel").set_texture(kernel); blur_effect->get_parameter("kernel").set_float_array(&(kernel->front()), kernel->size());
}
if (blur_effect->has_parameter("kernelTexel")) {
float_t wb = 1.0f / kernel->get_width();
float_t hb = 1.0f / kernel->get_height();
blur_effect->get_parameter("kernelTexel").set_float2(wb, hb);
} }
return true; return true;
@ -604,7 +598,7 @@ void filter::blur::blur_instance::video_render(gs_effect_t* effect)
} }
#pragma endregion RGB->YUV #pragma endregion RGB->YUV
#pragma region blur #pragma region Blur
// Set up camera stuff // Set up camera stuff
gs_set_cull_mode(GS_NEITHER); gs_set_cull_mode(GS_NEITHER);
gs_reset_blend_state(); gs_reset_blend_state();
@ -630,7 +624,7 @@ void filter::blur::blur_instance::video_render(gs_effect_t* effect)
if (!apply_shared_param(intermediate, xpel, ypel)) if (!apply_shared_param(intermediate, xpel, ypel))
break; break;
apply_gaussian_param(); apply_gaussian_param(this->size);
apply_bilateral_param(); apply_bilateral_param();
gs_texrender_reset(rt); gs_texrender_reset(rt);
@ -853,11 +847,13 @@ void filter::blur::blur_factory::generate_gaussian_kernels()
// 2D texture, horizontal is value, vertical is kernel size. // 2D texture, horizontal is value, vertical is kernel size.
size_t size_power_of_two = size_t(pow(2, util::math::get_power_of_two_exponent_ceil(max_kernel_size))); size_t size_power_of_two = size_t(pow(2, util::math::get_power_of_two_exponent_ceil(max_kernel_size)));
std::vector<float_t> texture_Data(size_power_of_two * size_power_of_two); std::vector<float_t> texture_data(size_power_of_two * size_power_of_two);
std::vector<float_t> math_data(size_power_of_two); std::vector<float_t> math_data(size_power_of_two);
std::shared_ptr<std::vector<float_t>> kernel_data;
for (size_t width = 1; width <= max_kernel_size; width++) { for (size_t width = 1; width <= max_kernel_size; width++) {
size_t v = (width - 1) * size_power_of_two; size_t v = (width - 1) * size_power_of_two;
kernel_data = std::make_shared<std::vector<float_t>>(size_power_of_two);
// Calculate and normalize // Calculate and normalize
float_t sum = 0; float_t sum = 0;
@ -869,13 +865,16 @@ void filter::blur::blur_factory::generate_gaussian_kernels()
// Normalize to Texture Buffer // Normalize to Texture Buffer
double_t inverse_sum = 1.0 / sum; double_t inverse_sum = 1.0 / sum;
for (size_t p = 0; p <= width; p++) { for (size_t p = 0; p <= width; p++) {
texture_Data[v + p] = float_t(math_data[p] * inverse_sum); texture_data[v + p] = float_t(math_data[p] * inverse_sum);
kernel_data->at(p) = texture_data[v + p];
} }
gaussian_kernels.insert({uint8_t(width), kernel_data});
} }
// Create Texture // Create Texture
try { try {
auto texture_buffer = reinterpret_cast<uint8_t*>(texture_Data.data()); auto texture_buffer = reinterpret_cast<uint8_t*>(texture_data.data());
auto unsafe_buffer = const_cast<const uint8_t**>(&texture_buffer); auto unsafe_buffer = const_cast<const uint8_t**>(&texture_buffer);
kernels.insert_or_assign(filter::blur::type::Gaussian, kernels.insert_or_assign(filter::blur::type::Gaussian,
@ -1046,6 +1045,11 @@ std::shared_ptr<gs::texture> filter::blur::blur_factory::get_kernel(filter::blur
return kernels.at(type); return kernels.at(type);
} }
// Fetches the Gaussian kernel weights stored under the given size key.
// The lookup goes through std::map::at(), so a size that has no entry in
// gaussian_kernels raises std::out_of_range rather than inserting one.
std::shared_ptr<std::vector<float_t>> filter::blur::blur_factory::get_gaussian_kernel(uint8_t size)
{
	const auto& stored_kernel = gaussian_kernels.at(size);
	return stored_kernel;
}
obs_scene_t* filter::blur::blur_factory::get_scene(std::string name) obs_scene_t* filter::blur::blur_factory::get_scene(std::string name)
{ {
auto kv = scenes.find(name); auto kv = scenes.find(name);

View file

@ -111,7 +111,7 @@ namespace filter {
bool apply_shared_param(gs_texture_t* input, float texelX, float texelY); bool apply_shared_param(gs_texture_t* input, float texelX, float texelY);
bool apply_bilateral_param(); bool apply_bilateral_param();
bool apply_gaussian_param(); bool apply_gaussian_param(uint8_t width);
bool apply_mask_parameters(std::shared_ptr<gs::effect> effect, gs_texture_t* original_texture, bool apply_mask_parameters(std::shared_ptr<gs::effect> effect, gs_texture_t* original_texture,
gs_texture_t* blurred_texture); gs_texture_t* blurred_texture);
@ -143,6 +143,7 @@ namespace filter {
std::shared_ptr<gs::effect> blur_effect; std::shared_ptr<gs::effect> blur_effect;
std::map<filter::blur::type, std::shared_ptr<gs::texture>> kernels; std::map<filter::blur::type, std::shared_ptr<gs::texture>> kernels;
std::map<uint8_t, std::shared_ptr<std::vector<float_t>>> gaussian_kernels;
std::map<std::string, obs_scene_t*> scenes; std::map<std::string, obs_scene_t*> scenes;
@ -188,6 +189,8 @@ namespace filter {
std::shared_ptr<gs::texture> get_kernel(filter::blur::type type); std::shared_ptr<gs::texture> get_kernel(filter::blur::type type);
std::shared_ptr<std::vector<float_t>> get_gaussian_kernel(uint8_t size);
obs_scene_t* get_scene(std::string name); obs_scene_t* get_scene(std::string name);
void enum_scenes(std::function<bool(obs_scene_t*)> fnc); void enum_scenes(std::function<bool(obs_scene_t*)> fnc);