From: Christian Lindig Subject: Simplify UTF-8 decoding * Use the decoder from the OCaml standard library instead of our own implementation, which this patch removes. * Validate UTF-8/XML conformance for maps and sets, in addition to strings. This is XSA-474 / CVE-2025-58146. Signed-off-by: Christian Lindig Reviewed-by: Edwin Török diff --git a/ocaml/database/db_cache_impl.ml b/ocaml/database/db_cache_impl.ml index e9745749a..050d43f05 100644 --- a/ocaml/database/db_cache_impl.ml +++ b/ocaml/database/db_cache_impl.ml @@ -67,9 +67,7 @@ let read_field t tblname fldname objref = occurs. *) let ensure_utf8_xml string = let length = String.length string in - let prefix = - Xapi_stdext_encodings.Encodings.UTF8_XML.longest_valid_prefix string - in + let prefix = Xapi_stdext_encodings.Utf8.XML.longest_valid_prefix string in if length > String.length prefix then warn "string truncated to: '%s'." prefix ; prefix @@ -86,20 +84,32 @@ let write_field_locked t tblname objref fldname newval = (get_database t) ) +(** Ensure a value is conforming to UTF-8 with XML restrictions *) +let is_valid v = + let valid = Xapi_stdext_encodings.Utf8.XML.is_valid in + let valid_pair (x, y) = valid x && valid y in + match v with + | Schema.Value.String s -> + valid s + | Schema.Value.Set ss -> + List.for_all valid ss + | Schema.Value.Pairs pairs -> + List.for_all valid_pair pairs + +let share_string = function + | Schema.Value.String s -> + Schema.Value.String (Share.merge s) + | v -> + (* we assume strings in the tree have been shared already *) + v + let write_field t tblname objref fldname newval = - let newval = - match newval with - | Schema.Value.String s -> - (* the other caller of write_field_locked only uses sets and maps, - so we only need to check for String here - *) - if not (Xapi_stdext_encodings.Encodings.UTF8_XML.is_valid s) then - raise Invalid_value ; - Schema.Value.String (Share.merge s) - | _ -> - newval - in - with_lock (fun () -> write_field_locked t tblname objref fldname newval) + if not @@ is_valid newval then + raise Invalid_value + else + with_lock (fun () -> + write_field_locked t tblname objref fldname (share_string newval) + ) let touch_row t tblname objref = update_database t (touch tblname objref) ; diff --git a/ocaml/database/string_marshall_helper.ml b/ocaml/database/string_marshall_helper.ml index ba003bee9..1add3aef7 100644 --- a/ocaml/database/string_marshall_helper.ml +++ b/ocaml/database/string_marshall_helper.ml @@ -22,9 +22,7 @@ module D = Debug.Make (struct let name = __MODULE__ end) let ensure_utf8_xml string = let length = String.length string in - let prefix = - Xapi_stdext_encodings.Encodings.UTF8_XML.longest_valid_prefix string - in + let prefix = Xapi_stdext_encodings.Utf8.XML.longest_valid_prefix string in if length > String.length prefix then D.warn "Whilst doing 'set' of structured field, string truncated to: '%s'." prefix ; diff --git a/ocaml/idl/ocaml_backend/gen_server.ml b/ocaml/idl/ocaml_backend/gen_server.ml index 191450212..f95f5f6d9 100644 --- a/ocaml/idl/ocaml_backend/gen_server.ml +++ b/ocaml/idl/ocaml_backend/gen_server.ml @@ -457,7 +457,7 @@ let gen_module api : O.Module.t = ([ "let __call, __params = call.Rpc.name, call.Rpc.params in" ; "List.iter (fun p -> let s = Rpc.to_string p in if not \ - (Xapi_stdext_encodings.Encodings.UTF8_XML.is_valid s) then" + (Xapi_stdext_encodings.Utf8.is_valid s) then" ; "raise (Api_errors.Server_error(Api_errors.invalid_value, \ [\"Invalid UTF-8 string in parameter\"; s]))) __params;" ; "let __label = __call in" diff --git a/ocaml/libs/xapi-stdext/lib/xapi-stdext-encodings/bench/bench_encodings.ml b/ocaml/libs/xapi-stdext/lib/xapi-stdext-encodings/bench/bench_encodings.ml index 7308c756d..bb20eed4f 100644 --- a/ocaml/libs/xapi-stdext/lib/xapi-stdext-encodings/bench/bench_encodings.ml +++ b/ocaml/libs/xapi-stdext/lib/xapi-stdext-encodings/bench/bench_encodings.ml @@ -1,5 +1,5 @@ open Bechamel -open Xapi_stdext_encodings.Encodings +open Xapi_stdext_encodings let test name f = Test.make_indexed_with_resource ~name ~args:[10; 1000; 10000] @@ -10,6 +10,6 @@ let test name f = let benchmarks = Test.make_grouped ~name:"Encodings.validate" - [test "UTF8_XML" UTF8_XML.validate] + [test "UTF8.XML" Utf8.XML.is_valid] let () = Bechamel_simple_cli.cli benchmarks diff --git a/ocaml/libs/xapi-stdext/lib/xapi-stdext-encodings/dune b/ocaml/libs/xapi-stdext/lib/xapi-stdext-encodings/dune index 742dd212f..839346e35 100644 --- a/ocaml/libs/xapi-stdext/lib/xapi-stdext-encodings/dune +++ b/ocaml/libs/xapi-stdext/lib/xapi-stdext-encodings/dune @@ -1,12 +1,6 @@ (library (name xapi_stdext_encodings) (public_name xapi-stdext-encodings) - (modules :standard \ test) + (modules :standard) ) -(test - (name test) - (package xapi-stdext-encodings) - (modules test) - (libraries alcotest xapi-stdext-encodings) -) diff --git a/ocaml/libs/xapi-stdext/lib/xapi-stdext-encodings/encodings.ml b/ocaml/libs/xapi-stdext/lib/xapi-stdext-encodings/encodings.ml deleted file mode 100644 index 2dfd45a7d..000000000 --- a/ocaml/libs/xapi-stdext/lib/xapi-stdext-encodings/encodings.ml +++ /dev/null @@ -1,167 +0,0 @@ -(* - * Copyright (C) 2006-2009 Citrix Systems Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; version 2.1 only. with the special - * exception on linking described in file LICENSE. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - *) -exception UCS_value_out_of_range - -exception UCS_value_prohibited_in_UTF8 - -exception UCS_value_prohibited_in_XML - -exception UTF8_character_incomplete - -exception UTF8_header_byte_invalid - -exception UTF8_continuation_byte_invalid - -exception UTF8_encoding_not_canonical - -exception String_incomplete - -(* === Unicode Functions === *) - -module UCS = struct - let is_non_character value = - false - || (0xfdd0 <= value && value <= 0xfdef) (* case 1 *) - || Int.logand 0xfffe value = 0xfffe - (* case 2 *) - [@@inline] -end - -module XML = struct - let is_illegal_control_character value = - let value = Uchar.to_int value in - value < 0x20 && value <> 0x09 && value <> 0x0a && value <> 0x0d - [@@inline] -end - -(* === UCS Validators === *) - -module type UCS_VALIDATOR = sig - val validate : Uchar.t -> unit -end - -module UTF8_UCS_validator = struct - let validate value = - if (UCS.is_non_character [@inlined]) (Uchar.to_int value) then - raise UCS_value_prohibited_in_UTF8 - [@@inline] -end - -module XML_UTF8_UCS_validator = struct - let validate value = - (UTF8_UCS_validator.validate [@inlined]) value ; - if (XML.is_illegal_control_character [@inlined]) value then - raise UCS_value_prohibited_in_XML -end - -(* === String Validators === *) - -module type STRING_VALIDATOR = sig - val is_valid : string -> bool - - val validate : string -> unit - - val longest_valid_prefix : string -> string -end - -exception Validation_error of int * exn - -module UTF8_XML : STRING_VALIDATOR = struct - let decode_continuation_byte byte = - if byte land 0b11000000 = 0b10000000 then - byte land 0b00111111 - else - raise UTF8_continuation_byte_invalid - - let rec decode_continuation_bytes string last value index = - if index <= last then - let chunk = decode_continuation_byte (Char.code string.[index]) in - let value = (value lsl 6) lor chunk in - decode_continuation_bytes string last value (index + 1) - else - value - - let validate_character_utf8 string byte index = - let value, width = - if byte land 0b10000000 = 0b00000000 then - (byte, 1) - else if byte land 0b11100000 = 0b11000000 then - (byte land 0b0011111, 2) - else if byte land 0b11110000 = 0b11100000 then - (byte land 0b0001111, 3) - else if byte land 0b11111000 = 0b11110000 then - (byte land 0b0000111, 4) - else - raise UTF8_header_byte_invalid - in - let value = - if width = 1 then - value - else - decode_continuation_bytes string (index + width - 1) value (index + 1) - in - XML_UTF8_UCS_validator.validate (Uchar.unsafe_of_int value) ; - width - - let rec validate_aux string length index = - if index = length then - () - else - let width = - try - let byte = string.[index] |> Char.code in - validate_character_utf8 string byte index - with - | Invalid_argument _ -> - raise String_incomplete - | error -> - raise (Validation_error (index, error)) - in - validate_aux string length (index + width) - - let validate string = validate_aux string (String.length string) 0 - - let rec validate_with_fastpath string stop pos = - if pos < stop then - (* the compiler is smart enough to optimize the 'int32' away here, - and not allocate *) - let i32 = String.get_int32_ne string pos |> Int32.to_int in - (* test that for all bytes 0x20 <= byte < 0x80. - If any is <0x20 it would cause a negative value to appear in that byte, - which we can detect if we use 0x80 as a mask. - Byte >= 0x80 can be similarly detected with a mask of 0x80 on each byte. - We don't want to see a 0x80 from either of these, hence we bitwise or the 2 values together. - *) - if i32 lor (i32 - 0x20_20_20_20) land 0x80_80_80_80 = 0 then - validate_with_fastpath string stop (pos + 4) - else (* when the condition doesn't hold fall back to full UTF8 decoder *) - validate_aux string (String.length string) pos - else - validate_aux string (String.length string) pos - - let validate_with_fastpath string = - validate_with_fastpath string (String.length string - 3) 0 - - let validate = - if Sys.word_size = 64 then - validate_with_fastpath - else - validate - - let is_valid string = try validate string ; true with _ -> false - - let longest_valid_prefix string = - try validate string ; string - with Validation_error (index, _) -> String.sub string 0 index -end diff --git a/ocaml/libs/xapi-stdext/lib/xapi-stdext-encodings/encodings.mli b/ocaml/libs/xapi-stdext/lib/xapi-stdext-encodings/encodings.mli deleted file mode 100644 index 2a139ae37..000000000 --- a/ocaml/libs/xapi-stdext/lib/xapi-stdext-encodings/encodings.mli +++ /dev/null @@ -1,84 +0,0 @@ -(* - * Copyright (C) 2006-2009 Citrix Systems Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; version 2.1 only. with the special - * exception on linking described in file LICENSE. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - *) - -(** Encoding helper modules *) - -(** {2 Exceptions} *) - -exception UCS_value_out_of_range - -exception UCS_value_prohibited_in_UTF8 - -exception UCS_value_prohibited_in_XML - -exception UTF8_character_incomplete - -exception UTF8_header_byte_invalid - -exception UTF8_continuation_byte_invalid - -exception UTF8_encoding_not_canonical - -exception String_incomplete - -(** {2 UCS Validators} *) - -(** Validates UCS character values. *) -module type UCS_VALIDATOR = sig - val validate : Uchar.t -> unit -end - -(** Accepts all values within the UCS character value range except - * those which are invalid for all UTF-8-encoded XML documents. *) -module XML_UTF8_UCS_validator : UCS_VALIDATOR - -module XML : sig - val is_illegal_control_character : Uchar.t -> bool - (** Returns true if and only if the given value corresponds to - * a illegal control character as defined in section 2.2 of - * the XML specification, version 1.0. *) -end - -(** {2 String Validators} *) - -(** Provides functionality for validating and processing - * strings according to a particular character encoding. *) -module type STRING_VALIDATOR = sig - val is_valid : string -> bool - (** Returns true if and only if the given string is validly-encoded. *) - - val validate : string -> unit - (** Raises an encoding error if the given string is not validly-encoded. *) - - val longest_valid_prefix : string -> string - (** Returns the longest validly-encoded prefix of the given string. *) -end - -(** Represents a validation error as a tuple [(i,e)], where: - * [i] = the index of the first non-compliant character; - * [e] = the reason for non-compliance. *) -exception Validation_error of int * exn - -(** Provides functions for validating and processing - * strings according to the UTF-8 character encoding, - * with certain additional restrictions on UCS values - * imposed by the XML specification. - * - * Validly-encoded strings must satisfy both RFC 3629 - * and section 2.2 of the XML specification. - * - * For further information, see: - * http://www.rfc.net/rfc3629.html - * http://www.w3.org/TR/REC-xml/#charsets *) -module UTF8_XML : STRING_VALIDATOR diff --git a/ocaml/libs/xapi-stdext/lib/xapi-stdext-encodings/test.ml b/ocaml/libs/xapi-stdext/lib/xapi-stdext-encodings/test.ml deleted file mode 100644 index 9cc75b297..000000000 --- a/ocaml/libs/xapi-stdext/lib/xapi-stdext-encodings/test.ml +++ /dev/null @@ -1,533 +0,0 @@ -(* - * Copyright (C) 2006-2009 Citrix Systems Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; version 2.1 only. with the special - * exception on linking described in file LICENSE. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - *) -module E = Xapi_stdext_encodings.Encodings - -(* Pull in the infix operators from Encodings used in this test *) -let ( --- ), ( +++ ), ( <<< ) = (Int.sub, Int.add, Int.shift_left) - -(* === Mock exceptions ==================================================== *) - -(** Simulates a decoding error. *) -exception Decode_error - -(* === Mock UCS validators ================================================= *) - -(** A validator that always succeeds. *) -module Lenient_UCS_validator : E.UCS_VALIDATOR = struct - let validate _ = () -end - -(* === Mock character validators ============================================= *) - -(** A validator that succeeds for all characters. *) -module Universal_character_validator = struct - let validate _ = () -end - -(** A validator that fails for all characters. *) -module Failing_character_validator = struct - let validate _ = raise Decode_error -end - -(** A validator that succeeds for all characters except the letter 'F'. *) -module Selective_character_validator = struct - let validate uchar = - if Uchar.equal uchar (Uchar.of_char 'F') then raise Decode_error -end - -(* === Test helpers ======================================================== *) - -let assert_true = Alcotest.(check bool) "true" true - -let assert_false = Alcotest.(check bool) "false" false - -let assert_raises_match exception_match fn = - try - fn () ; - Alcotest.fail "assert_raises_match: failure expected" - with failure -> - if not (exception_match failure) then - raise failure - else - () - -(* === Mock codecs ========================================================= *) - -module UCS = struct - (* === Unicode Functions === *) - let min_value = 0x000000 - - let max_value = 0x10ffff - (* used to be 0x1fffff, but this changed and Unicode won't allocate larger than 0x10ffff *) - - let is_non_character value = - false - || (0xfdd0 <= value && value <= 0xfdef) (* case 1 *) - || Int.logand 0xfffe value = 0xfffe - (* case 2 *) - - let is_out_of_range value = value < min_value || value > max_value - - let is_surrogate value = 0xd800 <= value && value <= 0xdfff - - (** A list of UCS non-characters values, including: - a. non-characters within the basic multilingual plane; - b. non-characters at the end of the basic multilingual plane; - c. non-characters at the end of the private use area. *) - let non_characters = - [ - 0x00fdd0 - ; 0x00fdef - ; (* case a. *) - 0x00fffe - ; 0x00ffff - ; (* case b. *) - 0x1ffffe - ; 0x1fffff (* case c. *) - ] - - (** A list of UCS character values located immediately before or - after UCS non-character values, including: - a. non-characters within the basic multilingual plane; - b. non-characters at the end of the basic multilingual plane; - c. non-characters at the end of the private use area. *) - let valid_characters_next_to_non_characters = - [ - 0x00fdcf - ; 0x00fdf0 - ; (* case a. *) - 0x00fffd - ; 0x010000 - ; (* case b. *) - 0x1ffffd - ; 0x200000 (* case c. *) - ] - - let test_is_non_character () = - List.iter (fun value -> assert_true (is_non_character value)) non_characters ; - List.iter - (fun value -> assert_false (is_non_character value)) - valid_characters_next_to_non_characters - - let test_is_out_of_range () = - assert_true (is_out_of_range (min_value --- 1)) ; - assert_false (is_out_of_range min_value) ; - assert_false (is_out_of_range max_value) ; - assert_true (is_out_of_range (max_value +++ 1)) - - let test_is_surrogate () = - assert_false (is_surrogate 0xd7ff) ; - assert_true (is_surrogate 0xd800) ; - assert_true (is_surrogate 0xdfff) ; - assert_false (is_surrogate 0xe000) - - let tests = - [ - ("test_is_non_character", `Quick, test_is_non_character) - ; ("test_is_out_of_range", `Quick, test_is_out_of_range) - ; ("test_is_surrogate", `Quick, test_is_surrogate) - ] -end - -module Lenient_UTF8_codec = struct - let decode_header_byte byte = - if byte land 0b10000000 = 0b00000000 then - (byte, 1) - else if byte land 0b11100000 = 0b11000000 then - (byte land 0b0011111, 2) - else if byte land 0b11110000 = 0b11100000 then - (byte land 0b0001111, 3) - else if byte land 0b11111000 = 0b11110000 then - (byte land 0b0000111, 4) - else - raise E.UTF8_header_byte_invalid - - let decode_continuation_byte byte = - if byte land 0b11000000 = 0b10000000 then - byte land 0b00111111 - else - raise E.UTF8_continuation_byte_invalid - - let width_required_for_ucs_value value = - if value < 0x000080 (* 1 lsl 7 *) then - 1 - else if value < 0x000800 (* 1 lsl 11 *) then - 2 - else if value < 0x010000 (* 1 lsl 16 *) then - 3 - else - 4 - - let decode_character string index = - let value, width = decode_header_byte (Char.code string.[index]) in - let value = - if width = 1 then - value - else - let value = ref value in - for index = index + 1 to index + width - 1 do - let chunk = decode_continuation_byte (Char.code string.[index]) in - value := (!value lsl 6) lor chunk - done ; - if width > width_required_for_ucs_value !value then - raise E.UTF8_encoding_not_canonical ; - !value - in - (value, width) -end - -(* === Mock string validators ============================================== *) -module Mock_String_validator (Validator : E.UCS_VALIDATOR) : - E.STRING_VALIDATOR = struct - (* no longer a functor in Encodings for performance reasons, - so modify the original string passed as argument instead replacing - characters that would be invalid with a known invalid XML char: 0x0B. - *) - - let transform str = - let b = Buffer.create (String.length str) in - let rec loop pos = - if pos < String.length str then - let value, width = Lenient_UTF8_codec.decode_character str pos in - let () = - try - let u = Uchar.of_int value in - Validator.validate u ; Buffer.add_utf_8_uchar b u - with _ -> Buffer.add_char b '\x0B' - in - loop (pos + width) - in - loop 0 ; Buffer.contents b - - let is_valid str = E.UTF8_XML.is_valid (transform str) - - let validate str = - try E.UTF8_XML.validate (transform str) - with E.Validation_error (pos, _) -> - raise (E.Validation_error (pos, Decode_error)) - - let longest_valid_prefix str = E.UTF8_XML.longest_valid_prefix (transform str) -end - -(** A validator that accepts all strings. *) -module Universal_string_validator = - Mock_String_validator (Universal_character_validator) - -(** A validator that rejects all strings. *) -module Failing_string_validator = - Mock_String_validator (Failing_character_validator) - -(** A validator that rejects strings containing the character 'F'. *) -module Selective_string_validator = - Mock_String_validator (Selective_character_validator) - -(* === Tests =============================================================== *) - -module String_validator = struct - let test_is_valid () = - assert_true (Universal_string_validator.is_valid "") ; - assert_true (Universal_string_validator.is_valid "123456789") ; - assert_true (Selective_string_validator.is_valid "") ; - assert_true (Selective_string_validator.is_valid "123456789") ; - assert_false (Selective_string_validator.is_valid "F23456789") ; - assert_false (Selective_string_validator.is_valid "1234F6789") ; - assert_false (Selective_string_validator.is_valid "12345678F") ; - assert_false (Selective_string_validator.is_valid "FFFFFFFFF") - - let test_longest_valid_prefix () = - Alcotest.(check string) - "prefix" - (Universal_string_validator.longest_valid_prefix "") - "" ; - Alcotest.(check string) - "prefix" - (Universal_string_validator.longest_valid_prefix "123456789") - "123456789" ; - Alcotest.(check string) - "prefix" - (Selective_string_validator.longest_valid_prefix "") - "" ; - Alcotest.(check string) - "prefix" - (Selective_string_validator.longest_valid_prefix "123456789") - "123456789" ; - Alcotest.(check string) - "prefix" - (Selective_string_validator.longest_valid_prefix "F23456789") - "" ; - Alcotest.(check string) - "prefix" - (Selective_string_validator.longest_valid_prefix "1234F6789") - "1234" ; - Alcotest.(check string) - "prefix" - (Selective_string_validator.longest_valid_prefix "12345678F") - "12345678" ; - Alcotest.(check string) - "prefix" - (Selective_string_validator.longest_valid_prefix "FFFFFFFFF") - "" - - (** Tests that validation does not fail for an empty string. *) - let test_validate_with_empty_string () = E.UTF8_XML.validate "" - - let test_validate_with_incomplete_string () = - Alcotest.check_raises "Validation fails correctly for an incomplete string" - E.String_incomplete (fun () -> E.UTF8_XML.validate "\xc2" - ) - - let test_validate_with_failing_decoders () = - Failing_string_validator.validate "" ; - assert_raises_match - (function E.Validation_error (0, Decode_error) -> true | _ -> false) - (fun () -> Selective_string_validator.validate "F") ; - assert_raises_match - (function E.Validation_error (0, Decode_error) -> true | _ -> false) - (fun () -> Selective_string_validator.validate "F12345678") ; - assert_raises_match - (function E.Validation_error (4, Decode_error) -> true | _ -> false) - (fun () -> Selective_string_validator.validate "0123F5678") ; - assert_raises_match - (function E.Validation_error (8, Decode_error) -> true | _ -> false) - (fun () -> Selective_string_validator.validate "01234567F") ; - assert_raises_match - (function E.Validation_error (0, Decode_error) -> true | _ -> false) - (fun () -> Selective_string_validator.validate "FFFFFFFFF") - - let tests = - [ - ("test_is_valid", `Quick, test_is_valid) - ; ("test_longest_valid_prefix", `Quick, test_longest_valid_prefix) - ; ( "test_validate_with_empty_string" - , `Quick - , test_validate_with_empty_string - ) - ; ( "test_validate_with_incomplete_string" - , `Quick - , test_validate_with_incomplete_string - ) - ; ( "test_validate_with_failing_decoders" - , `Quick - , test_validate_with_failing_decoders - ) - ] -end - -module XML = struct - include E.XML - - let test_is_illegal_control_character () = - assert_true (is_illegal_control_character (Uchar.of_int 0x00)) ; - assert_true (is_illegal_control_character (Uchar.of_int 0x19)) ; - assert_false (is_illegal_control_character (Uchar.of_int 0x09)) ; - assert_false (is_illegal_control_character (Uchar.of_int 0x0a)) ; - assert_false (is_illegal_control_character (Uchar.of_int 0x0d)) ; - assert_false (is_illegal_control_character (Uchar.of_int 0x20)) - - let tests = - [ - ( "test_is_illegal_control_character" - , `Quick - , test_is_illegal_control_character - ) - ] -end - -(** Tests the XML-specific UTF-8 UCS validation function. *) -module XML_UTF8_UCS_validator = struct - include E.XML_UTF8_UCS_validator - - let validate uchar = - if Uchar.is_valid uchar then - validate @@ Uchar.of_int uchar - else if uchar < Uchar.to_int Uchar.min || uchar > Uchar.to_int Uchar.max - then - raise E.UCS_value_out_of_range - else - raise E.UCS_value_prohibited_in_UTF8 - - let test_validate () = - let value = ref (UCS.min_value --- 1) in - while !value <= UCS.max_value +++ 1 do - if UCS.is_out_of_range !value then - Alcotest.check_raises "should fail" E.UCS_value_out_of_range (fun () -> - validate !value - ) - else if UCS.is_non_character !value || UCS.is_surrogate !value then - Alcotest.check_raises "should fail" E.UCS_value_prohibited_in_UTF8 - (fun () -> validate !value - ) - else if - Uchar.is_valid !value - && XML.is_illegal_control_character (Uchar.of_int !value) - then - Alcotest.check_raises "should fail" E.UCS_value_prohibited_in_XML - (fun () -> validate !value - ) - else - validate !value ; - value := !value +++ 1 - done - - let tests = [("test_validate", `Quick, test_validate)] -end - -module UTF8_codec = struct - (** A list of canonical encoding widths of UCS values, - represented by tuples of the form (v, w), where: - v = the UCS character value to be encoded; and - w = the width of the encoded character, in bytes. *) - let valid_ucs_value_widths = - [ - (1, 1) - ; ((1 <<< 7) --- 1, 1) - ; (1 <<< 7, 2) - ; ((1 <<< 11) --- 1, 2) - ; (1 <<< 11, 3) - ; ((1 <<< 16) --- 1, 3) - ; (1 <<< 16, 4) - ; ((1 <<< 21) --- 1, 4) - ] - - let width_required_for_ucs_value value = - if value < 0x000080 (* 1 lsl 7 *) then - 1 - else if value < 0x000800 (* 1 lsl 11 *) then - 2 - else if value < 0x010000 (* 1 lsl 16 *) then - 3 - else - 4 - - let test_width_required_for_ucs_value () = - List.iter - (fun (value, width) -> - Alcotest.(check int) - "same ints" - (width_required_for_ucs_value value) - width - ) - valid_ucs_value_widths - - (** A list of valid character decodings represented by - tuples of the form (s, (v, w)), where: - - s = a validly-encoded UTF-8 string; - v = the UCS value represented by the string; - (which may or may not be valid in its own right) - w = the width of the encoded string, in bytes. - - For each byte length b in [1...4], the list contains - decodings for: - - v_min = the smallest UCS value encodable in b bytes. - v_max = the greatest UCS value encodable in b bytes. *) - let valid_character_decodings = - [ - (* 7654321 *) - (* 0b0xxxxxxx *) - (* 00000000000000xxxxxxx *) - ( "\x00" (* 0b00000000 *) - , (0b000000000000000000000, 1) - ) - ; ( "\x7f" (* 0b01111111 *) - , (0b000000000000001111111, 1) - ) - ; (* 10987654321 *) - (* 0b110xxxsx 0b10xxxxxx *) - (* 0000000000xxxsxxxxxxx *) - ( "\xc2\x80" (* 0b11000010 0b10000000 *) - , (0b000000000000010000000, 2) - ) - ; ( "\xdf\xbf" (* 0b11011111 0b10111111 *) - , (0b000000000011111111111, 2) - ) - ; (* 6543210987654321 *) - (* 0b1110xxxx 0b10sxxxxx 0b10xxxxxx *) - (* xxxxsxxxxxxxxxxx *) - ( "\xe0\xa0\x80" (* 0b11100000 0b10100000 0b10000000 *) - , (0b000000000100000000000, 3) - ) - ; ( "\xef\xbf\xbf" (* 0b11101111 0b10111111 0b10111111 *) - , (0b000001111111111111111, 3) - ) - ; (* 109876543210987654321 *) - (* 0b11110xxx 0b10xsxxxx 0b10xxxxxx 0b10xxxxxx *) - (* xxxxsxxxxxxxxxxxxxxxx *) - ( "\xf0\x90\x80\x80" (* 0b11110000 0b10010000 0b10000000 0b10000000 *) - , (0b000010000000000000000, 4) - ) - ; ( "\xf7\xbf\xbf\xbf" (* 0b11110111 0b10111111 0b10111111 0b10111111 *) - , (0b111111111111111111111, 4) - ) - ] - - let uchar = Alcotest.int - - let test_decode_character_when_valid () = - List.iter - (fun (string, (value, width)) -> - Alcotest.(check (pair uchar int)) - "same pair" - (Lenient_UTF8_codec.decode_character string 0) - (value, width) - ) - valid_character_decodings - - (** A list of strings containing overlong character encodings. - For each byte length b in [2...4], this list contains the - overlong encoding e (v), where v is the UCS value one less - than the smallest UCS value validly-encodable in b bytes. *) - let overlong_character_encodings = - [ - "\xc1\xbf" (* 0b11000001 0b10111111 *) - ; "\xe0\x9f\xbf" (* 0b11100000 0b10011111 0b10111111 *) - ; "\xf0\x8f\xbf\xbf" (* 0b11110000 0b10001111 0b10111111 0b10111111 *) - ] - - let test_decode_character_when_overlong () = - List.iter - (fun string -> - Alcotest.check_raises "should fail" E.UTF8_encoding_not_canonical - (fun () -> Lenient_UTF8_codec.decode_character string 0 |> ignore - ) - ) - overlong_character_encodings - - let tests = - [ - ( "test_width_required_for_ucs_value" - , `Quick - , test_width_required_for_ucs_value - ) - ; ( "test_decode_character_when_valid" - , `Quick - , test_decode_character_when_valid - ) - ; ( "test_decode_character_when_overlong" - , `Quick - , test_decode_character_when_overlong - ) - ] -end - -let () = - Alcotest.run "Encodings" - [ - ("UCS", UCS.tests) - ; ("XML", XML.tests) - ; ("String_validator", String_validator.tests) - ; ("XML_UTF8_UCS_validator", XML_UTF8_UCS_validator.tests) - ; ("UTF8_codec", UTF8_codec.tests) - ] diff --git a/ocaml/libs/xapi-stdext/lib/xapi-stdext-encodings/utf8.ml b/ocaml/libs/xapi-stdext/lib/xapi-stdext-encodings/utf8.ml new file mode 100644 index 000000000..d17d85b3b --- /dev/null +++ b/ocaml/libs/xapi-stdext/lib/xapi-stdext-encodings/utf8.ml @@ -0,0 +1,74 @@ +(* + * Copyright (c) Cloud Software Group, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; version 2.1 only. with the special + * exception on linking described in file LICENSE. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + *) + +let is_valid = String.is_valid_utf_8 + +(* deprecated - reject invalid UTF-8 *) +let longest_valid_prefix str = + let len = String.length str in + let rec loop = function + | i when i < len -> + let dec = String.get_utf_8_uchar str i in + if Uchar.utf_decode_is_valid dec then + loop (i + Uchar.utf_decode_length dec) + else + String.sub str 0 i + | i when i = len -> + str + | i -> + String.sub str 0 i (* never reached *) + in + loop 0 + +module XML = struct + (** some UTF-8 characters are not legal in XML. Assuming uchar is + legal UTF-8, further check that it is legal in XML *) + let is_legal uchar = + let uchar = Uchar.to_int uchar in + uchar >= 0x20 || uchar = 0x09 || uchar = 0x0a || uchar = 0x0d + [@@inline] + + let is_valid str = + let len = String.length str in + let rec loop = function + | i when i < len -> + let dec = String.get_utf_8_uchar str i in + Uchar.utf_decode_is_valid dec + && is_legal (Uchar.utf_decode_uchar dec) + && loop (i + Uchar.utf_decode_length dec) + | _ -> + true + in + loop 0 + + (* deprecated - reject invalid UTF-8 *) + let longest_valid_prefix str = + let len = String.length str in + let rec loop = function + | i when i < len -> + let dec = String.get_utf_8_uchar str i in + if + Uchar.utf_decode_is_valid dec + && is_legal (Uchar.utf_decode_uchar dec) + then + loop (i + Uchar.utf_decode_length dec) + else + String.sub str 0 i + | i when i = len -> + str (* avoid copy *) + | i -> + String.sub str 0 i (* never reached *) + in + loop 0 +end diff --git a/ocaml/libs/xapi-stdext/lib/xapi-stdext-encodings/utf8.mli b/ocaml/libs/xapi-stdext/lib/xapi-stdext-encodings/utf8.mli new file mode 100644 index 000000000..6d8949e2f --- /dev/null +++ b/ocaml/libs/xapi-stdext/lib/xapi-stdext-encodings/utf8.mli @@ -0,0 +1,31 @@ +(* + * Copyright (c) Cloud Software Group, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; version 2.1 only. with the special + * exception on linking described in file LICENSE. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + *) + +val is_valid : string -> bool +(** true, if a string is a proper UTF-8 string *) + +val longest_valid_prefix : string -> string +(** Deprecated. Longest prefix of a string that is proper UTF-8 *) + +(* strings in XML are more restricted than UTF-8 in general. The must be + valid UTF-8 and must not contain certain characters *) + +module XML : sig + val is_valid : string -> bool + (** true, if a string is a proper UTF-8 string in XML *) + + val longest_valid_prefix : string -> string + (** Deprecated. longest prefix of a string that is proper UTF-8. + Better reject invalid UTF-8. *) +end diff --git a/ocaml/xapi/xapi_message.ml b/ocaml/xapi/xapi_message.ml index 408ba7acf..4c08648dc 100644 --- a/ocaml/xapi/xapi_message.ml +++ b/ocaml/xapi/xapi_message.ml @@ -28,7 +28,7 @@ *) module Date = Clock.Date -module Encodings = Xapi_stdext_encodings.Encodings +module Encodings = Xapi_stdext_encodings module Listext = Xapi_stdext_std.Listext module Pervasiveext = Xapi_stdext_pervasives.Pervasiveext module Unixext = Xapi_stdext_unix.Unixext @@ -414,7 +414,7 @@ let create ~__context ~name ~priority ~cls ~obj_uuid ~body = debug "Message.create %s %Ld %s %s" name priority (Record_util.cls_to_string cls) obj_uuid ; - if not (Encodings.UTF8_XML.is_valid body) then + if not (Encodings.Utf8.is_valid body) then raise (Api_errors.Server_error (Api_errors.invalid_value, ["UTF8 expected"])) ; if not (check_uuid ~__context ~cls ~uuid:obj_uuid) then raise