diff --git a/doc/content/design/numa.md b/doc/content/design/numa.md
index fa1917b3c57..b9eb0adabe0 100644
--- a/doc/content/design/numa.md
+++ b/doc/content/design/numa.md
@@ -112,7 +112,7 @@
 This function receives as arguments a domain ID and the number of nodes this
 domain is using (acquired using `domain_get_numa_info_node_pages`)
 
 The number of NUMA nodes of the host (not domain) is reported by
-`Xenctrl.physinfo` which returns a value of type `physinfo`.
+`Xenctrlext.physinfo` which returns a value of type `physinfo`.
 ```diff
 index b4579862ff..491bd3fc73 100644
diff --git a/doc/content/xenopsd/walkthroughs/VM.build/Domain.build.md b/doc/content/xenopsd/walkthroughs/VM.build/Domain.build.md
index ba4274e243a..e7d407f0e0b 100644
--- a/doc/content/xenopsd/walkthroughs/VM.build/Domain.build.md
+++ b/doc/content/xenopsd/walkthroughs/VM.build/Domain.build.md
@@ -64,7 +64,7 @@
 to call:
 [wait_xen_free_mem](https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/xc/domain.ml#L236-L272)
 to wait (if necessary), for the Xen memory scrubber to catch up reclaiming memory. It
-1. calls `Xenctrl.physinfo` which returns:
+1. calls `Xenctrlext.physinfo` which returns:
    - `hostinfo.free_pages` - the free and already scrubbed pages (available)
    - `host.scrub_pages` - the not yet scrubbed pages (not yet available)
 2. repeats this until a timeout as long as `free_pages` is *lower*
diff --git a/ocaml/libs/xenctrl-ext/xenctrlext.ml b/ocaml/libs/xenctrl-ext/xenctrlext.ml
index 8922e490465..b481f36468c 100644
--- a/ocaml/libs/xenctrl-ext/xenctrlext.ml
+++ b/ocaml/libs/xenctrl-ext/xenctrlext.ml
@@ -131,3 +131,8 @@ let domain_claim_pages handle domid ?(numa_node = NumaNode.none) nr_pages =
 let get_nr_nodes handle =
   let info = numainfo handle in
   Array.length info.memory
+
+let physinfo xc =
+  let info = Xenctrl.physinfo xc in
+  let emergency_reserve_pages = Nativeint.shift_left 1n 16 in
+  {info with free_pages= Nativeint.sub info.free_pages emergency_reserve_pages}
diff --git a/ocaml/libs/xenctrl-ext/xenctrlext.mli b/ocaml/libs/xenctrl-ext/xenctrlext.mli
index f9b8b49bb8e..e6bb1a1f2cf 100644
--- a/ocaml/libs/xenctrl-ext/xenctrlext.mli
+++ b/ocaml/libs/xenctrl-ext/xenctrlext.mli
@@ -107,3 +107,5 @@ val domain_claim_pages :
   handle -> domid -> ?numa_node:NumaNode.t -> int -> unit
 val get_nr_nodes : handle -> int
 (** Returns the count of NUMA nodes available in the system. *)
+
+val physinfo : Xenctrl.handle -> Xenctrl.physinfo
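The core of this change is the wrapper just above: `Xenctrlext.physinfo` returns the ordinary `Xenctrl.physinfo` record with `free_pages` reduced by a fixed emergency reserve of 2^16 pages, i.e. 256 MiB at 4 KiB per page. A minimal sketch of what callers now observe (illustrative only; the back-to-back snapshots only agree if nothing allocates between the two calls):

```ocaml
(* Illustrative: the wrapper under-reports free memory by exactly the
   emergency reserve, 2^16 pages * 4 KiB/page = 256 MiB. *)
let () =
  Xenctrl.with_intf (fun xc ->
      let raw = (Xenctrl.physinfo xc).Xenctrl.free_pages in
      let adjusted = (Xenctrlext.physinfo xc).Xenctrl.free_pages in
      (* holds only if free memory did not change between the two calls *)
      assert (Nativeint.sub raw adjusted = 65536n)
  )
```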
diff --git a/ocaml/squeezed/src/dune b/ocaml/squeezed/src/dune
index 4e9fe643097..3e4bdd5d3ce 100644
--- a/ocaml/squeezed/src/dune
+++ b/ocaml/squeezed/src/dune
@@ -13,6 +13,7 @@
    threads.posix
    unix
    xenctrl
+   xenctrl_ext
    xenstore
    xenstore.unix
    xenstore_transport
diff --git a/ocaml/squeezed/src/squeeze_xen.ml b/ocaml/squeezed/src/squeeze_xen.ml
index 31bac6df75b..7808730139d 100644
--- a/ocaml/squeezed/src/squeeze_xen.ml
+++ b/ocaml/squeezed/src/squeeze_xen.ml
@@ -579,7 +579,7 @@ let make_host ~verbose ~xc =
      pages -- this might cause something else to fail (eg domain builder?) *)
   while
     Int64.div
-      ((Xenctrl.physinfo xc).Xenctrl.scrub_pages |> Int64.of_nativeint)
+      ((Xenctrlext.physinfo xc).Xenctrl.scrub_pages |> Int64.of_nativeint)
       1024L
     <> 0L
   do
@@ -762,7 +762,7 @@ let make_host ~verbose ~xc =
   (* For the host free memory we sum the free pages and the pages needing
      scrubbing: we don't want to adjust targets simply because the scrubber is slow.
   *)
-  let physinfo = Xenctrl.physinfo xc in
+  let physinfo = Xenctrlext.physinfo xc in
   let free_pages_kib =
     Xenctrl.pages_to_kib (Int64.of_nativeint physinfo.Xenctrl.free_pages)
   and scrub_pages_kib =
diff --git a/ocaml/xcp-rrdd/bin/rrdp-cpu/dune b/ocaml/xcp-rrdd/bin/rrdp-cpu/dune
index ced826c63a2..de3ad8d497c 100644
--- a/ocaml/xcp-rrdd/bin/rrdp-cpu/dune
+++ b/ocaml/xcp-rrdd/bin/rrdp-cpu/dune
@@ -11,6 +11,7 @@
    xapi-rrd
    xapi-stdext-unix
    xenctrl
+   xenctrl_ext
   )
 )

diff --git a/ocaml/xcp-rrdd/bin/rrdp-cpu/rrdp_cpu.ml b/ocaml/xcp-rrdd/bin/rrdp-cpu/rrdp_cpu.ml
index a677fd17465..b64f04db0a1 100644
--- a/ocaml/xcp-rrdd/bin/rrdp-cpu/rrdp_cpu.ml
+++ b/ocaml/xcp-rrdd/bin/rrdp-cpu/rrdp_cpu.ml
@@ -184,7 +184,7 @@ let dss_pcpus xc =
   let len = Array.length !physcpus in
   let newinfos =
     if len = 0 then (
-      let physinfo = Xenctrl.physinfo xc in
+      let physinfo = Xenctrlext.physinfo xc in
       let pcpus = physinfo.Xenctrl.nr_cpus in
       physcpus := if pcpus > 0 then Array.make pcpus 0L else [||] ;
       Xenctrl.pcpu_info xc pcpus
@@ -237,7 +237,7 @@ let count_power_state_running_domains domains =
     0 domains

 let dss_hostload xc domains =
-  let physinfo = Xenctrl.physinfo xc in
+  let physinfo = Xenctrlext.physinfo xc in
   let pcpus = physinfo.Xenctrl.nr_cpus in
   let rec sum acc n f =
     match n with n when n >= 0 -> sum (acc + f n) (n - 1) f | _ -> acc
@@ -298,7 +298,7 @@ let _ =
   let _, domains, _ = Xenctrl_lib.domain_snapshot xc in
   Process.initialise () ;
   (* Share one page per PCPU and dom each *)
-  let physinfo = Xenctrl.physinfo xc in
+  let physinfo = Xenctrlext.physinfo xc in
   let shared_page_count =
     physinfo.Xenctrl.nr_cpus
     + Int.max Rrd_interface.max_supported_vms (List.length domains)
diff --git a/ocaml/xcp-rrdd/bin/rrdp-squeezed/dune b/ocaml/xcp-rrdd/bin/rrdp-squeezed/dune
index 75c8e1f5ab5..ee5f217f137 100644
--- a/ocaml/xcp-rrdd/bin/rrdp-squeezed/dune
+++ b/ocaml/xcp-rrdd/bin/rrdp-squeezed/dune
@@ -13,6 +13,7 @@
    xapi-log
    xapi-rrd
    xenctrl
+   xenctrl_ext
    xenstore
    xenstore.unix
    xenstore_transport
diff --git a/ocaml/xcp-rrdd/bin/rrdp-squeezed/rrdp_squeezed.ml b/ocaml/xcp-rrdd/bin/rrdp-squeezed/rrdp_squeezed.ml
index df49dca259f..09902b1e0f1 100644
--- a/ocaml/xcp-rrdd/bin/rrdp-squeezed/rrdp_squeezed.ml
+++ b/ocaml/xcp-rrdd/bin/rrdp-squeezed/rrdp_squeezed.ml
@@ -169,7 +169,7 @@ let generate_host_sources xc counters =
   in
   let memory_reclaimed = bytes_of_kib memory_reclaimed in
   let memory_possibly_reclaimed = bytes_of_kib memory_possibly_reclaimed in
-  let physinfo = Xenctrl.physinfo xc in
+  let physinfo = Xenctrlext.physinfo xc in
   let total_kib =
     Xenctrl.pages_to_kib (Int64.of_nativeint physinfo.Xenctrl.total_pages)
   in
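For context on the first `squeeze_xen.ml` hunk above: `make_host` spins until fewer than 1024 pages remain unscrubbed before trusting the free-memory figures. A self-contained sketch of that polling shape, now going through the adjusted `Xenctrlext.physinfo` (the `wait_for_scrubber` name and the 0.1 s sleep are illustrative, not part of this change):

```ocaml
(* Illustrative sketch: poll until fewer than 1024 pages (4 MiB at
   4 KiB/page) remain unscrubbed, mirroring the loop in make_host. *)
let wait_for_scrubber xc =
  while
    Int64.div
      ((Xenctrlext.physinfo xc).Xenctrl.scrub_pages |> Int64.of_nativeint)
      1024L
    <> 0L
  do
    Unix.sleepf 0.1 (* illustrative pause between polls *)
  done
```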
diff --git a/ocaml/xenopsd/xc/domain.ml b/ocaml/xenopsd/xc/domain.ml
index 6c65d467f33..e31b07081f3 100644
--- a/ocaml/xenopsd/xc/domain.ml
+++ b/ocaml/xenopsd/xc/domain.ml
@@ -235,7 +235,7 @@ let wait_xen_free_mem ~xc ?(maximum_wait_time_seconds = 64) required_memory_kib
     : bool =
   let open Memory in
   let rec wait accumulated_wait_time_seconds =
-    let host_info = Xenctrl.physinfo xc in
+    let host_info = Xenctrlext.physinfo xc in
     let free_memory_kib =
       kib_of_pages (Int64.of_nativeint host_info.Xenctrl.free_pages)
     in
@@ -244,7 +244,10 @@
     in
     (* At exponentially increasing intervals, write *)
     (* a debug message saying how long we've waited: *)
-    if is_power_of_2 accumulated_wait_time_seconds then
+    if
+      accumulated_wait_time_seconds = 0
+      || is_power_of_2 accumulated_wait_time_seconds
+    then
       debug
         "Waited %i second(s) for memory to become available: %Ld KiB free, %Ld \
          KiB scrub, %Ld KiB required"
@@ -272,7 +275,7 @@ let wait_xen_free_mem ~xc ?(maximum_wait_time_seconds = 64) required_memory_kib

 let make ~xc ~xs vm_info vcpus domain_config uuid final_uuid no_sharept
     num_of_vbds num_of_vifs =
   let open Xenctrl in
-  let host_info = Xenctrl.physinfo xc in
+  let host_info = Xenctrlext.physinfo xc in
   (* Confirm that the running hypervisor supports a specific capability. *)
   let assert_capability cap ~on_error =
@@ -1000,8 +1003,10 @@ let numa_placement domid ~vcpus ~cores ~memory affinity =
           __FUNCTION__ domid ;
         None
   in
-  let nr_pages = Int64.div memory 4096L |> Int64.to_int in
+  let nr_pages = (Int64.div memory 4096L |> Int64.to_int) - 32 in
   try
+    D.debug "NUMAClaim domid %d: local claim on node %d: %d pages" domid
+      node nr_pages ;
     Xenctrlext.domain_claim_pages xcext domid ~numa_node nr_pages ;
     set_vcpu_affinity cpu_affinity ;
     Some (node, memory)
@@ -1009,8 +1014,10 @@ let numa_placement domid ~vcpus ~cores ~memory affinity =
   | Xenctrlext.Not_available ->
       (* Xen does not provide the interface to claim pages from a single
          NUMA node, ignore the error and continue. *)
+      D.debug "NUMAClaim domid %d: local claim not available" domid ;
+      set_vcpu_affinity cpu_affinity ;
       None
-  | Xenctrlext.Unix_error (errno, _) ->
+  | Xenctrlext.Unix_error ((Unix.ENOMEM as errno), _) ->
      D.info
        "%s: unable to claim enough memory, domain %d won't be hosted in a \
         single NUMA node. (error %s)"
@@ -1109,10 +1116,33 @@ let build_pre ~xc ~xs ~vcpus ~memory ~hard_affinity domid =
             and cores =
               Xenops_server.cores_of_numa_affinity_policy pin ~vcpus
             in
-            numa_placement domid ~vcpus ~cores
-              ~memory:(Int64.mul memory.xen_max_mib 1048576L)
-              affinity
-            |> Option.map fst
+            let memory =
+              Int64.(mul memory.build_start_mib (shift_left 1L 20))
+            in
+            match numa_placement domid ~vcpus ~cores ~memory affinity with
+            | None ->
+                (* Always perform a global claim when NUMA placement is
+                   enabled, and single node claims failed or were
+                   unavailable:
+                   This tries to ensure that memory allocated for this
+                   domain won't use up memory claimed by other domains.
+                   If claims are mixed with non-claims then Xen can't
+                   currently guarantee that it would honour the existing
+                   claims.
+                   A failure here is a hard failure: we'd fail allocating
+                   memory later anyway
+                *)
+                let nr_pages =
+                  (Int64.div memory 4096L |> Int64.to_int) - 32
+                in
+                let xcext = Xenctrlext.get_handle () in
+                D.debug "NUMAClaim domid %d: global claim: %d pages" domid
+                  nr_pages ;
+                Xenctrlext.domain_claim_pages xcext domid
+                  ~numa_node:Xenctrlext.NumaNode.none nr_pages ;
+                None
+            | Some (plan, _) ->
+                Some plan
         )
       in
       let store_chan, console_chan = create_channels ~xc uuid domid in
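Taken together, the `numa_placement` and `build_pre` changes above implement a two-step claim: first try to claim the pages on the node chosen by placement, and if node-local claims are unsupported or placement yields nothing, claim host-wide, so the build still cannot consume memory claimed by other domains. A condensed sketch of the strategy (the `claim_pages_for_build` helper is hypothetical; in this change the host-wide fallback actually lives in `build_pre`, not in `numa_placement`):

```ocaml
(* Hypothetical condensation of the claim strategy: prefer a node-local
   claim; on Not_available fall back to a host-wide claim. [node] is the
   NumaNode.t chosen by placement; [nr_pages] already has the 32-page
   slack subtracted, as in numa_placement. *)
let claim_pages_for_build xcext domid ~node ~nr_pages =
  try Xenctrlext.domain_claim_pages xcext domid ~numa_node:node nr_pages
  with Xenctrlext.Not_available ->
    (* Xen cannot claim from a single node on this host: claim from the
       whole host instead, so the allocation cannot race other claims *)
    Xenctrlext.domain_claim_pages xcext domid
      ~numa_node:Xenctrlext.NumaNode.none nr_pages
```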
diff --git a/ocaml/xenopsd/xc/emu_manager.ml b/ocaml/xenopsd/xc/emu_manager.ml
index 9f05127d4c5..501aedb55d5 100644
--- a/ocaml/xenopsd/xc/emu_manager.ml
+++ b/ocaml/xenopsd/xc/emu_manager.ml
@@ -205,7 +205,7 @@ let non_debug_receive ?debug_callback cnx =
       let open Memory in
       let open Int64 in
       let open Xenctrl in
-      let p = Xenctrl.physinfo xc in
+      let p = Xenctrlext.physinfo xc in
       error "Memory F %Ld KiB S %Ld KiB T %Ld MiB"
         (p.free_pages |> of_nativeint |> kib_of_pages)
         (p.scrub_pages |> of_nativeint |> kib_of_pages)
diff --git a/ocaml/xenopsd/xc/memory_breakdown.ml b/ocaml/xenopsd/xc/memory_breakdown.ml
index d5c3dbc79fe..21b53bcc910 100644
--- a/ocaml/xenopsd/xc/memory_breakdown.ml
+++ b/ocaml/xenopsd/xc/memory_breakdown.ml
@@ -217,7 +217,7 @@ let print_memory_field_names () =
 (** Prints memory field values to the console.
 *)
 let print_memory_field_values xc xs =
-  let host = Xenctrl.physinfo xc in
+  let host = Xenctrlext.physinfo xc in
   let control_domain_info = Xenctrl.domain_getinfo xc 0 in
   let control_domain_id = control_domain_info.Xenctrl.handle in
   let guests =
diff --git a/ocaml/xenopsd/xc/memory_summary.ml b/ocaml/xenopsd/xc/memory_summary.ml
index 21c3b8add6f..3b16a42701e 100644
--- a/ocaml/xenopsd/xc/memory_summary.ml
+++ b/ocaml/xenopsd/xc/memory_summary.ml
@@ -38,7 +38,7 @@ let _ =
     finished := !delay < 0. ;
     if !delay > 0. then Unix.sleepf !delay ;
     flush stdout ;
-    let physinfo = Xenctrl.physinfo xc in
+    let physinfo = Xenctrlext.physinfo xc in
     let one_page = 4096L in
     let total_pages = Int64.of_nativeint physinfo.Xenctrl.total_pages in
     let free_pages =
diff --git a/ocaml/xenopsd/xc/xenguestHelper.ml b/ocaml/xenopsd/xc/xenguestHelper.ml
index 06a28d92f33..64c23b12c95 100644
--- a/ocaml/xenopsd/xc/xenguestHelper.ml
+++ b/ocaml/xenopsd/xc/xenguestHelper.ml
@@ -205,7 +205,7 @@ let non_debug_receive ?debug_callback cnx =
           let open Memory in
           let open Int64 in
           let open Xenctrl in
-          let p = Xenctrl.physinfo xc in
+          let p = Xenctrlext.physinfo xc in
           (match log_type with Syslog.Debug -> debug | _ -> error)
             "Memory F %Ld KiB S %Ld KiB T %Ld MiB"
             (p.free_pages |> of_nativeint |> kib_of_pages)
diff --git a/ocaml/xenopsd/xc/xenops_server_xen.ml b/ocaml/xenopsd/xc/xenops_server_xen.ml
index 8b4d0a4b40a..d3011f2d8fc 100644
--- a/ocaml/xenopsd/xc/xenops_server_xen.ml
+++ b/ocaml/xenopsd/xc/xenops_server_xen.ml
@@ -1064,7 +1064,7 @@ module HOST = struct
         let pages_per_mib = 256L in
         Int64.(
           div
-            ((Xenctrl.physinfo xc).Xenctrl.total_pages |> of_nativeint)
+            ((Xenctrlext.physinfo xc).Xenctrl.total_pages |> of_nativeint)
             pages_per_mib
         )
       )
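The log line touched in `emu_manager.ml` and `xenguestHelper.ml` reports free and scrub memory in KiB and total memory in MiB. A self-contained sketch of deriving all three from one `physinfo` snapshot (the `memory_summary_line` helper and the inlined conversions are illustrative; the real code uses `Memory.kib_of_pages`, and `pages_per_mib = 256L` follows from the 4 KiB page size, since 1 MiB / 4 KiB = 256):

```ocaml
(* Illustrative helper mirroring the "Memory F ... S ... T ..." line:
   free and scrub pages in KiB, total pages in MiB, assuming 4 KiB
   pages (so 4 KiB per page and 256 pages per MiB). *)
let memory_summary_line xc =
  let p = Xenctrlext.physinfo xc in
  let kib_of_pages pages = Int64.mul (Int64.of_nativeint pages) 4L in
  let mib_of_pages pages = Int64.div (Int64.of_nativeint pages) 256L in
  Printf.sprintf "Memory F %Ld KiB S %Ld KiB T %Ld MiB"
    (kib_of_pages p.Xenctrl.free_pages)
    (kib_of_pages p.Xenctrl.scrub_pages)
    (mib_of_pages p.Xenctrl.total_pages)
```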