From c5f1075aeed48907b3a9f4cc9302f723f08d4eba Mon Sep 17 00:00:00 2001 From: immarvin Date: Sat, 24 Oct 2015 10:12:36 -0400 Subject: [PATCH] add kvm features: cpu pining, memory binding and pci passthrough --- perl-xCAT/xCAT/Schema.pm | 2 +- perl-xCAT/xCAT/Usage.pm | 3 + xCAT-client/pods/man1/chvm.1.pod | 34 ++- xCAT-server/lib/xcat/plugins/kvm.pm | 321 +++++++++++++++++++++++++++- 4 files changed, 352 insertions(+), 8 deletions(-) diff --git a/perl-xCAT/xCAT/Schema.pm b/perl-xCAT/xCAT/Schema.pm index 7fb5347e6..6f161c83a 100755 --- a/perl-xCAT/xCAT/Schema.pm +++ b/perl-xCAT/xCAT/Schema.pm @@ -222,7 +222,7 @@ vm => { 'vncport' => 'Tracks the current VNC display port (currently not meant to be set', 'textconsole' => 'Tracks the Psuedo-TTY that maps to the serial port or console of a VM', 'powerstate' => "This flag is used by xCAT to track the last known power state of the VM.", - 'othersettings' => "This allows specifying a semicolon delimited list of key->value pairs to include in a vmx file of VMware. For partitioning on normal power machines, this option is used to specify the hugepage and/or bsr information, the value is like:'hugepage:1,bsr=2'.", + 'othersettings' => "This allows specifying a semicolon delimited list of key->value pairs to include in a vmx file of VMware or KVM. For partitioning on normal power machines, this option is used to specify the hugepage and/or bsr information, the value is like:'hugepage:1,bsr=2'. For KVM cpu pinning, this option is used to specify the physical cpu set on the host, the value is like:'vcpupin:'0-15,^8' ',Its syntax is a comma separated list and a special markup using '-' and '^' (ex. '0-4', '0-3,^2') can also be allowed, the '-' denotes the range and the '^' denotes exclusive. For KVM memory binding, the value is like:'membind:0', restrict a guest to allocate memory from the specified set of NUMA nodes. For PCI passthrough, the value is like:'devpassthrough:pci_0001_01_00_0,pci_0000_03_00_0',the PCI devices are assigned to a virtual machine, and the virtual machine can use this I/O exclusively,the devices list are a list of PCI device names delimited with comma, the PCI device names can be obtained by running B on the host.", 'guestostype' => "This allows administrator to specify an identifier for OS to pass through to virtualization stack. Normally this should be ignored as xCAT will translate from nodetype.os rather than requiring this field be used\n", 'beacon' => "This flag is used by xCAT to track the state of the identify LED with respect to the VM.", 'datacenter' => "Optionally specify a datacenter for the VM to exist in (only applicable to VMWare)", diff --git a/perl-xCAT/xCAT/Usage.pm b/perl-xCAT/xCAT/Usage.pm index 4da60dab9..ff691baa5 100755 --- a/perl-xCAT/xCAT/Usage.pm +++ b/perl-xCAT/xCAT/Usage.pm @@ -250,6 +250,9 @@ my %usage = ( [add_vmnics=vlan1,vlan2] [add_vmstorage=] [--vios] chvm [del_physlots=drc_index1,drc_index2...] chvm [del_vadapter=slotid] + chvm [--cpupin hostcpuset] + chvm [--membind numanodeset] + chvm [--devpassthru pcidevice1,pcidevice2... ] VMware specific: chvm [-a size][-d disk][-p disk][--resize disk=size][--cpus count][--mem memory] zVM specific: diff --git a/xCAT-client/pods/man1/chvm.1.pod b/xCAT-client/pods/man1/chvm.1.pod index fca930e33..4578254d8 100644 --- a/xCAT-client/pods/man1/chvm.1.pod +++ b/xCAT-client/pods/man1/chvm.1.pod @@ -28,7 +28,15 @@ B I [B] [B] [B [B>] [B<--vios>] [B] [B] - + +B I [B<--cpupin> I] + +B I [B<--membind> I] + +B I [B<--devpassthru> I...] + + + =head2 VMware/KVM specific: B I [B<-a> I] [B<-d> I] [B<-p> I] [B<--resize> B=I] [B<--cpus> I] [B<--mem> I] @@ -215,6 +223,30 @@ To delete physical slots which are specified by the I. To delete a virtual adapter specified by the I. +=item B<--cpupin hostcpuset> + +To pin guest domain virtual CPUs to physical host CPUs specified with I. +I is a list of physical CPU numbers. Its syntax is a comma separated list and a special +markup using '-' and '^' (ex. '0-4', '0-3,^2') can also be allowed. The '-' denotes the range and +the '^' denotes exclusive. + +Note: The expression is sequentially evaluated, so "0-15,^8" is identical to "9-14,0-7,15" but not +identical to "^8,0-15". + +=item B<--membind numanodeset> + +It is possible to restrict a guest to allocate memory from the specified set of NUMA nodes I. +If the guest vCPUs are also pinned to a set of cores located on that same set of NUMA nodes, memory +access will be local, which improves memory access performance. + +=item B<--devpassthru pcidevice1,pcidevice2...> + +The PCI passthroug gives a guest VM direct access to I/O devices I. +The PCI devices are assigned to a virtual machine, and the virtual machine can use this I/O exclusively. +The devices list are a list of PCI device names delimited with comma, the PCI device names can be obtained +by running B on the host. + + =back =head2 VMware/KVM specific: diff --git a/xCAT-server/lib/xcat/plugins/kvm.pm b/xCAT-server/lib/xcat/plugins/kvm.pm index 9ecf3ae7b..896fe1ec9 100755 --- a/xCAT-server/lib/xcat/plugins/kvm.pm +++ b/xCAT-server/lib/xcat/plugins/kvm.pm @@ -10,7 +10,6 @@ use xCAT::GlobalDef; use xCAT::NodeRange; use xCAT::VMCommon; use xCAT_monitoring::monitorctrl; - use xCAT::Table; use xCAT::Usage; use XML::LibXML; #now that we are in the business of modifying xml data, need something capable of preserving more of the XML structure @@ -41,10 +40,9 @@ require Sys::Virt; if (Sys::Virt->VERSION =~ /^0\.[10]\./) { die; } - use XML::Simple; $XML::Simple::PREFERRED_PARSER='XML::Parser'; -use Data::Dumper; +#use Data::Dumper; use POSIX "WNOHANG"; use Storable qw(freeze thaw store_fd fd_retrieve); use IO::Select; @@ -440,7 +438,14 @@ sub reconfigvm { sub build_oshash { my %rethash; $rethash{type}->{content}='hvm'; - $rethash{bios}->{useserial}='yes'; + #$rethash{bios}->{useserial}='yes'; + + + my $hypcpumodel = $confdata->{$confdata->{vm}->{$node}->[0]->{host}}->{cpumodel}; + unless (defined($hypcpumodel) and $hypcpumodel eq "ppc64le") { + $rethash{bios}->{useserial}='yes'; + } + if (defined $confdata->{vm}->{$node}->[0]->{bootorder}) { my $bootorder = $confdata->{vm}->{$node}->[0]->{bootorder}; my @bootdevs = split(/[:,]/,$bootorder); @@ -688,7 +693,78 @@ sub build_xmldesc { } else { $xtree{memory}->{content}=524288; } - if ($hypcpumodel eq "ppc64") { + + print "xxx $confdata->{vm}->{$node}->[0]->{othersettings} "; + my %cpupinhash; + my @prdevices; + my $memnumanodes; + my $advsettings=undef; + if (defined $confdata->{vm}->{$node}->[0]->{othersettings}) { + $advsettings=$confdata->{vm}->{$node}->[0]->{othersettings}; + } + + if($advsettings){ + my @tmp_array=split ";",$advsettings; + foreach(@tmp_array){ + if(/vcpupin:['"]?([^:'"]*)['"]?:?['"]?([^:'"]*)['"]?/){ + if($2){ + $cpupinhash{$1}=$2; + }else{ + $cpupinhash{ALL}=$1; + } + } + + if(/devpassthrough:(.*)/){ + @prdevices=split ",",$1; + } + + if(/membind:(.*)/){ + $memnumanodes=$1; + } + + } + } + + if(defined $memnumanodes){ + my %numatunehash; + $numatunehash{memory}=[{nodeset=>"$memnumanodes"}]; + $xtree{numatune}=\%numatunehash; + } + + if(exists $cpupinhash{ALL}){ + $xtree{vcpu}->{placement}='static'; + $xtree{vcpu}->{cpuset}="$cpupinhash{ALL}"; + $xtree{vcpu}->{cpuset}=~s/\"\'//g; + } + + my @prdevarray; + foreach my $devname(@prdevices){ + my $devobj=$hypconn->get_node_device_by_name($devname); + unless($devobj){ + return -1; + } + my $devxml=$devobj->get_xml_description(); + unless($devxml){ + return -1; + } + my $devhash=XMLin($devxml); + if(defined $devhash->{capability}->{type} and $devhash->{capability}->{type} =~ /pci/i ){ + my %tmphash; + $tmphash{mode}='subsystem'; + $tmphash{type}=$devhash->{capability}->{type}; + $tmphash{managed}="yes"; + $tmphash{driver}->{name}="vfio"; + $tmphash{source}->{address}->[0]=\%{$devhash->{'capability'}->{'iommuGroup'}->{'address'}}; + push(@prdevarray,\%tmphash); + + } + } + + $xtree{devices}->{hostdev}=\@prdevarray; + + + if ($hypcpumodel eq "ppc64" or $hypcpumodel eq "ppc64le") { + #if ($hypcpumodel eq "ppc64") { my %cpuhash = (); if ($hypcputype) { $cpuhash{model} = $hypcputype; @@ -771,7 +847,7 @@ sub build_xmldesc { if (defined($hypcpumodel) and $hypcpumodel eq 'ppc64') { $xtree{devices}->{emulator}->{content} = "/usr/bin/qemu-system-ppc64"; } else { - $xtree{devices}->{sound}->{model}='ac97'; + #$xtree{devices}->{sound}->{model}='ac97'; } $xtree{devices}->{console}->{type}='pty'; @@ -1625,6 +1701,9 @@ sub chvm { my $memory; my $cdrom; my $eject; + my $pcpuset; + my $numanodeset; + my $prdevices; @ARGV=@_; require Getopt::Long; GetOptions( @@ -1636,6 +1715,9 @@ sub chvm { "cpus|cpu=s" => \$cpucount, "p=s"=>\@purge, "resize=s%" => \%resize, + "cpupin=s" => \$pcpuset, + "membind=s" => \$numanodeset, + "devpassthru=s"=> \$prdevices, ); if (@derefdisks) { xCAT::SvrUtils::sendmsg([1,"Detach without purge TODO for kvm"],$callback,$node); @@ -1645,7 +1727,23 @@ sub chvm { xCAT::SvrUtils::sendmsg([1,"Currently adding and purging concurrently is not supported"],$callback,$node); return; } + + if(defined $pcpuset){ + $pcpuset=~s/["']//g; + if("###$pcpuset" eq "###" or $pcpuset =~ /[^\d\,\^\-]/ ){ + xCAT::SvrUtils::sendmsg([1,"cpu pining: invalid cpuset"],$callback,$node); + return; + } + } + + if(defined $numanodeset){ + $numanodeset=~s/["']//g; + if("###$numanodeset" eq "###" or $numanodeset =~ /[^\d\,\^\-]/ ){ + xCAT::SvrUtils::sendmsg([1,"memory binding: invalid NUMA nodeset"],$callback,$node); + return; + } + } my %useddisks; my $dom; @@ -1914,6 +2012,217 @@ sub chvm { } } + + if(defined $pcpuset){ + if ($currstate eq 'on') { + #get the vcpuinfo of the domain + my @vcpuinfo; + eval { @vcpuinfo = $dom->get_vcpu_info(); }; + if ($@) { + xCAT::SvrUtils::sendmsg([1,"$@"],$callback,$node); + return; + } + + #get the cpuinfo of the host + my ($totcpus, $onlinemap, $totonline); + eval { ($totcpus, $onlinemap, $totonline) = $hypconn->get_node_cpu_map(); }; + if ($@) { + xCAT::SvrUtils::sendmsg([1,"$@"],$callback,$node); + return; + } + + #convert the cpuset to bitmap,which is required by pin_vcpu() + my @pcpumaparr=(0) x $totcpus; + my @cpurange=split ",",$pcpuset; + foreach my $rangeslice (@cpurange){ + if( $rangeslice =~ /^(\d*)-(\d*)$/ ){ + my ($left,$right)=($1,$2); + @pcpumaparr[$left .. $right]=(1) x ($right-$left+1); + }elsif($rangeslice =~ /^\^(\d*)$/){ + $pcpumaparr[$1]=0; + }elsif($rangeslice =~ /^(\d*)$/){ + $pcpumaparr[$1]=1; + } + } + + my $pcpumap=join //,@pcpumaparr; + my $mask=pack("b*",$pcpumap); + + #Pin the virtual CPU given to physical CPUs given + foreach(@vcpuinfo){ + eval { + $dom->pin_vcpu($_->{number}, $mask); + }; + if ($@) { + xCAT::SvrUtils::sendmsg([1,"$@"],$callback,$node); + } + } + + $vmxml=$dom->get_xml_description(); + my $parsed=$parser->parse_string($vmxml); + my $ref=$parsed->findnodes("/domain/vcpu"); + #$ref->[0]->setAttribute("cpuset","$pcpuset"); + $ref->[0]->removeAttribute("cpuset"); + $vmxml=$parsed->toString; + if ($vmxml) { + $updatetable->{kvm_nodedata}->{$node}->{xml}=$vmxml; + } + }else{ + my $parsed=$parser->parse_string($vmxml); + my $ref=$parsed->findnodes("/domain/vcpu"); + $ref->[0]->setAttribute("cpuset",$pcpuset); + + #for virtual CPUs which have the vcpupin specified, + # the cpuset specified by/domain/vcpu/cpuset will be ignored + my $ref=$parsed->findnodes("/domain"); + my $cputuneref=$parsed->findnodes("/domain/cputune"); + $cputuneref->[0]->parentNode->removeChild($cputuneref->[0]); + $vmxml=$parsed->toString; + $updatetable->{kvm_nodedata}->{$node}->{xml}=$vmxml; + } + } + + if(defined $numanodeset){ + if ($currstate eq 'on') { + eval { + my %tmphash=(Sys::Virt::Domain->NUMA_NODESET => "$numanodeset"); + $dom->set_numa_parameters(\%tmphash); + }; + if ($@) { + xCAT::SvrUtils::sendmsg([1,"$@"],$callback,$node); + return; + } + $vmxml=$dom->get_xml_description(); + if ($vmxml) { + $updatetable->{kvm_nodedata}->{$node}->{xml}=$vmxml; + } + }else{ + my %numatunehash; + $numatunehash{memory}=[{nodeset=>"$numanodeset"}]; + my $numatunexml=XMLout(\%numatunehash,RootName=>"numatune"); + + + my $parsed=$parser->parse_string($vmxml); + my $numatuneref=$parsed->findnodes("/domain/numatune"); + + if($numatuneref) + { + #/domain/numatune exist,modify the numatune/nodeset + my $ref=$parsed->findnodes("/domain/numatune/memory"); + $ref->[0]->setAttribute("nodeset",$numanodeset); + }else{ + #/domain/numatune does not exist,create one + my $ref=$parsed->findnodes("/domain"); + my $numatunenode = $parser->parse_balanced_chunk($numatunexml); + $ref->[0]->appendChild($numatunenode); + } + $vmxml=$parsed->toString; + $updatetable->{kvm_nodedata}->{$node}->{xml}=$vmxml; + } + } + + + + $prdevices=~s/["']//g; + if(defined $prdevices){ + my @prdevarr=split ",",$prdevices; + unless(scalar @prdevarr){ + xCAT::SvrUtils::sendmsg([1,"device passthrough: no device specified"],$callback,$node); + return; + } + + foreach my $devname (@prdevarr){ + my $devobj; + eval { + $devobj=$hypconn->get_node_device_by_name($devname); + }; + if ($@) { + xCAT::SvrUtils::sendmsg([1,"$@"],$callback,$node); + next; + } + + my $devxml=$devobj->get_xml_description(); + unless($devxml){ + next; + } + my $devhash=XMLin($devxml); + if(defined $devhash->{capability}->{type} and $devhash->{capability}->{type} =~ /pci/i ){ + my %tmphash; + $tmphash{mode}='subsystem'; + $tmphash{type}=$devhash->{capability}->{type}; + $tmphash{managed}="yes"; + $tmphash{driver}->{name}="vfio"; + $tmphash{source}->{address}->[0]=\%{$devhash->{'capability'}->{'iommuGroup'}->{'address'}}; + + my $newxml=XMLout(\%tmphash,RootName=>"hostdev"); + + if ($currstate eq 'on') { + eval { + $dom->attach_device($newxml); + }; + if ($@) { + xCAT::SvrUtils::sendmsg([1,"$@"],$callback,$node); + next; + }else{ + $vmxml=$dom->get_xml_description(); + if ($vmxml) { + $updatetable->{kvm_nodedata}->{$node}->{xml}=$vmxml; + } + xCAT::SvrUtils::sendmsg([0,"passthrough: $devname attached to guest successfully "],$callback,$node); + } + + }else{ + + my $hostdevfound; + $hostdevfound=0; + my $parsed=$parser->parse_string($vmxml); + my $ref=$parsed->findnodes("/domain/devices"); + + my @hostdevlist=$ref->[0]->findnodes("./hostdev"); + #check whether the hostdev existed in guest xml + foreach my $hostdevref (@hostdevlist){ + my $devaddrref=$hostdevref->findnodes("./source/address"); + + my $domain=$devaddrref->[0]->getAttribute("domain"); + my $bus=$devaddrref->[0]->getAttribute("bus"); + my $slot=$devaddrref->[0]->getAttribute("slot"); + my $function=$devaddrref->[0]->getAttribute("function"); + + my $curdevaddr=\%{$devhash->{'capability'}->{'iommuGroup'}->{'address'}}; + + + + if(("$curdevaddr->{domain}" eq "$domain") and + ("$curdevaddr->{bus}" eq "$bus") and + ("$curdevaddr->{slot}" eq "$slot") and + ("$curdevaddr->{function}" eq "$function")){ + + #hostdev existed in guest xml + $hostdevfound=1; + goto PROCESS_HOSTDEV_XML; + } + } + + + + PROCESS_HOSTDEV_XML: + unless($hostdevfound){ + #hostdev does not exist,add into guest xml + my $hostdevnode = $parser->parse_balanced_chunk($newxml); + $ref->[0]->appendChild($hostdevnode); + $vmxml=$parsed->toString; + $updatetable->{kvm_nodedata}->{$node}->{xml}=$vmxml; + + } + + + } + } + + } + } + + } sub get_disks_by_userspecs { my $specs = shift;