2
0
mirror of https://github.com/xcat2/xcat-core.git synced 2025-05-22 11:42:05 +00:00

add kvm features: cpu pining, memory binding and pci passthrough

This commit is contained in:
immarvin 2015-10-24 10:12:36 -04:00
parent 0b9653229e
commit c5f1075aee
4 changed files with 352 additions and 8 deletions

View File

@ -222,7 +222,7 @@ vm => {
'vncport' => 'Tracks the current VNC display port (currently not meant to be set',
'textconsole' => 'Tracks the Psuedo-TTY that maps to the serial port or console of a VM',
'powerstate' => "This flag is used by xCAT to track the last known power state of the VM.",
'othersettings' => "This allows specifying a semicolon delimited list of key->value pairs to include in a vmx file of VMware. For partitioning on normal power machines, this option is used to specify the hugepage and/or bsr information, the value is like:'hugepage:1,bsr=2'.",
'othersettings' => "This allows specifying a semicolon delimited list of key->value pairs to include in a vmx file of VMware or KVM. For partitioning on normal power machines, this option is used to specify the hugepage and/or bsr information, the value is like:'hugepage:1,bsr=2'. For KVM cpu pinning, this option is used to specify the physical cpu set on the host, the value is like:'vcpupin:'0-15,^8' ',Its syntax is a comma separated list and a special markup using '-' and '^' (ex. '0-4', '0-3,^2') can also be allowed, the '-' denotes the range and the '^' denotes exclusive. For KVM memory binding, the value is like:'membind:0', restrict a guest to allocate memory from the specified set of NUMA nodes. For PCI passthrough, the value is like:'devpassthrough:pci_0001_01_00_0,pci_0000_03_00_0',the PCI devices are assigned to a virtual machine, and the virtual machine can use this I/O exclusively,the devices list are a list of PCI device names delimited with comma, the PCI device names can be obtained by running B<virsh nodedev-list> on the host.",
'guestostype' => "This allows administrator to specify an identifier for OS to pass through to virtualization stack. Normally this should be ignored as xCAT will translate from nodetype.os rather than requiring this field be used\n",
'beacon' => "This flag is used by xCAT to track the state of the identify LED with respect to the VM.",
'datacenter' => "Optionally specify a datacenter for the VM to exist in (only applicable to VMWare)",

View File

@ -250,6 +250,9 @@ my %usage = (
[add_vmnics=vlan1,vlan2] [add_vmstorage=<N|viosnode:slotid>] [--vios]
chvm <noderange> [del_physlots=drc_index1,drc_index2...]
chvm <noderange> [del_vadapter=slotid]
chvm <noderange> [--cpupin hostcpuset]
chvm <noderange> [--membind numanodeset]
chvm <noderange> [--devpassthru pcidevice1,pcidevice2... ]
VMware specific:
chvm <noderange> [-a size][-d disk][-p disk][--resize disk=size][--cpus count][--mem memory]
zVM specific:

View File

@ -28,7 +28,15 @@ B<chvm> I<noderange> [B<vmcpus=min/req/max>] [B<vmmemory=min/req/max>]
[B<add_vmnics=vlan1[,vlan2..]]> [B<add_vmstorage=<N|viosnode:slotid>>] [B<--vios>]
[B<del_physlots=drc_index1,drc_index2...>]
[B<del_vadapter=slotid>]
B<chvm> I<noderange> [B<--cpupin> I<hostcpuset>]
B<chvm> I<noderange> [B<--membind> I<numanodeset>]
B<chvm> I<noderange> [B<--devpassthru> I<pcidevice>...]
=head2 VMware/KVM specific:
B<chvm> I<noderange> [B<-a> I<size>] [B<-d> I<disk>] [B<-p> I<disk>] [B<--resize> B<disk>=I<size>] [B<--cpus> I<count>] [B<--mem> I<memory>]
@ -215,6 +223,30 @@ To delete physical slots which are specified by the I<drc_index1,drc_index2...>.
To delete a virtual adapter specified by the I<slotid>.
=item B<--cpupin hostcpuset>
To pin guest domain virtual CPUs to physical host CPUs specified with I<hostcpuset>.
I<hostcpuset> is a list of physical CPU numbers. Its syntax is a comma separated list and a special
markup using '-' and '^' (ex. '0-4', '0-3,^2') can also be allowed. The '-' denotes the range and
the '^' denotes exclusive.
Note: The expression is sequentially evaluated, so "0-15,^8" is identical to "9-14,0-7,15" but not
identical to "^8,0-15".
=item B<--membind numanodeset>
It is possible to restrict a guest to allocate memory from the specified set of NUMA nodes I<numanodeset>.
If the guest vCPUs are also pinned to a set of cores located on that same set of NUMA nodes, memory
access will be local, which improves memory access performance.
=item B<--devpassthru pcidevice1,pcidevice2...>
The PCI passthroug gives a guest VM direct access to I/O devices I<pcidevice1,pcidevice2...>.
The PCI devices are assigned to a virtual machine, and the virtual machine can use this I/O exclusively.
The devices list are a list of PCI device names delimited with comma, the PCI device names can be obtained
by running B<virsh nodedev-list> on the host.
=back
=head2 VMware/KVM specific:

View File

@ -10,7 +10,6 @@ use xCAT::GlobalDef;
use xCAT::NodeRange;
use xCAT::VMCommon;
use xCAT_monitoring::monitorctrl;
use xCAT::Table;
use xCAT::Usage;
use XML::LibXML; #now that we are in the business of modifying xml data, need something capable of preserving more of the XML structure
@ -41,10 +40,9 @@ require Sys::Virt;
if (Sys::Virt->VERSION =~ /^0\.[10]\./) {
die;
}
use XML::Simple;
$XML::Simple::PREFERRED_PARSER='XML::Parser';
use Data::Dumper;
#use Data::Dumper;
use POSIX "WNOHANG";
use Storable qw(freeze thaw store_fd fd_retrieve);
use IO::Select;
@ -440,7 +438,14 @@ sub reconfigvm {
sub build_oshash {
my %rethash;
$rethash{type}->{content}='hvm';
$rethash{bios}->{useserial}='yes';
#$rethash{bios}->{useserial}='yes';
my $hypcpumodel = $confdata->{$confdata->{vm}->{$node}->[0]->{host}}->{cpumodel};
unless (defined($hypcpumodel) and $hypcpumodel eq "ppc64le") {
$rethash{bios}->{useserial}='yes';
}
if (defined $confdata->{vm}->{$node}->[0]->{bootorder}) {
my $bootorder = $confdata->{vm}->{$node}->[0]->{bootorder};
my @bootdevs = split(/[:,]/,$bootorder);
@ -688,7 +693,78 @@ sub build_xmldesc {
} else {
$xtree{memory}->{content}=524288;
}
if ($hypcpumodel eq "ppc64") {
print "xxx $confdata->{vm}->{$node}->[0]->{othersettings} ";
my %cpupinhash;
my @prdevices;
my $memnumanodes;
my $advsettings=undef;
if (defined $confdata->{vm}->{$node}->[0]->{othersettings}) {
$advsettings=$confdata->{vm}->{$node}->[0]->{othersettings};
}
if($advsettings){
my @tmp_array=split ";",$advsettings;
foreach(@tmp_array){
if(/vcpupin:['"]?([^:'"]*)['"]?:?['"]?([^:'"]*)['"]?/){
if($2){
$cpupinhash{$1}=$2;
}else{
$cpupinhash{ALL}=$1;
}
}
if(/devpassthrough:(.*)/){
@prdevices=split ",",$1;
}
if(/membind:(.*)/){
$memnumanodes=$1;
}
}
}
if(defined $memnumanodes){
my %numatunehash;
$numatunehash{memory}=[{nodeset=>"$memnumanodes"}];
$xtree{numatune}=\%numatunehash;
}
if(exists $cpupinhash{ALL}){
$xtree{vcpu}->{placement}='static';
$xtree{vcpu}->{cpuset}="$cpupinhash{ALL}";
$xtree{vcpu}->{cpuset}=~s/\"\'//g;
}
my @prdevarray;
foreach my $devname(@prdevices){
my $devobj=$hypconn->get_node_device_by_name($devname);
unless($devobj){
return -1;
}
my $devxml=$devobj->get_xml_description();
unless($devxml){
return -1;
}
my $devhash=XMLin($devxml);
if(defined $devhash->{capability}->{type} and $devhash->{capability}->{type} =~ /pci/i ){
my %tmphash;
$tmphash{mode}='subsystem';
$tmphash{type}=$devhash->{capability}->{type};
$tmphash{managed}="yes";
$tmphash{driver}->{name}="vfio";
$tmphash{source}->{address}->[0]=\%{$devhash->{'capability'}->{'iommuGroup'}->{'address'}};
push(@prdevarray,\%tmphash);
}
}
$xtree{devices}->{hostdev}=\@prdevarray;
if ($hypcpumodel eq "ppc64" or $hypcpumodel eq "ppc64le") {
#if ($hypcpumodel eq "ppc64") {
my %cpuhash = ();
if ($hypcputype) {
$cpuhash{model} = $hypcputype;
@ -771,7 +847,7 @@ sub build_xmldesc {
if (defined($hypcpumodel) and $hypcpumodel eq 'ppc64') {
$xtree{devices}->{emulator}->{content} = "/usr/bin/qemu-system-ppc64";
} else {
$xtree{devices}->{sound}->{model}='ac97';
#$xtree{devices}->{sound}->{model}='ac97';
}
$xtree{devices}->{console}->{type}='pty';
@ -1625,6 +1701,9 @@ sub chvm {
my $memory;
my $cdrom;
my $eject;
my $pcpuset;
my $numanodeset;
my $prdevices;
@ARGV=@_;
require Getopt::Long;
GetOptions(
@ -1636,6 +1715,9 @@ sub chvm {
"cpus|cpu=s" => \$cpucount,
"p=s"=>\@purge,
"resize=s%" => \%resize,
"cpupin=s" => \$pcpuset,
"membind=s" => \$numanodeset,
"devpassthru=s"=> \$prdevices,
);
if (@derefdisks) {
xCAT::SvrUtils::sendmsg([1,"Detach without purge TODO for kvm"],$callback,$node);
@ -1645,7 +1727,23 @@ sub chvm {
xCAT::SvrUtils::sendmsg([1,"Currently adding and purging concurrently is not supported"],$callback,$node);
return;
}
if(defined $pcpuset){
$pcpuset=~s/["']//g;
if("###$pcpuset" eq "###" or $pcpuset =~ /[^\d\,\^\-]/ ){
xCAT::SvrUtils::sendmsg([1,"cpu pining: invalid cpuset"],$callback,$node);
return;
}
}
if(defined $numanodeset){
$numanodeset=~s/["']//g;
if("###$numanodeset" eq "###" or $numanodeset =~ /[^\d\,\^\-]/ ){
xCAT::SvrUtils::sendmsg([1,"memory binding: invalid NUMA nodeset"],$callback,$node);
return;
}
}
my %useddisks;
my $dom;
@ -1914,6 +2012,217 @@ sub chvm {
}
}
if(defined $pcpuset){
if ($currstate eq 'on') {
#get the vcpuinfo of the domain
my @vcpuinfo;
eval { @vcpuinfo = $dom->get_vcpu_info(); };
if ($@) {
xCAT::SvrUtils::sendmsg([1,"$@"],$callback,$node);
return;
}
#get the cpuinfo of the host
my ($totcpus, $onlinemap, $totonline);
eval { ($totcpus, $onlinemap, $totonline) = $hypconn->get_node_cpu_map(); };
if ($@) {
xCAT::SvrUtils::sendmsg([1,"$@"],$callback,$node);
return;
}
#convert the cpuset to bitmap,which is required by pin_vcpu()
my @pcpumaparr=(0) x $totcpus;
my @cpurange=split ",",$pcpuset;
foreach my $rangeslice (@cpurange){
if( $rangeslice =~ /^(\d*)-(\d*)$/ ){
my ($left,$right)=($1,$2);
@pcpumaparr[$left .. $right]=(1) x ($right-$left+1);
}elsif($rangeslice =~ /^\^(\d*)$/){
$pcpumaparr[$1]=0;
}elsif($rangeslice =~ /^(\d*)$/){
$pcpumaparr[$1]=1;
}
}
my $pcpumap=join //,@pcpumaparr;
my $mask=pack("b*",$pcpumap);
#Pin the virtual CPU given to physical CPUs given
foreach(@vcpuinfo){
eval {
$dom->pin_vcpu($_->{number}, $mask);
};
if ($@) {
xCAT::SvrUtils::sendmsg([1,"$@"],$callback,$node);
}
}
$vmxml=$dom->get_xml_description();
my $parsed=$parser->parse_string($vmxml);
my $ref=$parsed->findnodes("/domain/vcpu");
#$ref->[0]->setAttribute("cpuset","$pcpuset");
$ref->[0]->removeAttribute("cpuset");
$vmxml=$parsed->toString;
if ($vmxml) {
$updatetable->{kvm_nodedata}->{$node}->{xml}=$vmxml;
}
}else{
my $parsed=$parser->parse_string($vmxml);
my $ref=$parsed->findnodes("/domain/vcpu");
$ref->[0]->setAttribute("cpuset",$pcpuset);
#for virtual CPUs which have the vcpupin specified,
# the cpuset specified by/domain/vcpu/cpuset will be ignored
my $ref=$parsed->findnodes("/domain");
my $cputuneref=$parsed->findnodes("/domain/cputune");
$cputuneref->[0]->parentNode->removeChild($cputuneref->[0]);
$vmxml=$parsed->toString;
$updatetable->{kvm_nodedata}->{$node}->{xml}=$vmxml;
}
}
if(defined $numanodeset){
if ($currstate eq 'on') {
eval {
my %tmphash=(Sys::Virt::Domain->NUMA_NODESET => "$numanodeset");
$dom->set_numa_parameters(\%tmphash);
};
if ($@) {
xCAT::SvrUtils::sendmsg([1,"$@"],$callback,$node);
return;
}
$vmxml=$dom->get_xml_description();
if ($vmxml) {
$updatetable->{kvm_nodedata}->{$node}->{xml}=$vmxml;
}
}else{
my %numatunehash;
$numatunehash{memory}=[{nodeset=>"$numanodeset"}];
my $numatunexml=XMLout(\%numatunehash,RootName=>"numatune");
my $parsed=$parser->parse_string($vmxml);
my $numatuneref=$parsed->findnodes("/domain/numatune");
if($numatuneref)
{
#/domain/numatune exist,modify the numatune/nodeset
my $ref=$parsed->findnodes("/domain/numatune/memory");
$ref->[0]->setAttribute("nodeset",$numanodeset);
}else{
#/domain/numatune does not exist,create one
my $ref=$parsed->findnodes("/domain");
my $numatunenode = $parser->parse_balanced_chunk($numatunexml);
$ref->[0]->appendChild($numatunenode);
}
$vmxml=$parsed->toString;
$updatetable->{kvm_nodedata}->{$node}->{xml}=$vmxml;
}
}
$prdevices=~s/["']//g;
if(defined $prdevices){
my @prdevarr=split ",",$prdevices;
unless(scalar @prdevarr){
xCAT::SvrUtils::sendmsg([1,"device passthrough: no device specified"],$callback,$node);
return;
}
foreach my $devname (@prdevarr){
my $devobj;
eval {
$devobj=$hypconn->get_node_device_by_name($devname);
};
if ($@) {
xCAT::SvrUtils::sendmsg([1,"$@"],$callback,$node);
next;
}
my $devxml=$devobj->get_xml_description();
unless($devxml){
next;
}
my $devhash=XMLin($devxml);
if(defined $devhash->{capability}->{type} and $devhash->{capability}->{type} =~ /pci/i ){
my %tmphash;
$tmphash{mode}='subsystem';
$tmphash{type}=$devhash->{capability}->{type};
$tmphash{managed}="yes";
$tmphash{driver}->{name}="vfio";
$tmphash{source}->{address}->[0]=\%{$devhash->{'capability'}->{'iommuGroup'}->{'address'}};
my $newxml=XMLout(\%tmphash,RootName=>"hostdev");
if ($currstate eq 'on') {
eval {
$dom->attach_device($newxml);
};
if ($@) {
xCAT::SvrUtils::sendmsg([1,"$@"],$callback,$node);
next;
}else{
$vmxml=$dom->get_xml_description();
if ($vmxml) {
$updatetable->{kvm_nodedata}->{$node}->{xml}=$vmxml;
}
xCAT::SvrUtils::sendmsg([0,"passthrough: $devname attached to guest successfully "],$callback,$node);
}
}else{
my $hostdevfound;
$hostdevfound=0;
my $parsed=$parser->parse_string($vmxml);
my $ref=$parsed->findnodes("/domain/devices");
my @hostdevlist=$ref->[0]->findnodes("./hostdev");
#check whether the hostdev existed in guest xml
foreach my $hostdevref (@hostdevlist){
my $devaddrref=$hostdevref->findnodes("./source/address");
my $domain=$devaddrref->[0]->getAttribute("domain");
my $bus=$devaddrref->[0]->getAttribute("bus");
my $slot=$devaddrref->[0]->getAttribute("slot");
my $function=$devaddrref->[0]->getAttribute("function");
my $curdevaddr=\%{$devhash->{'capability'}->{'iommuGroup'}->{'address'}};
if(("$curdevaddr->{domain}" eq "$domain") and
("$curdevaddr->{bus}" eq "$bus") and
("$curdevaddr->{slot}" eq "$slot") and
("$curdevaddr->{function}" eq "$function")){
#hostdev existed in guest xml
$hostdevfound=1;
goto PROCESS_HOSTDEV_XML;
}
}
PROCESS_HOSTDEV_XML:
unless($hostdevfound){
#hostdev does not exist,add into guest xml
my $hostdevnode = $parser->parse_balanced_chunk($newxml);
$ref->[0]->appendChild($hostdevnode);
$vmxml=$parsed->toString;
$updatetable->{kvm_nodedata}->{$node}->{xml}=$vmxml;
}
}
}
}
}
}
sub get_disks_by_userspecs {
my $specs = shift;