confluent/confluent_client/bin/stats

#!/usr/bin/python2
# vim: tabstop=4 shiftwidth=4 softtabstop=4

# Copyright 2019 Lenovo
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import csv
import fcntl
import io
import numpy as np

import os
import subprocess
import sys

# sixel is imported opportunistically: when the module is missing the script
# still loads, it just ends up without a DumbWriter class.
try:
    import sixel

    class DumbWriter(sixel.SixelWriter):
        """SixelWriter subclass whose restore_position hook does nothing."""

        def restore_position(self, output):
            """Ignore the request; nothing is written to output."""
            return None
except ImportError:
    pass


def plot(gui, output, plotdata, bins):
    """Draw a histogram of plotdata with matplotlib.

    The histogram is delivered in exactly one of three ways:
    shown in an interactive window (gui true), saved to a file (gui false
    and output given), or drawn on the terminal through the sixel writer
    (gui false and no output).

    :param gui: True to open the histogram in a graphical window.
    :param output: Filename handed to savefig(); ignored when gui is true.
    :param plotdata: Sequence of numbers to histogram.
    :param bins: Bin specification passed straight to matplotlib's hist().
    :returns: Tuple (n, bins) holding the first two values returned by hist().
    """
    import matplotlib as mpl
    # Backend names are compared case-insensitively so 'Agg' and 'agg' are
    # both recognised as the non-interactive backend.
    if gui and mpl.get_backend().lower() == 'agg':
        sys.stderr.write('Error: No GUI backend available and -g specified!\n')
    if not gui:
        # Select the non-interactive backend before pyplot is imported.
        mpl.use('Agg')
    import matplotlib.pyplot as plt
    n, bins, patches = plt.hist(plotdata, bins)
    if gui:
        # Only open a window when one was asked for; calling show() on the
        # Agg backend merely produces a warning.
        plt.show()
        return n, bins
    if output:
        plt.savefig(output)
        return n, bins
    # Terminal output: DumbWriter only exists when the optional sixel module
    # could be imported, so report its absence instead of raising NameError.
    try:
        writer_cls = DumbWriter
    except NameError:
        sys.stderr.write(
            'Error: sixel output requested but the sixel module is not '
            'available\n')
        return n, bins
    tdata = io.BytesIO()
    plt.savefig(tdata)
    writer = writer_cls()
    writer.draw(tdata)
    return n, bins

def textplot(plotdata, bins):
    """Print a horizontal text bar chart of the histogram of plotdata.

    One line is printed per bin: the lower edge of the bin formatted with
    one decimal place and right-aligned, a '|' separator, then a run of '='
    characters scaled so that the fullest bin fills the remainder of an
    80 column line.

    :param plotdata: Sequence of numbers to histogram (may be empty).
    :param bins: Bin specification passed to numpy.histogram().
    :returns: Tuple (n, bins) exactly as returned by numpy.histogram().
    """
    n, bins = np.histogram(plotdata, bins)
    labels = ['{0:0.1f}'.format(edge) for edge in bins]
    # Input is primarily piped in, which makes the real terminal width hard
    # to obtain, so a fixed 80 column layout is used.
    width = 80
    # The widest label among all edges (including the final edge, which is
    # never printed) determines the label column, as it always has.
    labelwidth = max([0] + [len(lab) for lab in labels])
    width -= labelwidth + 1
    labelfmt = '{{0:>{0}s}}|'.format(labelwidth)
    maxn = float(max(n)) if len(n) else 0.0
    for label, count in zip(labels, n):
        # When every bin is empty there is nothing to scale against; draw
        # empty bars rather than dividing by zero.
        if maxn:
            barlen = int(np.round((count / maxn) * width))
        else:
            barlen = 0
        print(labelfmt.format(label) + '=' * barlen)
    return n, bins

# Command line definition.  Input is read from standard input; the options
# select the column to analyze and which histogram renderings to produce.
# NOTE(review): nothing visible in this script reads 'histogram'; it is kept
# only so the module-level name does not disappear.
histogram = False
aparser = argparse.ArgumentParser(description='Quick access to common statistics')
aparser.add_argument('-c', type=int, default=0, help='Column number to analyze (default is last column)')
aparser.add_argument('-d', default=None, help='Value used to separate columns')
aparser.add_argument('-x', default=False, action='store_true', help='Output histogram in sixel format')
# type=int makes argparse reject a non-numeric count with a usage error.
aparser.add_argument('-s', type=int, default=0, help='Number of header lines to skip before processing')
aparser.add_argument('-g', default=False, action='store_true', help='Open histogram in separate graphical window')
aparser.add_argument('-o', default=None, help='Output histogram to the specified filename in PNG format')
aparser.add_argument('-t', default=False, action='store_true', help='Output a histogram in text format')
# The node listing needs histogram bins, which only -x, -g, -o and -t produce.
aparser.add_argument('-v', default=False, action='store_true', help='Attempt to list nodes relevant to each histogram bar (requires -x, -g, -o, or -t)')
aparser.add_argument('-b', type=int, default=10, help='Number of bins to use in histogram (default is 10)')
args = aparser.parse_args(sys.argv[1:])
# Read the samples from standard input into plotdata, remembering which
# node (first field, optionally 'node:' prefixed) reported each value.
plotdata = []
# Skip the requested number of header lines.  The loop deliberately runs one
# extra time: the last line it reads is the first line of data.  A negative
# count is treated as zero so that a line is always read.
headlines = max(int(args.s), 0)
while headlines >= 0:
    data = sys.stdin.readline()
    headlines -= 1
# Without -d, guess the delimiter from the first data line.
if args.d:
    delimiter = args.d
elif '\t' in data:
    delimiter = '\t'
elif ' ' in data:
    delimiter = ' '
elif ',' in data:
    delimiter = ','
else:
    delimiter = ' '  # handle single column
rawline = data
data = list(csv.reader([rawline], delimiter=delimiter))[0]
# Maps each observed value to the set of node names that reported it.
nodebydatum = {}
# -c is 1-based; the default of 0 becomes -1, which selects the last column.
idx = args.c - 1
autoidx = False
# An empty parsed row (end of input or a blank line) ends the loop.
while data:
    if ':' in data[0]:
        node, data[0] = data[0].split(':', 1)
    else:
        node = data[0]
    if idx == -1 and not autoidx:
        # No usable column chosen yet: starting at the last column, step
        # left until a field parses as a number, then keep using that
        # column for every following line.
        while not autoidx:
            try:
                datum = float(data[idx])
            except ValueError:
                idx -= 1
                continue
            except IndexError:
                sys.stderr.write('Unable to identify a numerical column\n')
                sys.exit(1)
            autoidx = True
    else:
        try:
            datum = float(data[idx])
        except (ValueError, IndexError):
            # Report the offending line instead of dying with a traceback.
            sys.stderr.write(
                'Unable to read a numerical value from line: {0}\n'.format(
                    rawline.rstrip('\r\n')))
            sys.exit(1)
    if node:
        nodebydatum.setdefault(datum, set()).add(node)
    plotdata.append(datum)
    rawline = sys.stdin.readline()
    data = list(csv.reader([rawline], delimiter=delimiter))[0]
if not plotdata:
    # The statistics below cannot be computed on an empty sample.
    sys.stderr.write('No data to analyze\n')
    sys.exit(1)
# Render whichever histograms were requested, then print the summary line.
# n remains None when no histogram option was given.
n = None
if args.x or args.o or args.g:
    n, bins = plot(args.g, args.o, plotdata, bins=args.b)
if args.t:
    n, bins = textplot(plotdata, bins=args.b)
# The tuple order matches the positional indexes used in the format string.
stats = (
    np.median(plotdata),
    np.mean(plotdata),
    np.std(plotdata),
    np.min(plotdata),
    np.max(plotdata),
    len(plotdata),
    np.sum(plotdata),
)
print('Samples: {5} Min: {3} Median: {0} Mean: {1} Max: {4} StandardDeviation: {2} Sum: {6}'.format(*stats))
# Verbose mode: list the node names whose values fall into each populated
# range of the histogram.  Requires bin edges from plot() or textplot().
if args.v and n is not None and nodebydatum:
    print('')
    highest = np.max(plotdata)
    # currbin is the lower edge of the range being collected; bins holds
    # the edges that have not been passed yet.
    currbin = bins[0]
    bins = bins[1:]
    currbinmembers = []
    for datum in sorted(nodebydatum):
        # Guard against running out of edges before indexing bins[0].
        if len(bins) and datum > bins[0]:
            # The value lies past the current range: report what has been
            # collected so far, then skip every edge below the value.
            nextbin = None
            endbin = bins[0]
            while len(bins) and bins[0] < datum:
                nextbin = bins[0]
                bins = bins[1:]
            # Compare against None explicitly: an edge of exactly 0.0 is a
            # valid lower bound and must not be mistaken for "unset".
            if nextbin is None:
                nextbin = highest
            print('Entries between {0} and {1}'.format(currbin, endbin))
            currbin = nextbin
            print('-' * 80)
            print(','.join(sorted(currbinmembers)))
            print('')
            print('')
            currbinmembers = []
        currbinmembers.extend(nodebydatum[datum])
    if currbinmembers:
        # Whatever is left belongs to the final range, which ends at the
        # largest sample.
        print('Entries between {0} and {1}'.format(currbin, highest))
        print('-' * 80)
        print(','.join(sorted(currbinmembers)))
        print('')
        print('')
Make /usr/bin/env python point to python2 Same as before, more RHEL8 compatibility changes 2019-09-23 11:04:52 -04:00			`#!/usr/bin/python2`
Remove use of tmp file in stats 2019-04-25 13:59:15 -04:00			`# vim: tabstop=4 shiftwidth=4 softtabstop=4`

			`# Copyright 2019 Lenovo`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`

Flesh out stats with arguments 2019-04-25 14:45:47 -04:00			`import argparse`
Add support for CSV formatted data Presume CSV semantics for input. 2019-05-10 11:08:12 -04:00			`import csv`
Add options to stats Implement verbose, text plot, and custom select bins 2019-04-26 16:04:01 -04:00			`import fcntl`
Remove use of tmp file in stats 2019-04-25 13:59:15 -04:00			`import io`
Add a prototype stats command for CLI commands 2019-04-25 13:51:50 -04:00			`import numpy as np`
Flesh out stats with arguments 2019-04-25 14:45:47 -04:00
Add a prototype stats command for CLI commands 2019-04-25 13:51:50 -04:00			`import os`
			`import subprocess`
			`import sys`

Move sixel under opportunistic import Do not require sixel to run stats. 2021-01-14 15:49:30 -05:00			`try:`
			`import sixel`

			`class DumbWriter(sixel.SixelWriter):`
			`def restore_position(self, output):`
			`return`
			`except ImportError:`
			`pass`
Add a prototype stats command for CLI commands 2019-04-25 13:51:50 -04:00

Add options to stats Implement verbose, text plot, and custom select bins 2019-04-26 16:04:01 -04:00			`def plot(gui, output, plotdata, bins):`
Flesh out stats with arguments 2019-04-25 14:45:47 -04:00			`import matplotlib as mpl`
Update stats Updated stats script with lines 40-41 Added: if gui and mpl.get_backend() == 'agg': sys.stderr.write('Error: No GUI backend available and -g specified!\n') 2021-03-15 13:57:29 +02:00			`if gui and mpl.get_backend() == 'agg':`
			`sys.stderr.write('Error: No GUI backend available and -g specified!\n')`
Flesh out stats with arguments 2019-04-25 14:45:47 -04:00			`if not gui:`
			`mpl.use('Agg')`
			`import matplotlib.pyplot as plt`
Add options to stats Implement verbose, text plot, and custom select bins 2019-04-26 16:04:01 -04:00			`n, bins, patches = plt.hist(plotdata, bins)`
Flesh out stats with arguments 2019-04-25 14:45:47 -04:00			`plt.show()`
			`if not gui:`
			`if output:`
			`tdata = output`
			`else:`
			`tdata = io.BytesIO()`
			`plt.savefig(tdata)`
			`if not gui and not output:`
			`writer = DumbWriter()`
			`writer.draw(tdata)`
Add options to stats Implement verbose, text plot, and custom select bins 2019-04-26 16:04:01 -04:00			`return n, bins`

			`def textplot(plotdata, bins):`
			`n, bins = np.histogram(plotdata, bins)`
			`labels = []`
			`for bin in bins:`
			`labels.append('{0:0.1f}'.format(bin))`
			`width = 80`
			`# Since this will be primarily piped into, hard to get`
			`# terminal width`
			`labelwidth = 0`
			`for lab in labels:`
			`if len(lab) > labelwidth:`
			`labelwidth = len(lab)`
			`width -= (labelwidth) + 1`
			`labelfmt = '{{0:>{0}s}}\|'.format(labelwidth)`
			`maxn = 0.0`
			`for lgth in n:`
			`if lgth > maxn:`
			`maxn = float(lgth)`
			`for i in range(len(n)):`
			`print(labelfmt.format(labels[i]) + '=' * int(np.round((n[i]/maxn) * width)))`
			`return n, bins`
Flesh out stats with arguments 2019-04-25 14:45:47 -04:00
			`histogram = False`
			`aparser = argparse.ArgumentParser(description='Quick access to common statistics')`
Add options to stats Implement verbose, text plot, and custom select bins 2019-04-26 16:04:01 -04:00			`aparser.add_argument('-c', type=int, default=0, help='Column number to analyze (default is last column)')`
Add support for CSV formatted data Presume CSV semantics for input. 2019-05-10 11:08:12 -04:00			`aparser.add_argument('-d', default=None, help='Value used to separate columns')`
			`aparser.add_argument('-x', default=False, action='store_true', help='Output histogram in sixel format')`
			`aparser.add_argument('-s', default=0, help='Number of header lines to skip before processing')`
Flesh out stats with arguments 2019-04-25 14:45:47 -04:00			`aparser.add_argument('-g', default=False, action='store_true', help='Open histogram in separate graphical window')`
			`aparser.add_argument('-o', default=None, help='Output histogram to the specified filename in PNG format')`
Add options to stats Implement verbose, text plot, and custom select bins 2019-04-26 16:04:01 -04:00			`aparser.add_argument('-t', default=False, action='store_true', help='Output a histogram in text format')`
			`aparser.add_argument('-v', default=False, action='store_true', help='Attempt to list nodes relevant to each histogram bar (requires -s, -o, or -t)')`
			`aparser.add_argument('-b', type=int, default=10, help='Number of bins to use in histogram (default is 10)')`
Flesh out stats with arguments 2019-04-25 14:45:47 -04:00			`args = aparser.parse_args(sys.argv[1:])`
Add a prototype stats command for CLI commands 2019-04-25 13:51:50 -04:00			`plotdata = []`
Add support for CSV formatted data Presume CSV semantics for input. 2019-05-10 11:08:12 -04:00			`headlines = int(args.s)`
			`while headlines >= 0:`
			`data = sys.stdin.readline()`
			`headlines -= 1`
			`if args.d:`
			`delimiter = args.d`
			`else:`
			`if '\t' in data:`
			`delimiter = '\t'`
			`elif ' ' in data:`
			`delimiter = ' '`
			`elif ',' in data:`
			`delimiter = ','`
			`else:`
			`delimiter = ' ' # handle single column`
			`data = list(csv.reader([data], delimiter=delimiter))[0]`
Add options to stats Implement verbose, text plot, and custom select bins 2019-04-26 16:04:01 -04:00			`nodebydatum = {}`
Add auto-index determination to stats This allows it to auto-skip over units, for example. 2019-05-10 10:34:56 -04:00			`idx = args.c - 1`
			`autoidx = False`
Add a prototype stats command for CLI commands 2019-04-25 13:51:50 -04:00			`while data:`
Add options to stats Implement verbose, text plot, and custom select bins 2019-04-26 16:04:01 -04:00			`node = None`
Add support for CSV formatted data Presume CSV semantics for input. 2019-05-10 11:08:12 -04:00			`if ':' in data[0]:`
			`node, data[0] = data[0].split(':', 1)`
			`else:`
			`node = data[0]`
			`if idx == -1 and not autoidx:`
Add auto-index determination to stats This allows it to auto-skip over units, for example. 2019-05-10 10:34:56 -04:00			`while not autoidx:`
			`try:`
Add support for CSV formatted data Presume CSV semantics for input. 2019-05-10 11:08:12 -04:00			`datum = float(data[idx])`
Add auto-index determination to stats This allows it to auto-skip over units, for example. 2019-05-10 10:34:56 -04:00			`except ValueError:`
			`idx -= 1`
			`continue`
			`except IndexError:`
			`sys.stderr.write('Unable to identify a numerical column\n')`
			`sys.exit(1)`
			`autoidx = True`
			`else:`
Add support for CSV formatted data Presume CSV semantics for input. 2019-05-10 11:08:12 -04:00			`datum = float(data[idx])`
Add options to stats Implement verbose, text plot, and custom select bins 2019-04-26 16:04:01 -04:00			`if node:`
			`if datum in nodebydatum:`
			`nodebydatum[datum].add(node)`
			`else:`
			`nodebydatum[datum] = set([node])`
Add a prototype stats command for CLI commands 2019-04-25 13:51:50 -04:00			`plotdata.append(datum)`
			`data = sys.stdin.readline()`
Add support for CSV formatted data Presume CSV semantics for input. 2019-05-10 11:08:12 -04:00			`data = list(csv.reader([data], delimiter=delimiter))[0]`
Add options to stats Implement verbose, text plot, and custom select bins 2019-04-26 16:04:01 -04:00			`n = None`
Add support for CSV formatted data Presume CSV semantics for input. 2019-05-10 11:08:12 -04:00			`if args.g or args.o or args.x:`
Add options to stats Implement verbose, text plot, and custom select bins 2019-04-26 16:04:01 -04:00			`n, bins = plot(args.g, args.o, plotdata, bins=args.b)`
			`if args.t:`
			`n, bins = textplot(plotdata, bins=args.b)`
Add auto-index determination to stats This allows it to auto-skip over units, for example. 2019-05-10 10:34:56 -04:00			`print('Samples: {5} Min: {3} Median: {0} Mean: {1} Max: {4} StandardDeviation: {2} Sum: {6}'.format(np.median(plotdata), np.mean(plotdata), np.std(plotdata), np.min(plotdata), np.max(plotdata), len(plotdata), np.sum(plotdata)))`
Add options to stats Implement verbose, text plot, and custom select bins 2019-04-26 16:04:01 -04:00			`if args.v and n is not None and nodebydatum:`
			`print('')`
			`currbin = bins[0]`
			`bins = bins[1:]`
			`currbinmembers = []`
			`for datum in sorted(nodebydatum):`
Fix the verbose output boundaries. 2019-04-26 16:17:28 -04:00			`if datum > bins[0]:`
Actually fix the verbose range 2019-04-26 16:29:33 -04:00			`nextbin = None`
Fix display of final bin members in verbose 2019-04-26 16:51:44 -04:00			`endbin = bins[0]`
Actually fix the verbose range 2019-04-26 16:29:33 -04:00			`while len(bins) and bins[0] < datum:`
			`nextbin = bins[0]`
Fix the verbose output boundaries. 2019-04-26 16:17:28 -04:00			`bins = bins[1:]`
Fix indentation error 2019-04-26 16:40:47 -04:00			`if not nextbin:`
			`nextbin = np.max(plotdata)`
Fix display of final bin members in verbose 2019-04-26 16:51:44 -04:00			`print('Entries between {0} and {1}'.format(currbin, endbin))`
Have the range be more precise on verbose 2019-04-26 16:43:53 -04:00			`currbin = nextbin`
Fix indentation error 2019-04-26 16:40:47 -04:00			`print('-' * 80)`
			`print(','.join(sorted(currbinmembers)))`
			`print('')`
			`print('')`
			`currbinmembers = []`
Add options to stats Implement verbose, text plot, and custom select bins 2019-04-26 16:04:01 -04:00			`for node in nodebydatum[datum]:`
			`currbinmembers.append(node)`
Fix display of final bin members in verbose 2019-04-26 16:51:44 -04:00			`if currbinmembers:`
			`print('Entries between {0} and {1}'.format(currbin, np.max(plotdata)))`
			`print('-' * 80)`
			`print(','.join(sorted(currbinmembers)))`
			`print('')`
			`print('')`