Machine Learning for daily tasks

9 minute read

Published: August 14, 2015

I was tasked with planning demand for tickets for a complex Application Maintenance System that supports multiple companies. There was some historical data available, and it was invaluable. Using machine learning with sklearn, we were able to predict ticket volumes on a monthly basis with a very high degree of accuracy. Going through the usecase

import pandas as pd
import numpy as np
from pandas import DataFrame
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
from statsmodels import robust
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
csv_path = "AMS_out_v4.csv"
dataframe = pd.read_csv(csv_path)

Data Discovery:

Quick look at the data

The data is a list of incedent tickets a company had raised after they adopted a COTS software product for every month after go live. Below is a sample of the data and a table that descibes data with row count, pencetile ranges, Max, Mins

dataframe.sample(10)

	Site	Group	Months	Users	tickets	Site (Users)
111	Titans	I	13	88	15	Titans:I:88
87	Bargreens	I	7	119	0	Bargreens:I:119
8	ABC Products	A	11	60	3	ABC Products:A:60
28	ABC Products	A	2	60	24	ABC Products:A:60
10	ABC Products	A	13	60	10	ABC Products:A:60
29	ABC Products	A	3	60	9	ABC Products:A:60
79	Titans	I	4	88	4	Titans:I:88
66	Development Wires	M	6	87	6	Development Wires:M:87
119	Metalastics	I	1	45	14	Metalastics:I:45
118	Bargreens	I	2	119	10	Bargreens:I:119

dataframe.describe()

	Months	Users	tickets
count	145.000000	145.000000	145.000000
mean	7.172414	83.172414	9.689655
std	4.611798	27.544243	7.939456
min	1.000000	27.000000	0.000000
25%	3.000000	67.000000	3.000000
50%	6.000000	82.000000	8.000000
75%	11.000000	101.000000	15.000000
max	16.000000	152.000000	38.000000

dataframe[['Months','Users']].hist()
plt.show();

LOCATION ANALYSIS

Basic Location Estimates:

First we need to learn about the data we have. We do a basic location analysis (Typical Values of or data)

print(f"The mean number of tickets per month from a company: {dataframe['tickets'].mean()}")
print(f"The median number of tickets per month from a company: {dataframe['tickets'].median()}")

The mean number of tickets per month from a company: 9.689655172413794
The median number of tickets per month from a company: 8.0

Estimates based on Month from inception:

The Mean and Median of the number of tickets from the first month onwards Observation: We see that there is an initial surge during the first few months of go live and then after the system stabalizes, ticket volum tapers down.

locationAnalysis=[];
for i in range (1,12):
    analysis_dataframe_month = dataframe[dataframe['Months'] == i]
    mean = analysis_dataframe_month['tickets'].mean();
    median = analysis_dataframe_month['tickets'].median();
    locationAnalysis.append([i,mean,median]);
fig, ax = plt.subplots()
df = pd.DataFrame(locationAnalysis, columns=['month', 'mean','median'])
df.plot(x='month', ax=ax)
plt.show()

Trend Analysis from sample companies

Ticket volumes for sample companies based month wise from inception. The inital surge and tappering down after stablization is evident even for inividual companies.

def add_values(row):
    return row['Site'] + ":" + row['Group']  + ":" + str(row['Users'])

siteDictionary={};
columnsForPlotting=["Months"];
dataframe['Site (Users)'] = dataframe.apply(add_values, axis=1)
for ind in dataframe.index:
    if dataframe['Site (Users)'][ind] not in siteDictionary.keys():
        # this will have key=month and value=ticketCount
        ticketsDictionary={dataframe['Months'][ind] : dataframe['tickets'][ind]};
            # this will have key=SiteUser and value=ticketsDictionary
        siteDictionary[dataframe['Site (Users)'][ind]]=ticketsDictionary;
        columnsForPlotting.append(dataframe['Site (Users)'][ind]);
    else:
        ticketsDictionary=siteDictionary.get(dataframe['Site (Users)'][ind]);
        ticketsDictionary[dataframe['Months'][ind]]=dataframe['tickets'][ind];
    
tArray=[];
for site in columnsForPlotting[1:]:
    ticketsDictionary=siteDictionary.get(site);
    for month in range(1,12):
        if month in ticketsDictionary.keys():
            tArray.append([site,month,ticketsDictionary.get(month)]);
        else:
            tArray.append([site,month,'']);

df2 = pd.DataFrame(tArray,columns=['site', 'month', 'tickets'])
df2 = df2.pivot(index='month', columns='site', values='tickets')

fig, ax = plt.subplots()
df2.plot(ax=ax)
plt.show()

Estimates based on user size:

Scatter plot is used to indicate the ticket volume and company user size.

def makeDFWithMeanMedian(dataframe:DataFrame) -> DataFrame:
    # The plot generated has named values.
    # Converted a tupple to a data frame
    dictionaryDF={};
    for ind in dataframe.index:
        if dataframe['Users'][ind] not in dictionaryDF.keys():
            tickets=[dataframe['Site (Users)'][ind],dataframe['tickets'][ind]];
            dictionaryDF[dataframe['Users'][ind]]=tickets;
        else:
            tickets = dictionaryDF.get(dataframe['Users'][ind]);
            tickets.append(dataframe['tickets'][ind]);
    
    locations=[];
    for key,value in dictionaryDF.items():
        # appending a tupple to the list which is needed to change to dataframe easily
        locations.append((key,value[0],np.mean(value[1:]),np.median(value[1:])));
    # values,  # 1st column as index # 1st row as the column names
    # print(locations);
    newDataFrame = pd.DataFrame(locations, columns =['Users', 'Site (Users)', 'mean','median']);
    return newDataFrame;

newDataFrame=makeDFWithMeanMedian(dataframe)
# open figure + axis
fig, ax = plt.subplots()
fig.set_size_inches(12, 8)
    # plot
ax.scatter(x=newDataFrame['Users'],y=newDataFrame['mean'],c='DarkBlue')
# set labels
ax.set_xlabel('Users')
ax.set_ylabel('Mean Tickets')

# annotate points in axis   
for idx, row in newDataFrame.iterrows():
    ax.annotate(row['Site (Users)'], (row['Users'], row['mean']) )
# force matplotlib to draw the graph
plt.show()

LocationAnalysis: User Size and Group

Since there is the group impacts.

dataframe1=dataframe[dataframe['Group'] == 'A'];
dataframe2=dataframe[dataframe['Group'] == 'M'];
dataframe3=dataframe[dataframe['Group'] == 'I'];
dictionary={'A':dataframe1,'I':dataframe2,'M':dataframe3};
fig, ax = plt.subplots()
fig.set_size_inches(4, 4)
for key, dataframein in dictionary.items():
    locationAnalysis=[];
    flag = True;
    for i in range (1,10):
        analysis_dataframe_month = dataframein[dataframein['Months'] == i]
        mean = analysis_dataframe_month['tickets'].mean();
        locationAnalysis.append([i,mean]);
    df = pd.DataFrame(locationAnalysis, columns=['month', 'mean'])
    df=df.rename(columns={"mean": key})
    df.plot(x='month',ax=ax)
plt.ylabel('Mean Ticket Volume')
plt.xlabel('Month')
plt.title('Group Wise Trends')
plt.show();

Company Variant Analysis

Since the tickets are impacted by the type of compamy group they belong to, we are ploting a linar line to deomstrate the same

dataframe1=dataframe[dataframe['Group'] == 'A'];
newDataFrame=makeDFWithMeanMedian(dataframe1)
regressor = LinearRegression();
regressor.fit(newDataFrame[['Users']],newDataFrame[['mean']])
fig, ax = plt.subplots()
plt.scatter(newDataFrame['Users'],newDataFrame['mean'], color='red')
plt.plot(newDataFrame['Users'], regressor.predict(newDataFrame[['Users']]),color='red')
fig.set_size_inches(12, 8)
dataframe1=dataframe[dataframe['Group'] == 'I'];
newDataFrame=makeDFWithMeanMedian(dataframe1)
regressor = LinearRegression();
regressor.fit(newDataFrame[['Users']],newDataFrame[['mean']])
plt.scatter(newDataFrame['Users'],newDataFrame['mean'], color='blue')
plt.plot(newDataFrame['Users'], regressor.predict(newDataFrame[['Users']]),color='blue')

[<matplotlib.lines.Line2D at 0x135c1e890>]

Data Variability & Distribution Analysis For Range 0 - 4 Months and 6 -12 Months

def dataVariabilityAnalysisInitialSurge(dataframe:DataFrame,  start:int, end:int):
    print(f"Data Variability Analysis For Range {start} and {end}")
    dataframe = dataframe[dataframe['Months'] > start]
    analysis_dataframe_month = dataframe[dataframe['Months'] < end]
    #print(analysis_dataframe_month);
    standardDeviation = analysis_dataframe_month['tickets'].std()
    print(f"Standard Deviation: {standardDeviation}")
    interQuartilerange= analysis_dataframe_month['tickets'].quantile(0.75) - analysis_dataframe_month['tickets'].quantile(0.25)
    print(f"Inter Quartile Range (difference between the 25th percentile and the 75th percentile): {interQuartilerange}")
    robustMAD=robust.scale.mad(analysis_dataframe_month['tickets'])
    print(f"The robust mean absolute deviation:{robustMAD}")
    percentile = analysis_dataframe_month['Months'].quantile([0.05, 0.25, 0.5, 0.75, 0.95])
    print(f"Distribution Analysis For Range {start} and {end}")
    print(f"Percentile 0.05, 0.25, 0.5, 0.75, 0.95 \n{percentile}")
    fig = plt.figure() 
    fig.set_size_inches(4, 4)
    ax = (analysis_dataframe_month['tickets']).plot.box()
    label = f"Box Plot for range For Range {start} and {end}";
    ax.set_ylabel(label)
    plt.show()
dataVariabilityAnalysisInitialSurge(dataframe, 0, 4);

Data Variability Analysis For Range 0 and 4
Standard Deviation: 7.751999479809022
Inter Quartile Range (difference between the 25th percentile and the 75th percentile): 8.75
The robust mean absolute deviation:6.671709983275209
Distribution Analysis For Range 0 and 4
Percentile 0.05, 0.25, 0.5, 0.75, 0.95 
0.05    1.0
0.25    1.0
0.50    2.0
0.75    3.0
0.95    3.0
Name: Months, dtype: float64

dataVariabilityAnalysisInitialSurge(dataframe, 6, 12)

Data Variability Analysis For Range 6 and 12
Standard Deviation: 3.3942060810497234
Inter Quartile Range (difference between the 25th percentile and the 75th percentile): 4.25
The robust mean absolute deviation:2.965204437011204
Distribution Analysis For Range 6 and 12
Percentile 0.05, 0.25, 0.5, 0.75, 0.95 
0.05     7.0
0.25     8.0
0.50     9.0
0.75    10.0
0.95    11.0
Name: Months, dtype: float64

doGroupUserAnalysis

newDataFrame=makeDFWithMeanMedian(dataframe);
print(newDataFrame);
newDataFrame = newDataFrame.drop(['Site (Users)','median'], axis=1)
kmeans = KMeans(n_clusters=3, random_state=42);
# Fit the model to your data
X=newDataFrame.to_numpy();
cluster_labels = kmeans.fit_predict(X);

# Create a scatter plot for the first two features
fig=plt.figure(figsize=(8, 6))
plt.scatter(X[:, 0], X[:, 1], c=cluster_labels, cmap='viridis')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Data with Clusters')
plt.colorbar(label='Cluster')
plt.show();

    Users                 Site (Users)       mean  median
    67              AC Freight:A:67  18.000000    17.0
    60            ABC Products:A:60   8.214286     7.0
    82           CargoControls:A:82  19.230769    19.0
    80           TMWVSolutions:A:80  14.428571    15.0
    27  National Manufacturing:M:27   1.714286     2.0
    87       Development Wires:M:87   7.444444     6.0
    91           National Goup:M:91   7.777778     7.0
   115         Plant Controls:I:115   7.846154     6.0
   119              Bargreens:I:119   3.066667     2.0
    88                  Titans:I:88   6.500000     4.5
   45             Metalastics:I:45   3.800000     2.0
  152             MontrealCo:M:152  18.500000    20.0
  102             OntorioCom:M:102  17.750000    19.5
  101               InterRio:I:101  16.666667    16.0

Prediction

We used a simple pipeline with Support Vector Regression (SVR) to predict ticket volumes. By considering user count, group type, and the number of companies that had gone live or were stable in the past few months, we were able to accurately forecast the number of tickets each month. This approach was also very useful for predicting ticket volumes for a company during its first few months, assessing system stability during hypercare, and determining readiness for transition.

import numpy as np 
import matplotlib.pyplot as plt 
import pandas as pd
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_validate, StratifiedKFold
dataset = pd.read_csv('AMS_out_v4.csv')

X_train, X_test, y_train, y_test = train_test_split(dataset[['Months','Users','Group']], dataset['tickets'].values, test_size=0.1, random_state=99)

multicolumn_prep = ColumnTransformer([
                                    ('encode', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), ['Group']),
                                    ('scaler', StandardScaler(), ['Months','Users'])
                                    ],
                                     remainder='passthrough')
space_titanic_pipeline = Pipeline([('preprocessing', multicolumn_prep),
                                    ('model',SVR(kernel="rbf", C=100, gamma="auto",epsilon=0.0001, coef0 =0.001,degree=2))])

model = space_titanic_pipeline.fit(X_train, y_train);
y_svr_pred = model.predict(X_test);

temp = pd.DataFrame();
temp = X_test;
temp['Actual']=y_test
temp['Predicted']=y_svr_pred

def analyzeReults(y_test,y_pred):
    rmse = np.sqrt(mean_squared_error(y_test,y_pred))
    print(f"The root mean square error is {rmse}");
    r2 = r2_score(y_test,y_pred)
    print(f"The r2 score is {r2}");
    resultActualDF=pd.DataFrame()
    resultPredDF=pd.DataFrame()
    index = np.arange(start=1, stop= len(y_test)+1)
    pd.Series(index);
    resultActualDF["tickets"] = y_test;
    resultPredDF["tickets"]= y_pred;
    plt.scatter(pd.Series(index), resultActualDF,pd.Series(index))
    plt.scatter(pd.Series(index), resultPredDF,pd.Series(index))
    plt.show(); 


analyzeReults(y_test,y_svr_pred);

scores = cross_val_score(model, X_train, y_train, scoring="r2", cv=8)
print(f"scores:{scores}")
print(f"mean score {scores.mean()}")
print(f"std deviation of score {scores.std()}");
print("Comparing Actual and Predicted.")
temp

The root mean square error is 3.2867599548019504
The r2 score is 0.5762503530142633

scores:[0.38391399 0.58976798 0.52677144 0.46689801 0.28780864 0.69530423
 0.41316553 0.4792549 ]
mean score 0.4803605905328669
std deviation of score 0.11800453109629407

	Months	Users	Group	Actual	Predicted
101	16	115	I	6	2.847273
8	11	60	A	3	5.295125
57	2	27	M	3	1.600696
98	10	119	I	2	2.051616
86	6	88	I	3	4.144913
81	11	115	I	3	2.489424
69	7	91	M	7	5.933006
131	4	115	I	16	8.811666
70	7	27	M	2	0.939125
9	12	60	A	3	4.984632
16	13	82	A	20	14.834038
4	14	67	A	8	9.062198
64	5	87	M	6	6.707223
120	3	119	I	4	9.618984
102	11	119	I	7	1.946395

Share on

Twitter Facebook LinkedIn

Kris NUNES