Grocery Website AB Testing Notebook
In [1]:
#https://www.kaggle.com/code/songulerdem/a-b-testing-on-grocery-website-data
Project Goal¶
Using the dataset here, we test whether a change made to a grocery store's web interface increases the number of clicks on the target page.
In [1]:
import pandas as pd
In [2]:
df = pd.read_csv('grocerywebsiteabtestdata.csv')
In [3]:
df
Out[3]:
| | RecordID | IP Address | LoggedInFlag | ServerID | VisitPageFlag |
|---|---|---|---|---|---|
| 0 | 1 | 39.13.114.2 | 1 | 2 | 0 |
| 1 | 2 | 13.3.25.8 | 1 | 1 | 0 |
| 2 | 3 | 247.8.211.8 | 1 | 1 | 0 |
| 3 | 4 | 124.8.220.3 | 0 | 3 | 0 |
| 4 | 5 | 60.10.192.7 | 0 | 2 | 0 |
| ... | ... | ... | ... | ... | ... |
| 184583 | 184584 | 114.8.104.1 | 0 | 1 | 0 |
| 184584 | 184585 | 207.2.110.5 | 0 | 2 | 1 |
| 184585 | 184586 | 170.13.31.9 | 0 | 2 | 0 |
| 184586 | 184587 | 195.14.92.3 | 0 | 3 | 0 |
| 184587 | 184588 | 172.12.115.8 | 0 | 2 | 1 |
184588 rows × 5 columns
In [4]:
# Aggregate per unique visitor: total page visits for each (IP, login status, server)
df = df.groupby(["IP Address", "LoggedInFlag", "ServerID"])["VisitPageFlag"].sum()
In [5]:
df
Out[5]:
IP Address LoggedInFlag ServerID
0.0.108.2 0 1 0
0.0.109.6 1 1 0
0.0.111.8 0 3 0
0.0.160.9 1 2 0
0.0.163.1 0 2 0
..
99.9.53.7 1 2 0
99.9.65.2 0 2 0
99.9.79.6 1 2 0
99.9.86.3 0 1 1
99.9.86.9 0 1 0
Name: VisitPageFlag, Length: 99763, dtype: int64
In [6]:
df = df.reset_index(name="VisitPageFlagSum")
df.head()
Out[6]:
| | IP Address | LoggedInFlag | ServerID | VisitPageFlagSum |
|---|---|---|---|---|
| 0 | 0.0.108.2 | 0 | 1 | 0 |
| 1 | 0.0.109.6 | 1 | 1 | 0 |
| 2 | 0.0.111.8 | 0 | 3 | 0 |
| 3 | 0.0.160.9 | 1 | 2 | 0 |
| 4 | 0.0.163.1 | 0 | 2 | 0 |
In [7]:
# Binarize: 1 if the visitor ever visited the page, 0 otherwise
df["VisitPageFlag"] = df["VisitPageFlagSum"].apply(lambda x: 1 if x != 0 else 0)
df.head()
Out[7]:
| | IP Address | LoggedInFlag | ServerID | VisitPageFlagSum | VisitPageFlag |
|---|---|---|---|---|---|
| 0 | 0.0.108.2 | 0 | 1 | 0 | 0 |
| 1 | 0.0.109.6 | 1 | 1 | 0 | 0 |
| 2 | 0.0.111.8 | 0 | 3 | 0 | 0 |
| 3 | 0.0.160.9 | 1 | 2 | 0 | 0 |
| 4 | 0.0.163.1 | 0 | 2 | 0 | 0 |
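As a sketch, the two steps above (summing visits per visitor, then binarizing the sum) can be collapsed into a single aggregation, since `max()` over 0/1 flags equals "1 if any visit". The miniature DataFrame below is hypothetical, but the column names follow the notebook:

```python
import pandas as pd

# Hypothetical miniature of the raw data: a repeated IP with 0/1 visit flags
raw = pd.DataFrame({
    "IP Address": ["39.13.114.2", "39.13.114.2", "13.3.25.8"],
    "LoggedInFlag": [1, 1, 1],
    "ServerID": [2, 2, 1],
    "VisitPageFlag": [0, 1, 0],
})

# max() over the group is 1 iff the visitor ever saw the page,
# which matches sum() followed by "1 if x != 0 else 0"
dedup = (raw.groupby(["IP Address", "LoggedInFlag", "ServerID"])["VisitPageFlag"]
            .max()
            .reset_index())
```

This avoids carrying the intermediate `VisitPageFlagSum` column, at the cost of losing the per-visitor visit counts.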
In [8]:
#df['VisitPageFlag'].value_counts()
In [9]:
# Server 1 traffic forms the Test group; servers 2 and 3 form the Control group
df['group'] = df['ServerID'].map({1: 'Test', 2: 'Control', 3: 'Control'})
df.drop(['ServerID', 'VisitPageFlagSum'], axis=1, inplace=True)
In [10]:
df.head()
Out[10]:
| | IP Address | LoggedInFlag | VisitPageFlag | group |
|---|---|---|---|---|
| 0 | 0.0.108.2 | 0 | 0 | Test |
| 1 | 0.0.109.6 | 1 | 0 | Test |
| 2 | 0.0.111.8 | 0 | 0 | Control |
| 3 | 0.0.160.9 | 1 | 0 | Control |
| 4 | 0.0.163.1 | 0 | 0 | Control |
In [11]:
df.to_csv('new.csv')  # save the cleaned data for AB testing (the index is saved too, appearing later as 'Unnamed: 0')
In [14]:
# Data manipulation: first we inspect the data; if there are any problems, we fix them.
# import data_manipulation from AB_experiment
from AB_experiment import data_manipulation
# create an instance to call data_manipulation
dm = data_manipulation()
data = 'new.csv'
column1 = "group"
column2 = ["VisitPageFlag"]
quartile1 = 0.25
quartile3 = 0.75
info = True
download_df = False
filename = 'new'
dm.data_info(data, column1, column2, quartile1, quartile3, info, download_df, filename)
Out[14]:
{'1': ['dataframe_shape', {'Observations': 99763, 'Column': 5}],
'2': ['missing_data_info', {'No missing values'}],
'3': ['outliers_info',
[{'variable_name': 'VisitPageFlag',
'lower_fence': 0.0,
'upper_fence': 0.0,
'Number_of_obs_less_than_lower_fence': 0,
'Number_of_obs_greater_than_upper_fence': 9978,
'lower_array': array([], dtype=int64),
'upper_array': array([ 7, 13, 16, 29, 34, 50, 74, 77, 95, 120], dtype=int64)}]],
'4': ['data_types',
[{'object_values': "['IP Address', 'group']"},
{'float_values': '[]'},
{'int_values': ['Unnamed: 0', 'LoggedInFlag', 'VisitPageFlag']},
{'bool_val': []}]],
'5': ['numerical_Variables', ['Unnamed: 0', 'LoggedInFlag', 'VisitPageFlag']],
'6': ['Categorical_variables', ['IP Address', 'group']],
'7': [{'Unique values count for variable': LoggedInFlag
1 50250
0 49513},
{'Unique values count for variable': VisitPageFlag
0 89785
1 9978},
{'Unique values count for variable': group
Control 66460
Test 33303}],
'8': ['Descriptive statistics-numerical_Variables',
Unnamed: 0 LoggedInFlag VisitPageFlag
count 99763.00000 99763.000000 99763.000000
mean 49881.00000 0.503694 0.100017
std 28799.24179 0.499989 0.300024
min 0.00000 0.000000 0.000000
25% 24940.50000 0.000000 0.000000
50% 49881.00000 1.000000 0.000000
75% 74821.50000 1.000000 0.000000
max 99762.00000 1.000000 1.000000,
'********************',
'Descriptive statistics-Categorical_variables',
IP Address group
count 99763 99763
unique 99516 2
top 146.14.105.1 Control
freq 2 66460,
'********************'],
'9': {'category_stats': [ VisitPageFlag
count median mean std min max
group
Control 66460 0.0 0.092251 0.289382 0 1
Test 33303 0.0 0.115515 0.319647 0 1]},
'10': ['Dataframe',
Unnamed: 0 IP Address LoggedInFlag VisitPageFlag group
0 0 0.0.108.2 0 0 Test
1 1 0.0.109.6 1 0 Test
2 2 0.0.111.8 0 0 Control
3 3 0.0.160.9 1 0 Control
4 4 0.0.163.1 0 0 Control]}
In [21]:
# The output above shows that LoggedInFlag and VisitPageFlag have the wrong data type (int instead of bool),
# so we change them using the change_variables function
data = 'new.csv'
variables = ['LoggedInFlag', 'VisitPageFlag']
dtype = ['bool', 'bool']
drop_variables = ['Unnamed: 0']
download_df = True
filename = 'new'
dm.change_variables(data, variables, dtype, drop_variables, download_df, filename)
Out[21]:
[{'Variable1': ['LoggedInFlag', dtype('bool')]},
{'Variable2': ['VisitPageFlag', dtype('bool')]}]
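The `change_variables` call comes from the notebook's own `AB_experiment` helper module, so its signature is not a standard API. In plain pandas the same step is a drop plus `astype`; the small DataFrame below is a hypothetical slice of `new.csv` (including the leftover `'Unnamed: 0'` index column), used only to illustrate:

```python
import pandas as pd

# Hypothetical slice of new.csv after the earlier to_csv (index saved as "Unnamed: 0")
df = pd.DataFrame({
    "Unnamed: 0": [0, 1],
    "IP Address": ["0.0.108.2", "0.0.109.6"],
    "LoggedInFlag": [0, 1],
    "VisitPageFlag": [0, 0],
    "group": ["Test", "Test"],
})

# Drop the leftover index column and cast the 0/1 flags to booleans
df = df.drop(columns=["Unnamed: 0"])
df[["LoggedInFlag", "VisitPageFlag"]] = df[["LoggedInFlag", "VisitPageFlag"]].astype(bool)
```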
In [23]:
# After changing the data types, we check data_info again
data = 'new.csv'
col1 = "group"
col2 = ["VisitPageFlag"]
quartile1 = 0.25
quartile3 = 0.75
info = True
download_df = False
filename = 'new'
dm.data_info(data, col1, col2, quartile1, quartile3, info, download_df, filename)
Out[23]:
{'1': ['dataframe_shape', {'Observations': 99763, 'Column': 4}],
'2': ['missing_data_info', {'No missing values'}],
'3': ['outliers_info', []],
'4': ['data_types',
[{'object_values': "['IP Address', 'group']"},
{'float_values': '[]'},
{'int_values': []},
{'bool_val': ['LoggedInFlag', 'VisitPageFlag']}]],
'5': ['numerical_Variables', []],
'6': ['Categorical_variables',
['IP Address', 'LoggedInFlag', 'VisitPageFlag', 'group']],
'7': [{'Unique values count for variable': LoggedInFlag
True 50250
False 49513},
{'Unique values count for variable': VisitPageFlag
False 89785
True 9978},
{'Unique values count for variable': group
Control 66460
Test 33303}],
'8': ['Descriptive statistics-numerical_Variables',
IP Address LoggedInFlag VisitPageFlag group
count 99763 99763 99763 99763
unique 99516 2 2 2
top 146.14.105.1 True False Control
freq 2 50250 89785 66460,
'********************',
'Descriptive statistics-Categorical_variables',
IP Address LoggedInFlag VisitPageFlag group
count 99763 99763 99763 99763
unique 99516 2 2 2
top 146.14.105.1 True False Control
freq 2 50250 89785 66460,
'********************'],
'9': {'category_stats': []},
'10': ['Dataframe',
IP Address LoggedInFlag VisitPageFlag group
0 0.0.108.2 False False Test
1 0.0.109.6 True False Test
2 0.0.111.8 False False Control
3 0.0.160.9 True False Control
4 0.0.163.1 False False Control]}
In [24]:
# The output above shows no outliers, no missing values,
# and correct data types for all variables.
# Now we check the assumptions that determine which statistical test to use for AB testing
# import stats_test from AB_experiment
from AB_experiment import stats_test
# create an instance to call stats_test
st = stats_test()
data = 'new.csv'
sample_size = 30000
group = "group"
group1_val = 'Control'
group2_val = 'Test'
target = "VisitPageFlag"
alpha = 0.05
paired_data = False
st.AB_Test_assumption(data, sample_size, group, group1_val, group2_val, target, alpha, paired_data)
Out[24]:
({'Target variable is boolean data type': 'Use Chi-Squared Test'},
{'Note': 'If our data involve time-to-event or survival analysis (e.g., time until a user completes a task), we can use methods such as the log-rank test'})
Based on the assumption check, we use the Chi-Squared Test for AB Testing¶
Define the null and alternative hypotheses:
- Null hypothesis (H0): There is no significant difference in the click proportions of the two groups.
- Alternative hypothesis (Ha): There is a significant difference in the click proportions of the two groups.
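The `chi_squared_test` helper used below belongs to the notebook's `AB_experiment` module. The same test can be sketched with `scipy.stats.chi2_contingency` on the full-dataset contingency table; the counts here are reconstructed from the group sizes and mean click rates reported by `data_info`, so they are approximate:

```python
from scipy.stats import chi2_contingency

# Clicks vs non-clicks per group, reconstructed from the data_info output:
# Control: 66460 visitors, mean VisitPageFlag ~ 0.092251 -> ~6131 clicks
# Test:    33303 visitors, mean VisitPageFlag ~ 0.115515 -> ~3847 clicks
table = [[6131, 66460 - 6131],   # Control: [clicked, did not click]
         [3847, 33303 - 3847]]   # Test

chi2, p_value, dof, expected = chi2_contingency(table)
```

On the full dataset the p-value is far below 0.05, consistent with the sampled test result below.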
In [25]:
# perform the chi-square test
data = 'new.csv'
sample_size = 30000
col1 = 'group'
col1_value1 = 'Control'
col1_value2 = 'Test'
col2 = 'VisitPageFlag'
alpha = 0.05
reverse_experiment = False
st.chi_squared_test(data, sample_size, col1, col1_value1, col1_value2, col2, alpha, reverse_experiment)
Out[25]:
{'Status': 'We can reject H0 => group Test is more successful',
'p_value': 4.9953508084152615e-23,
'alpha': 0.05,
'Chi-square statistic': 97.64895544990668,
'95.0% Confidence interval for the difference in proportions': (-0.02950855925380273,
-0.019758107412863952)}
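The confidence interval reported above can be approximately reproduced with a Wald interval for the difference in two proportions. The sample proportions below are the ones quoted in the conclusion (Control 0.0914, Test 0.11603, with 30,000 draws per group), so this is a sketch rather than the helper's exact computation:

```python
import math

p1, p2 = 0.0914, 0.11603    # Control and Test click rates in the 30,000-per-group sample
n1 = n2 = 30000

diff = p1 - p2
se = math.sqrt(p1 * (1 - p1) / n1 + p2 * (1 - p2) / n2)
z = 1.959964                # two-sided 95% normal quantile

# Wald 95% CI for the difference in proportions (Control minus Test)
ci = (diff - z * se, diff + z * se)
```

Both endpoints are negative, matching the reported interval: the Control click rate is lower than the Test click rate.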
Conclusion:¶
- Looking directly at the click rates of the two groups in the tested sample (Control: 0.0914, Test: 0.11603), there is a visible difference: the new feature shown to the Test group appears to attract more clicks.
- The AB test confirms that this difference in proportions is statistically significant: across the full dataset, the click rate rises from 9.22% in the Control group to 11.55% in the Test group.