# Libraries to manipulate the data
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random
import string
from app_pass import dbpass

# Library to deploy charts with the data
import seaborn as sns
import matplotlib.pyplot as plt

# Statmodels for predictions
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Connect to our MySQL database
import mysql.connector
from sqlalchemy import create_engine


# This is to ignore warnings.
import warnings
warnings.filterwarnings('ignore')

# Load every sheet of the HR workbook.
# The workbook is opened once and the same handle is reused for all five
# sheets, instead of re-opening and re-parsing the file for each read.
HR_WORKBOOK_PATH = '../hotel_hranalytics/hotel_hranalytics.xlsx'

with pd.ExcelFile(HR_WORKBOOK_PATH) as hr_workbook:
    # emp_id is forced to text for this sheet only; the other sheets keep
    # whatever dtypes pandas infers.
    df_rawemp = pd.read_excel(hr_workbook, sheet_name='Employees', converters={'emp_id': str})
    df_rawht = pd.read_excel(hr_workbook, sheet_name='Hotels')
    df_rawhtcomp = pd.read_excel(hr_workbook, sheet_name='Hotel_Composition')
    df_rawempwages = pd.read_excel(hr_workbook, sheet_name='Employees_wages')
    df_rawworkforce = pd.read_excel(hr_workbook, sheet_name='Workforce_Composition')

# Quick preview of each table (only rendered in an interactive session).
df_rawemp.head()

df_rawht.head()

df_rawhtcomp.head()

df_rawempwages.head()

df_rawworkforce.head()

# Stack all five raw tables row-wise into one frame; it is only used below
# to list every column together with its dtype.
raw_tables = [df_rawemp, df_rawht, df_rawhtcomp, df_rawworkforce, df_rawempwages]
df_combined = pd.concat(raw_tables)

df_combined.dtypes

emp_id                       object
Name                         object
Surname                      object
Birthday             datetime64[ns]
Age                         float64
Gender                       object
on_license                  float64
hotel_id                     object
Location                     object
Opening                      object
Stars                       float64
Budget                      float64
hc_id                        object
Department                   object
Active_employees            float64
Emp_with_license            float64
Total_employees             float64
wkc_id                       object
Position                     object
years_at_position           float64
Entry_date           datetime64[ns]
years_working               float64
Staff                       float64
emp_wag_id                   object
Price_$_Hour                float64
Hours_worked                float64
Work_overtime               float64
Ovh$_75%                    float64
Gross_pay                   float64
Deductions_3%               float64
Total_Payment               float64
Payment_date         datetime64[ns]
dtype: object

def _coerce_columns(frame, numeric=(), text=()):
    """Convert the named columns of *frame* in place.

    Columns in *numeric* go through ``pd.to_numeric``; columns in *text* are
    cast with ``astype('str')``. The converted column is assigned back to the
    frame, because neither call modifies its input.
    """
    for column in numeric:
        frame[column] = pd.to_numeric(frame[column])
    for column in text:
        # NOTE(review): astype('str') turns missing values into the text
        # 'nan' - confirm the id columns contain no nulls.
        frame[column] = frame[column].astype('str')


# Employees table
_coerce_columns(df_rawemp, numeric=['Age', 'on_license'], text=['hotel_id'])

# Hotels table ('Stars' is converted before it is renamed)
_coerce_columns(df_rawht, numeric=['Stars'], text=['hotel_id'])
df_rawht.rename(columns={'Stars': 'Stars_type'}, inplace=True)

# Hotel Composition table
_coerce_columns(
    df_rawhtcomp,
    numeric=['Active_employees', 'Emp_with_license', 'Total_employees'],
    text=['hc_id', 'hotel_id'],
)

# Workforce Composition table
_coerce_columns(
    df_rawworkforce,
    numeric=['years_at_position', 'years_working', 'Staff'],
    text=['wkc_id', 'emp_id', 'hotel_id', 'hc_id'],
)

# Employees Wages table; the '%' is dropped from two column names
_coerce_columns(
    df_rawempwages,
    numeric=['Price_$_Hour'],
    text=['emp_wag_id', 'emp_id', 'hotel_id', 'hc_id'],
)
df_rawempwages.rename(columns={'Ovh$_75%': 'Ovh$_75', 'Deductions_3%': 'Deductions_3'}, inplace=True)

# Inspect the Employees column dtypes (only rendered in an interactive session).
df_rawemp.dtypes

emp_id                object
Name                  object
Surname               object
Birthday      datetime64[ns]
Age                    int64
Gender                object
on_license             int64
hotel_id              object
dtype: object

# Inspect the Hotel Composition column dtypes (only rendered in an interactive session).
df_rawhtcomp.dtypes

hc_id               object
Department          object
Active_employees     int64
Emp_with_license     int64
Total_employees      int64
hotel_id            object
dtype: object

# Rename 'Position' to 'Positions', then inspect the resulting dtypes.
df_rawworkforce.rename(columns={'Position': 'Positions'}, inplace=True)
df_rawworkforce.dtypes

wkc_id                       object
Department                   object
Positions                    object
years_at_position             int64
Entry_date           datetime64[ns]
years_working                 int64
Staff                         int64
emp_id                        int64
hotel_id                     object
hc_id                        object
dtype: object

# Inspect the Employees Wages column dtypes (only rendered in an interactive session).
df_rawempwages.dtypes

emp_wag_id               object
Price_$_Hour              int64
Hours_worked              int64
Work_overtime             int64
Ovh$_75                 float64
Gross_pay               float64
Deductions_3            float64
Total_Payment           float64
emp_id                    int64
hotel_id                 object
hc_id                    object
Payment_date     datetime64[ns]
dtype: object

# Count the null entries in each column of the Employees table.
missing_values = df_rawemp.isna().sum()
print('Los valores que faltan son: ', missing_values)

Los valores que faltan son:  emp_id        0
Name          0
Surname       0
Birthday      0
Age           0
Gender        0
on_license    0
hotel_id      0
dtype: int64

# Before working on the Gender column, store the number of rows in the
# Employees table.
emp_length = df_rawemp.shape[0]
print('El total de registros que tenemos en la tabla de Empleados es: ', emp_length)

El total de registros que tenemos en la tabla de Empleados es:  505

# Frequency of each value in the Gender column.
emp_gender = df_rawemp.loc[:, 'Gender'].value_counts()
print(emp_gender)

Gender
M    267
F    238
Name: count, dtype: int64

# Chart inputs. Labels and colours are looked up per gender code so they
# always follow the order of `emp_gender`; value_counts() sorts by frequency,
# so a hard-coded label order would silently swap if the counts changed.
gender_names = {'M': 'Hombres', 'F': 'Mujeres'}
gender_colors = {'M': '#FDE74C', 'F': '#E3655B'}
labels = [gender_names.get(code, str(code)) for code in emp_gender.index]
# Unknown codes fall back to a neutral colour instead of failing.
pie_colors = [gender_colors.get(code, '#4C5B5C') for code in emp_gender.index]

# Pie chart of the overall gender split
fig, ax = plt.subplots(figsize=(6, 8))
ax.pie(emp_gender, colors=pie_colors, autopct='%1.1f%%', center=(4, 4), wedgeprops={"linewidth": 1, "edgecolor": "white"})
ax.legend(labels, loc='lower right', title='Géneros')
ax.set_title('Distribución del Género', fontsize=16)

plt.show()

# Join employees with their workforce record. emp_id is cast to text on the
# workforce side so that both join keys share the same type.
df_rawworkforce['emp_id'] = df_rawworkforce['emp_id'].astype('str')
emp_gender_by_dep = df_rawemp.merge(df_rawworkforce, on='emp_id', how='inner')
emp_gender_by_dep.head()

# Discard the columns this analysis does not need.
unused_columns = [
    'Name', 'Surname', 'Birthday', 'wkc_id', 'years_at_position',
    'years_working', 'Entry_date', 'hotel_id_y', 'hc_id', 'Staff',
]
emp_gender_by_dep = emp_gender_by_dep.drop(columns=unused_columns)
emp_gender_by_dep.head()

# Count the null entries in each column of the merged table.
gender_by_dep_missing_values = emp_gender_by_dep.isna().sum()
print('Los valores que faltan son: ', gender_by_dep_missing_values)

Los valores que faltan son:  emp_id        0
Age           0
Gender        0
on_license    0
hotel_id_x    0
Department    0
Positions     0
dtype: int64

# Head-count per hotel, department and gender.
gender_groups = emp_gender_by_dep.groupby(['hotel_id_x', 'Department', 'Gender'])
gender_dist = gender_groups.size().reset_index(name='Count')
print(gender_dist)

# Make sure the Count column is stored as a 64-bit integer.
gender_dist['Count'] = gender_dist['Count'].astype('int64')
gender_dist['Count'].dtypes

# One horizontal bar chart per hotel, built with a seaborn FacetGrid.
# Department and gender order are pinned so that every facet lists the same
# categories in the same position and with the same colour; without this each
# facet orders them by first appearance in its own subset of rows.
department_order = sorted(gender_dist['Department'].unique())
gender_order = ['F', 'M']

grid = sns.FacetGrid(
    gender_dist,
    col='hotel_id_x',
    height=6,
    aspect=1.5,
    sharey=False
)

# Draw the bars
grid.map_dataframe(
    sns.barplot,
    y='Department',
    x='Count',
    hue='Gender',
    order=department_order,
    hue_order=gender_order,
    palette=['#E3655B','#FDE74C']
)

# Write the count next to each bar. Combinations with no employees have no
# positive width and are skipped.
for ax in grid.axes.flat:
    for container in ax.containers:
        for bar in container:
            bar_value = bar.get_width()
            if bar_value > 0:
                ax.text(
                    bar_value + 0.5,
                    bar.get_y() + bar.get_height() / 2,
                    f"{int(bar_value)}",
                    ha='left',
                    va='center',
                    fontsize=9,
                    color='black'
                )

# Titles and legend
grid.add_legend(title='Género')
grid.legend.set_loc('upper right')
grid.set_titles('Hotel {col_name}')
grid.set_axis_labels('Nro de Empleados', 'Departamentos')
plt.tight_layout()
# The overall title goes on the figure. plt.title() would target only the
# last facet and overwrite its 'Hotel ...' title.
grid.figure.subplots_adjust(top=0.88)
grid.figure.suptitle('Distribución de género por departamento y hotel')

plt.show()

   hotel_id_x              Department Gender  Count
0      ACECWR                      3R      F      3
1      ACECWR               Animation      F     12
2      ACECWR          Bar_Restaurant      F     38
3      ACECWR          Floors_Laundry      M     38
4      ACECWR                 Kitchen      F      8
5      ACECWR                 Kitchen      M     30
6      ACECWR                   Other      F      6
7      ACECWR  Reception_Reservations      M     12
8      ACECWR                     SPA      F      6
9      ACECWR      Technical_Services      F     12
10     FUESSP                      3R      F      4
11     FUESSP               Animation      F     12
12     FUESSP          Bar_Restaurant      F     31
13     FUESSP          Bar_Restaurant      M      7
14     FUESSP          Floors_Laundry      M     38
15     FUESSP                 Kitchen      M     38
16     FUESSP                   Other      F      6
17     FUESSP  Reception_Reservations      M     12
18     FUESSP                     SPA      F      6
19     FUESSP      Technical_Services      F     12
20     TFNOBH                      3R      F      4
21     TFNOBH               Animation      F     12
22     TFNOBH          Bar_Restaurant      F     41
23     TFNOBH          Floors_Laundry      M     39
24     TFNOBH                 Kitchen      M     40
25     TFNOBH                   Other      F      6
26     TFNOBH  Reception_Reservations      M     13
27     TFNOBH                     SPA      F      6
28     TFNOBH      Technical_Services      F     13

# Head-count per department and gender across all hotels.
department_groups = emp_gender_by_dep.groupby(['Department', 'Gender'])
gender_by_dep = department_groups.size().reset_index(name='Count')
print(gender_by_dep)

# Horizontal grouped bar chart
plt.figure(figsize=(12, 8))
ax = sns.barplot(
    data=gender_by_dep,
    y='Department',
    x='Count',
    hue='Gender',
    palette=['#E3655B','#FDE74C'],
    ci=None
)

# Label every bar with its count
for bar_group in ax.containers:
    ax.bar_label(bar_group, fmt='%d', label_type='edge', fontsize=10, color='black')

# Titles, axis labels and legend
ax.set_title('Distribución del Género por Departamentos', fontsize=16)
ax.set_xlabel('Número de Empleados', fontsize=12)
ax.set_ylabel('Departamento', fontsize=12)
ax.legend(title='Género')
plt.tight_layout()


plt.show()

                Department Gender  Count
0                       3R      F     11
1                Animation      F     36
2           Bar_Restaurant      F    110
3           Bar_Restaurant      M      7
4           Floors_Laundry      M    115
5                  Kitchen      F      8
6                  Kitchen      M    108
7                    Other      F     18
8   Reception_Reservations      M     37
9                      SPA      F     18
10      Technical_Services      F     37

# Classify an age into a labelled range
def age_range(age):
    """Return the age-range label for *age*.

    The buckets are contiguous, so a non-integer age such as 27.5 falls in
    the bucket of its whole-year part rather than dropping through to the
    last label. Ages below 18 get their own 'under 18' label instead of being
    counted as 'more than 58'.

    Note that the 'more than 58' label covers age 58 itself.
    """
    if age < 18:
        return 'under 18'
    if age < 28:
        return '18 to 27'
    if age < 38:
        return '28 to 37'
    if age < 48:
        return '38 to 47'
    if age < 58:
        return '48 to 57'
    # NaN fails every comparison above and therefore also lands here.
    return 'more than 58'

# Classify every employee's age. The function returns a plain label, so it
# is applied directly; wrapping each result in a pd.Series would build a
# one-column DataFrame and is far slower.
emp_age_range = df_rawemp['Age'].apply(age_range)

# Store the labels as a new column
df_rawemp['Age_range'] = emp_age_range

# Employees per age range, ordered by label, and their share of the total
age_range_count = df_rawemp['Age_range'].value_counts().sort_index()
total_agerange_count = age_range_count.sum()
percentage = (age_range_count / total_agerange_count) * 100

# Pie chart of the age ranges
fig, ax = plt.subplots(figsize=(12, 10))
colors = ['#E3655B', '#DB5461', '#FDE74C', '#4C5B5C', '#3891A6']
# Legend entries are built from the same Series that is plotted, so label
# and wedge order cannot drift apart.
labels = [f'{age} ({count})' for age, count in age_range_count.items()]

ax.pie(age_range_count, autopct='%1.1f%%', center=(4, 4), wedgeprops={"linewidth": 1, "edgecolor": "white"}, startangle=90, colors=colors)

ax.legend(labels, loc='upper right', title='Rangos de Edad')

ax.set_title('Distribución de Edad por rangos', fontsize=16)

plt.show()

Text(0.5, 1.0, 'Distribución de Edad por rangos')

# Age distribution per hotel. fill_value=0 keeps a hotel that has nobody in
# some range from producing NaN, which a pie chart cannot draw.
agerange_by_hotel = df_rawemp.groupby(['hotel_id', 'Age_range']).size().unstack(fill_value=0)
print(agerange_by_hotel)

# One pie per hotel. The number of subplots follows the data, and
# squeeze=False keeps `axs` two-dimensional even for a single hotel.
fig, axs = plt.subplots(1, len(agerange_by_hotel.index), figsize=(18, 6), squeeze=False)

colors = ['#E3655B', '#DB5461', '#FDE74C', '#4C5B5C', '#3891A6']

for ax, (hotel, hotel_counts) in zip(axs.flat, agerange_by_hotel.iterrows()):
    # Labels come from the row being plotted, so each legend entry is
    # guaranteed to describe its own wedge.
    labelsbyhotel = [f'{age} ({count})' for age, count in hotel_counts.items()]
    ax.pie(hotel_counts, autopct='%1.1f%%', center=(4, 4), wedgeprops={"linewidth": 1, "edgecolor": "white"}, startangle=90, colors=colors)
    ax.set_title(f'Distribución de Edad en el {hotel}', fontsize=14)
    ax.legend(labelsbyhotel, loc='upper right', title='Rango de Edad')

plt.tight_layout()
plt.show()

Age_range  18 to 27  28 to 37  38 to 47  48 to 57  more than 58
hotel_id                                                       
ACECWR           17        31        43        38            36
FUESSP           24        35        29        32            46
TFNOBH           26        40        32        35            41

# Summary statistics of the Age column, rounded to whole numbers
# (only rendered in an interactive session).
df_rawemp['Age'].describe().round()

count    505.0
mean      45.0
std       14.0
min       21.0
25%       33.0
50%       45.0
75%       57.0
max       69.0
Name: Age, dtype: float64

# Lookup from hotel id to hotel name
hotel_names = df_rawht.set_index('hotel_id')['Name']

# Mean employee age and mean years worked, per hotel, rounded
avg_age_byhotel = df_rawemp.groupby('hotel_id')['Age'].mean().round()
avg_working_years = df_rawworkforce.groupby('hotel_id')['years_working'].mean().round()

# Relabel both results as "<id> (<name>)" and leave the index unnamed so no
# index header is printed.
for hotel_average in (avg_age_byhotel, avg_working_years):
    hotel_average.index = [
        f"{hotel_id} ({hotel_names[hotel_id]})" for hotel_id in hotel_average.index
    ]
    hotel_average.index.name = None

print('La edad promedio de los empleados es: ','\n', avg_age_byhotel, '\n\n',
    'El promedio de años de trabajo de los empleados es: ','\n', avg_working_years, '\n\n',
)

La edad promedio de los empleados es:  
 ACECWR (Coral Wave Resort)     45.0
FUESSP (Sandy Shores Park)     45.0
TFNOBH (Ocean Breeze Haven)    44.0
Name: Age, dtype: float64 

 El promedio de años de trabajo de los empleados es:  
 ACECWR (Coral Wave Resort)     12.0
FUESSP (Sandy Shores Park)     12.0
TFNOBH (Ocean Breeze Haven)    14.0
Name: years_working, dtype: float64

# Employees currently on licence: overall, per hotel, and as a percentage
licensed_rows = df_rawemp[df_rawemp['on_license'] == True]
emp_on_license = licensed_rows.count()
emp_on_license_byhotel = df_rawemp.groupby('hotel_id')['on_license'].sum()
per_emp_on_license = (emp_on_license['on_license'] / emp_length) * 100

# Relabel the per-hotel result as "<id> (<name>)" with an unnamed index
emp_on_license_byhotel.index = [
    f"{hotel_id} ({hotel_names[hotel_id]})" for hotel_id in emp_on_license_byhotel.index
]
emp_on_license_byhotel.index.name = None

print('El número de empleados con licencia es: ','\n', emp_on_license_byhotel, '\n\n',
    'El total de empleados con licencia es: ', emp_on_license['on_license'], '\n\n',
    'El porcentaje de empleados con licencia es: ', per_emp_on_license.round(2), '%'
)

El número de empleados con licencia es:  
 ACECWR (Coral Wave Resort)     28
FUESSP (Sandy Shores Park)     18
TFNOBH (Ocean Breeze Haven)    27
Name: on_license, dtype: int64 

 El total de empleados con licencia es:  73 

 El porcentaje de empleados con licencia es:  14.46 %

# Number of employees with and without licence
on_license_count = df_rawemp['on_license'].value_counts()
print(on_license_count)

# value_counts() orders by frequency, so the bars are sorted by flag and the
# label and colour of each bar are derived from its own flag. Hard-coded
# ('No', 'Yes') labels would be wrong whenever "on licence" is the majority.
plot_counts = on_license_count.sort_index()
labels = ['Yes' if flag else 'No' for flag in plot_counts.index]
colors = ['#E3655B' if flag else '#FDE74C' for flag in plot_counts.index]

# Bar chart
fig, ax = plt.subplots(figsize=(6, 8))

ax.bar(labels, plot_counts, color=colors)
ax.bar_label(ax.containers[0], fontsize=10)

plt.title('Empleados con Licencia', fontsize=16)
plt.ylabel('Número de Empleados')
plt.xlabel('Licencia')
plt.show()

on_license
0    432
1     73
Name: count, dtype: int64

# Licence status per hotel. fill_value=0 prevents NaN when a hotel has no
# employee in one of the two states.
onlicense_by_hotel = df_rawemp.groupby(['hotel_id', 'on_license']).size().unstack(fill_value=0)
print(onlicense_by_hotel)

# One pie per hotel; the subplot count follows the data and squeeze=False
# keeps `axs` two-dimensional even for a single hotel.
fig, axs = plt.subplots(1, len(onlicense_by_hotel.index), figsize=(18, 6), squeeze=False)
# Label and colour are derived from each column's flag instead of assuming
# the column order.
labels_onlicense = ['Yes' if flag else 'No' for flag in onlicense_by_hotel.columns]
license_colors = ['#E3655B' if flag else '#FDE74C' for flag in onlicense_by_hotel.columns]

for ax, (hotel, hotel_counts) in zip(axs.flat, onlicense_by_hotel.iterrows()):
    labels2 = [f'{status} ({count})' for status, count in zip(labels_onlicense, hotel_counts)]
    ax.pie(hotel_counts, autopct='%1.1f%%', center=(4, 4), wedgeprops={"linewidth": 1, "edgecolor": "white"}, startangle=90, colors=license_colors)
    ax.set_title(f'Empleados con licencia por hotel {hotel}', fontsize=14)
    ax.legend(labels2, loc='upper right', title='Licencia')

plt.tight_layout()
plt.show()

on_license    0   1
hotel_id           
ACECWR      137  28
FUESSP      148  18
TFNOBH      147  27

# Employees Wages analysis.
# Wages are first joined with Workforce Composition; emp_id is cast to text
# on the wages side before joining on it.
df_rawempwages['emp_id'] = df_rawempwages['emp_id'].astype('str')
emp_wages_wfc = df_rawempwages.merge(df_rawworkforce, on='emp_id', how='inner')
emp_wages_wfc.head()

# Discard the columns this analysis does not need.
unused_wage_columns = [
    'hotel_id_y', 'hc_id_y', 'wkc_id', 'years_at_position',
    'Entry_date', 'years_working', 'Staff',
]
emp_wages_wfc = emp_wages_wfc.drop(columns=unused_wage_columns)
emp_wages_wfc.head()

# Count the null entries in each column of the merged wages table.
new_missing_values = emp_wages_wfc.isna().sum()
print('Los valores que faltan son: ', new_missing_values)

Los valores que faltan son:  emp_wag_id       0
Price_$_Hour     0
Hours_worked     0
Work_overtime    0
Ovh$_75          0
Gross_pay        0
Deductions_3     0
Total_Payment    0
emp_id           0
hotel_id_x       0
hc_id_x          0
Payment_date     0
Department       0
Positions        0
dtype: int64

# Headline wage figures: average rate and hours, plus the totals paid for
# normal hours and for overtime.
avg_hour_price = emp_wages_wfc['Price_$_Hour'].mean().round()
avg_hours_worked = emp_wages_wfc['Hours_worked'].mean().round()
# Amount paid for normal hours, per row: hours * hourly price
emp_wages_wfc['total_paid_NH'] = emp_wages_wfc['Hours_worked'] * emp_wages_wfc['Price_$_Hour']
total_paid_NH = emp_wages_wfc['total_paid_NH'].sum()
avg_OT_hours_worked = emp_wages_wfc['Work_overtime'].mean().round()
total_OT_hours = emp_wages_wfc['Work_overtime'].sum()
# Amount paid for overtime, per row: overtime hours * the 'Ovh$_75' rate
emp_wages_wfc['total_paid_OT'] = emp_wages_wfc['Work_overtime'] * emp_wages_wfc['Ovh$_75']
total_paid_OT = emp_wages_wfc['total_paid_OT'].sum()

# Every argument is separated by a comma. A missing one after the '\n\n'
# before the last label used to glue two literals together, so the final
# line was formatted differently from the rest.
print('El precio promedio por hora: ','€', avg_hour_price, '\n\n',
    'El promedio de horas trabajadas por nuestros empleados es: ', avg_hours_worked, '\n\n',
    'La SUMA total que pagamos por horas normales es: ', '€', total_paid_NH, '\n\n',
    'El promedio de horas extras trabajadas por nuestros empleados es: ', avg_OT_hours_worked, '\n\n',
    'El total de horas extra trabajadas es: ', total_OT_hours, '\n\n',
    'La SUMA total pagada por horas extras: ', '€', total_paid_OT
)

El precio promedio por hora:  € 15.0 

 El promedio de horas trabajadas por nuestros empleados es:  140.0 

 La SUMA total que pagamos por horas normales es:  € 12950211 

 El promedio de horas extras trabajadas por nuestros empleados es:  6.0 

 El total de horas extra trabajadas es:  33501 

La SUMA total pagada por horas extras:  € 382709.25

# Share of normal versus overtime hours.
# Total hours worked = normal hours + overtime hours.
normal_hours_sum = emp_wages_wfc['Hours_worked'].sum()
overtime_hours_sum = emp_wages_wfc['Work_overtime'].sum()
total_hours_worked = normal_hours_sum + overtime_hours_sum
per_NH = (normal_hours_sum / total_hours_worked) * 100
per_OTh = (overtime_hours_sum / total_hours_worked) * 100
print('The total hours worked by our employees is: ', total_hours_worked, '\n\n',
      'Percentage Normal Hours: ', per_NH.round(2), '\n\n',
      'Percentage Over Time Hours: ', per_OTh.round(2)
      )

# Pie chart of the two percentages
hours = [per_NH, per_OTh]
colors = ['#3891A6', '#4C5B5C']
labels = ['Horas Normales', 'Horas Extras']
fig, ax = plt.subplots(figsize=(6, 8))
ax.pie(
    hours,
    colors=colors,
    autopct='%1.1f%%',
    center=(4, 4),
    wedgeprops={"linewidth": 1, "edgecolor": "white"},
)
ax.legend(labels, loc='upper left', title='Hours Worked')
ax.set_title('Distribución de las horas', fontsize=16)

plt.show()

The total hours worked by our employees is:  882410 

 Percentage Normal Hours:  96.2 

 Percentage Over Time Hours:  3.8

# Normal-hours pay summed per calendar month and hotel
monthly_hotel_groups = emp_wages_wfc.groupby([pd.Grouper(key='Payment_date', freq='M'), 'hotel_id_x'])
monthly_payment_by_hotel = monthly_hotel_groups['total_paid_NH'].sum().reset_index()

# Line chart, one line per hotel
plt.figure(figsize=(12, 8))
sns.lineplot(
    data=monthly_payment_by_hotel,
    x='Payment_date',
    y='total_paid_NH',
    hue='hotel_id_x',
    palette=['#DB5461', '#FDE74C', '#3891A6'],
    marker='o',
    )

# Write the amount above every point
for point in monthly_payment_by_hotel.itertuples(index=False):
    plt.text(point.Payment_date, point.total_paid_NH, f'{point.total_paid_NH:.0f}', fontsize=9, ha='center', va='bottom')

plt.title('Distribución de Pagos Mensuales por Hotel', fontsize=16)
plt.xlabel('Meses')
plt.ylabel('Total Pagado (€)')
plt.xticks(rotation=45)
plt.legend(title='Hotels ID')
plt.tight_layout()

plt.show()

# Normal-hours pay summed per calendar month, all hotels together
monthly_payment = emp_wages_wfc.groupby(pd.Grouper(key='Payment_date', freq='M')).agg({'total_paid_NH': 'sum'}).reset_index()

# Single-line chart. `color` is used because `palette` only takes effect
# together with `hue`, so the intended colour was never applied.
plt.figure(figsize=(12, 8))
sns.lineplot(
    data=monthly_payment,
    x='Payment_date',
    y='total_paid_NH',
    color='#3891A6',
    marker='o',
    )

# Write the amount above every point, once. Iterating the DataFrame itself
# walks over its column names, which drew every label once per column.
for x, y in zip(monthly_payment['Payment_date'], monthly_payment['total_paid_NH']):
    plt.text(x, y, f'{y:.0f}', fontsize=9, ha='center', va='bottom')

plt.title('Distribución Total del Pago Mensual', fontsize=16)
plt.xlabel('Meses')
plt.ylabel('Total Pagado (€)')
plt.xticks(rotation=45)
plt.tight_layout()

plt.show()

# Overtime pay summed per calendar month and hotel
monthly_hotel_ot_groups = emp_wages_wfc.groupby([pd.Grouper(key='Payment_date', freq='M'), 'hotel_id_x'])
monthly_payment_by_hotel_ot = monthly_hotel_ot_groups['total_paid_OT'].sum().reset_index()

# Line chart, one line per hotel
plt.figure(figsize=(12, 8))
sns.lineplot(
    data=monthly_payment_by_hotel_ot,
    x='Payment_date',
    y='total_paid_OT',
    hue='hotel_id_x',
    palette=['#DB5461', '#FDE74C', '#3891A6'],
    marker='o',
    )

# Write the amount above every point
for point in monthly_payment_by_hotel_ot.itertuples(index=False):
    plt.text(point.Payment_date, point.total_paid_OT, f'{point.total_paid_OT:.0f}', fontsize=9, ha='center', va='bottom')

plt.title('Distribución de Pago Mensual por Horas Extras por Hotel', fontsize=16)
plt.xlabel('Meses')
plt.ylabel('Total Pagado (€)')
plt.xticks(rotation=45)
plt.legend(title='Hotels ID')
plt.tight_layout()

plt.show()

# Overtime pay summed per calendar month, all hotels together
monthly_payment_ot = emp_wages_wfc.groupby(pd.Grouper(key='Payment_date', freq='M')).agg({'total_paid_OT': 'sum'}).reset_index()

# Single-line chart. `color` is used because `palette` only takes effect
# together with `hue`, so the intended colour was never applied.
plt.figure(figsize=(12, 8))
sns.lineplot(
    data=monthly_payment_ot,
    x='Payment_date',
    y='total_paid_OT',
    color='#3891A6',
    marker='o',
    )

# Write the amount above every point, once. Iterating the DataFrame itself
# walks over its column names, which drew every label once per column.
for x, y in zip(monthly_payment_ot['Payment_date'], monthly_payment_ot['total_paid_OT']):
    plt.text(x, y, f'{y:.0f}', fontsize=9, ha='center', va='bottom')

plt.title('Distribución Mensual del Pago Total de Horas Extras', fontsize=16)
plt.xlabel('Meses')
plt.ylabel('Total Pagado (€)')
plt.xticks(rotation=45)
plt.tight_layout()

plt.show()

# Total payment per department and month.
# month_name() returns English names by default, independent of the system
# locale. strftime('%B') follows the locale, so on a non-English machine its
# output would not match `month_order` and the reindex below would leave
# every column empty.
emp_wages_wfc['Payment_month'] = emp_wages_wfc['Payment_date'].dt.month_name()

month_order = ['January', 'February', 'March', 'April', 'May', 'June',
               'July', 'August', 'September', 'October', 'November', 'December']


heatmap_pivot = emp_wages_wfc.pivot_table(index='Department', columns='Payment_month', values='Total_Payment', aggfunc='sum')
# Put the month columns in calendar order instead of alphabetical order.
heatmap_pivot = heatmap_pivot.reindex(columns=month_order)
print(heatmap_pivot)

# Heatmap. The "mako" colormap is passed to the plot explicitly; building it
# without handing it to heatmap() has no effect on the figure.
plt.figure(figsize=(12, 8))
mako_cmap = sns.color_palette("mako", as_cmap=True)
sns.heatmap(heatmap_pivot, annot=True, fmt=".0f", cmap=mako_cmap, cbar_kws={'label': 'Total Abonado (€)'}, linewidths=.5)

plt.title('Pago Total Mensual por Departamento', fontsize=16)
plt.ylabel('Departamento')
plt.xlabel('Meses')
plt.xticks(rotation=45)
plt.show()

Payment_month               January     February        March        April  \
Department                                                                   
3R                       23788.0375   25218.5450   24951.7950   24210.7150   
Animation                75262.0575   76100.8650   76694.5050   76639.7000   
Bar_Restaurant          243748.3900  242044.1000  242306.9700  240518.0475   
Floors_Laundry          234890.1075  237591.0725  235397.6600  238962.1675   
Kitchen                 239475.5400  241929.8825  235661.5000  238551.6150   
Other                    56590.5275   56225.5650   52591.7025   57435.8825   
Reception_Reservations   82443.4525   81381.3025   82415.5650   85662.1550   
SPA                      39957.4525   39382.4850   39308.5225   40429.6000   
Technical_Services       80320.3650   77896.0925   76148.3950   78421.3475   

Payment_month                   May         June         July       August  \
Department                                                                   
3R                       24819.8750   24660.5525   24729.6650   23719.4100   
Animation                78262.2675   76263.5825   76848.9775   77691.9075   
Bar_Restaurant          244199.9250  239861.8425  240631.7800  239864.0250   
Floors_Laundry          238797.0250  237845.6975  240466.8800  239030.5525   
Kitchen                 240551.9975  241796.5075  240427.8375  241041.1200   
Other                    56404.0450   53803.7175   57132.7575   55454.1725   
Reception_Reservations   82340.1475   83753.6800   82323.1725   82068.7900   
SPA                      41678.9600   40275.8550   38823.7650   40522.9625   
Technical_Services       78530.9575   77630.7975   77634.9200   80792.2700   

Payment_month             September      October     November     December  
Department                                                                  
3R                       25311.1800   23944.4500   25051.4625   24647.7000  
Animation                76688.2000   76275.2225   75959.2450   77702.8200  
Bar_Restaurant          242982.3325  241844.7650  242111.2725  241070.7050  
Floors_Laundry          236468.5400  237566.8225  234701.4425  235476.7150  
Kitchen                 243247.1425  244014.4125  240852.4550  237833.5725  
Other                    55472.8450   56411.8050   52620.3175   56609.6850  
Reception_Reservations   82405.6225   82912.2050   82731.7850   82755.0650  
SPA                      40678.4050   39915.7425   40930.6050   40284.5850  
Technical_Services       78041.1075   79675.5575   79038.0250   77462.7450

# Mean Total_Payment overall, per hotel and per department (2 decimals)
total_payment = emp_wages_wfc['Total_Payment']
avg_salary = total_payment.mean().round(2)
avg_salary_by_hotel = emp_wages_wfc.groupby('hotel_id_x')['Total_Payment'].mean().round(2)
avg_salary_by_department = emp_wages_wfc.groupby('Department')['Total_Payment'].mean().round(2)


# Show hotels as "<id> (<name>)" and leave both indexes unnamed
avg_salary_by_hotel.index = [
    f"{hotel_id} ({hotel_names[hotel_id]})" for hotel_id in avg_salary_by_hotel.index
]
avg_salary_by_hotel.index.name = None
avg_salary_by_department.index.name = None

print('El Salario Promedio Total es: €', avg_salary, '\n\n',
      'El Salario Promedio por Hotel es: ', '\n',avg_salary_by_hotel, '\n\n',
      'El Salario Promedio por Departamento es: ', '\n', avg_salary_by_department)

El Salario Promedio Total es: € 2134.15 

 El Salario Promedio por Hotel es:  
 ACECWR (Coral Wave Resort)     2108.51
FUESSP (Sandy Shores Park)     2162.40
TFNOBH (Ocean Breeze Haven)    2131.51
Name: Total_Payment, dtype: float64 

 El Salario Promedio por Departamento es:  
 3R                        2235.25
Animation                 2130.53
Bar_Restaurant            2066.37
Floors_Laundry            2063.18
Kitchen                   2072.83
Other                     3086.82
Reception_Reservations    2236.92
SPA                       2232.36
Technical_Services        2120.70
Name: Total_Payment, dtype: float64

# Open a connection to the MySQL server
try:
    db = mysql.connector.connect(
        host = "localhost",
        user = "root",
        password = dbpass
    )
except mysql.connector.Error as err:
    print("An error occurred: ", err)
    # Re-raise: later code closes `cursor` and `db` unconditionally, so
    # carrying on here would only fail further down with an unrelated
    # "name is not defined" error that hides the real cause.
    raise
else:
    print("Connection established")
    # Cursor used to run queries on this connection
    cursor = db.cursor()

Connection established

# Build the SQLAlchemy engine for the target database
from urllib.parse import quote_plus

hostname = "localhost"
database = "hrhotelpa"
username = "root"
password = dbpass

# User name and password are URL-escaped: characters such as '@', ':' or '/'
# in a password would otherwise be parsed as part of the URL structure.
# NOTE(review): this URL selects the pymysql driver, while the file imports
# mysql.connector - confirm pymysql is installed.
engine = create_engine(
    "mysql+pymysql://{user}:{pw}@{host}/{db}".format(
        host=hostname,
        db=database,
        user=quote_plus(username),
        pw=quote_plus(password),
    )
)

# Append the data to the tables (load step currently disabled)
# df_rawht.to_sql('Hotels', engine, if_exists='append', index=False)
# df_rawemp.to_sql('Employees', engine, if_exists='append', index=False)
# df_rawhtcomp.to_sql('Hotel_Composition', engine, if_exists='append', index=False)
# df_rawworkforce.to_sql('Workforce_Composition', engine, if_exists='append', index=False)
# df_rawempwages.to_sql('Employees_Wages', engine, if_exists='append', index=False)

cursor.close()
db.close()

	emp_wag_id	Price_$_Hour	Hours_worked	Work_overtime	Ovh$_75%	Gross_pay	Deductions_3%	Total_Payment	emp_id	hotel_id	hc_id	Payment_date
0	3272REFUESSP	14	129	4	10.50	1848.00	55.4400	1792.5600	3272	FUESSP	REFUESSP	2024-01-29
1	3074REFUESSP	14	143	3	10.50	2033.50	61.0050	1972.4950	3074	FUESSP	REFUESSP	2024-01-29
2	6627REFUESSP	18	135	4	13.50	2484.00	74.5200	2409.4800	6627	FUESSP	REFUESSP	2024-01-29
3	420REFUESSP	19	121	11	14.25	2455.75	73.6725	2382.0775	420	FUESSP	REFUESSP	2024-01-29
4	4856REFUESSP	14	132	7	10.50	1921.50	57.6450	1863.8550	4856	FUESSP	REFUESSP	2024-01-29

	emp_wag_id	Price_$_Hour	Hours_worked	Work_overtime	Ovh$_75	Gross_pay	Deductions_3	Total_Payment	emp_id	hotel_id_x	...	Payment_date	wkc_id	Department	Positions	years_at_position	Entry_date	years_working	Staff	hotel_id_y	hc_id_y
0	3272REFUESSP	14	129	4	10.50	1848.00	55.4400	1792.5600	3272	FUESSP	...	2024-01-29	3272FUESSP	Reception_Reservations	Staff	1	2023-09-26	1	0	FUESSP	REFUESSP
1	3074REFUESSP	14	143	3	10.50	2033.50	61.0050	1972.4950	3074	FUESSP	...	2024-01-29	3074FUESSP	Reception_Reservations	Staff	1	2023-04-29	1	0	FUESSP	REFUESSP
2	6627REFUESSP	18	135	4	13.50	2484.00	74.5200	2409.4800	6627	FUESSP	...	2024-01-29	6627FUESSP	Reception_Reservations	3rd_Command	4	2014-01-17	10	7	FUESSP	REFUESSP
3	420REFUESSP	19	121	11	14.25	2455.75	73.6725	2382.0775	420	FUESSP	...	2024-01-29	420FUESSP	Reception_Reservations	3rd_Command	3	2012-10-25	12	7	FUESSP	REFUESSP
4	4856REFUESSP	14	132	7	10.50	1921.50	57.6450	1863.8550	4856	FUESSP	...	2024-01-29	4856FUESSP	Reception_Reservations	Staff	1	2023-06-18	1	0	FUESSP	REFUESSP

	emp_wag_id	Price_$_Hour	Hours_worked	Work_overtime	Ovh$_75	Gross_pay	Deductions_3	Total_Payment	emp_id	hotel_id_x	hc_id_x	Payment_date	Department	Positions
0	3272REFUESSP	14	129	4	10.50	1848.00	55.4400	1792.5600	3272	FUESSP	REFUESSP	2024-01-29	Reception_Reservations	Staff
1	3074REFUESSP	14	143	3	10.50	2033.50	61.0050	1972.4950	3074	FUESSP	REFUESSP	2024-01-29	Reception_Reservations	Staff
2	6627REFUESSP	18	135	4	13.50	2484.00	74.5200	2409.4800	6627	FUESSP	REFUESSP	2024-01-29	Reception_Reservations	3rd_Command
3	420REFUESSP	19	121	11	14.25	2455.75	73.6725	2382.0775	420	FUESSP	REFUESSP	2024-01-29	Reception_Reservations	3rd_Command
4	4856REFUESSP	14	132	7	10.50	1921.50	57.6450	1863.8550	4856	FUESSP	REFUESSP	2024-01-29	Reception_Reservations	Staff

HOTEL HR PEOPLE ANALYTICS

Analizamos tres hoteles desde una perspectiva de People Analytics.

1. Importando las librerías

2. Trabajando con nuestras tablas

2.1 Tabla 'Employees'

2.2 Tabla 'Hotels'

2.3 Tabla 'Hotel_Composition'

2.4 Tabla 'Employees_Wages'

2.5 Tabla 'Workforce_Composition'

2.6 Testeando los 'dtypes'

2.7 Arreglando los 'tipos de datos' de las columnas

3.1 Dejemos que las visualizaciones nos muestren lo que los datos nos quieren contar.

3.1.a Es hora de visualizar los datos de género

3.1.b Analicemos la distribución por género por Departamentos y Hoteles

Es hora de visualizar nuestros datos de distribución de género por Hotel y Departamentos

Distribución del Género por Departamentos

3.2 Analicemos cómo se distribuyen nuestros empleados por edad

3.3 Analizamos la cantidad de empleados que están con licencia

4. Analizamos la tabla 'Employees Wages'

5. Carga de datos a MySQL

	emp_id	Name	Surname	Birthday	Age	Gender	hotel_id
0	3272	James	Smith	1957-08-09	67	M	FUESSP
1	3074	John	Johnson	1981-11-19	42	M	FUESSP
2	6627	Robert	Williams	1983-10-15	41	M	FUESSP
3	420	Michael	Brown	1976-04-05	48	M	FUESSP
4	4856	William	Jones	1968-11-20	55	M	FUESSP

	hotel_id	Name	Location	Opening	Stars	Budget
0	FUESSP	Sandy Shores Park	28 03 18.9N-14 19 21.4W	2001-03-05	4	350000000
1	TFNOBH	Ocean Breeze Haven	28 05 56.5N-16 44 54.6W	1998-10-05	5	550000000
2	ACECWR	Coral Wave Resort	28 51 25.9N-13 47 48.7 W	2000-05-05	5	480000000