import pandas as pd
import numpy as np
import random
import string
from faker import Faker
from datetime import datetime

fake = Faker()


# Let's create a list with the names and other list with their surnames
names = [
    'John', 'Emily', 'Michael', 'Sarah', 'David', 'Jessica', 'Daniel', 'Olivia', 'Christopher', 'Sophia',
    'Matthew', 'Ava', 'Andrew', 'Emma', 'James', 'Isabella', 'Joseph', 'Mia', 'Anthony', 'Abigail',
    'Robert', 'Charlotte', 'William', 'Amelia', 'David', 'Evelyn', 'Alexander', 'Harper', 'Thomas', 'Grace',
    'Ethan', 'Chloe', 'Steven', 'Ella', 'Richard', 'Victoria', 'Ryan', 'Avery', 'Charles', 'Sofia',
    'Edward', 'Scarlett', 'George', 'Madison', 'Samuel', 'Lily', 'Jackson', 'Aria', 'Henry', 'Lillian',
    'Owen', 'Addison', 'Sebastian', 'Aubrey', 'Benjamin', 'Natalie', 'Nicholas', 'Zoey', 'Daniel', 'Hannah',
    'Aiden', 'Brooklyn', 'Adam', 'Stella', 'Dylan', 'Nora', 'Wyatt', 'Leah', 'Caleb', 'Audrey',
    'Nathan', 'Claire', 'Jayden', 'Ellie', 'Zachary', 'Savannah', 'Carter', 'Skylar', 'Luke', 'Riley',
    'Isaac', 'Aaliyah', 'Christian', 'Penelope', 'Liam', 'Camila', 'Julian', 'Ariana', 'Gabriel', 'Kennedy',
    'Levi', 'Maya', 'Jaxon', 'Gabriella', 'Aaron', 'Madelyn', 'Brayden', 'Elena', 'Oliver', 'Anna'
]

surnames = [
    'Smith', 'Johnson', 'Williams', 'Jones', 'Brown', 'Davis', 'Miller', 'Wilson', 'Moore', 'Taylor',
    'Anderson', 'Thomas', 'Jackson', 'White', 'Harris', 'Martin', 'Thompson', 'Garcia', 'Martinez', 'Robinson',
    'Clark', 'Rodriguez', 'Lewis', 'Lee', 'Walker', 'Hall', 'Allen', 'Young', 'Hernandez', 'King',
    'Wright', 'Lopez', 'Hill', 'Scott', 'Green', 'Adams', 'Baker', 'Gonzalez', 'Nelson', 'Carter',
    'Mitchell', 'Perez', 'Roberts', 'Turner', 'Phillips', 'Campbell', 'Parker', 'Evans', 'Edwards', 'Collins',
    'Stewart', 'Sanchez', 'Morris', 'Rogers', 'Reed', 'Cook', 'Morgan', 'Bell', 'Murphy', 'Bailey',
    'Rivera', 'Cooper', 'Richardson', 'Cox', 'Howard', 'Ward', 'Torres', 'Peterson', 'Gray', 'Ramirez',
    'James', 'Watson', 'Brooks', 'Kelly', 'Sanders', 'Price', 'Bennett', 'Wood', 'Barnes', 'Ross',
    'Henderson', 'Coleman', 'Jenkins', 'Perry', 'Powell', 'Long', 'Patterson', 'Hughes', 'Flores', 'Washington',
    'Butler', 'Simmons', 'Foster', 'Gonzales', 'Bryant', 'Alexander', 'Russell', 'Griffin', 'Diaz', 'Hayes'
]

sex = [
    'male','female','male','female','male','female','male','female','male','female','male','female','male','female','male','female','male','female','male','female','male','female','male','female','male','female','male','female','male','female','male','female','male','female','male','female','male','female','male','female','male','female','male','female','male','female','male','female','male','female','male','female','male','female','male','female','male','female','male','female','male','female','male','female','male','female','male','female','male','female','male','female','male','female','male','female','male','female','male','female','male','female','male','female','male','female','male','female','male','female','male','female','male','female','male','female','male','female','male','female'
]


# Create our data frame
colacho_df = pd.DataFrame({
    'Name': names,
    'Surname': surnames,
    'Gender': sex
})

# Checking if everything is ok
print(colacho_df.head())

      Name   Surname  Gender
0     John     Smith    male
1    Emily   Johnson  female
2  Michael  Williams    male
3    Sarah     Jones  female
4    David     Brown    male


# Random values to replace later
work_department = np.random.randint(1, 6, size=100)
colacho_df['WorkDepartment'] = work_department

# Its time to replace the values with the areas where people are from
department_map = {
    1: 'HHRR',
    2: 'Sales',
    3: 'Account',
    4: 'Marketing',
    5: 'Operations'
}

colacho_df['WorkDepartment'] = colacho_df['WorkDepartment'].map(department_map)

# Let's see if everthing is OK
print(colacho_df.head(10))

          Name   Surname  Gender WorkDepartment
0         John     Smith    male        Account
1        Emily   Johnson  female        Account
2      Michael  Williams    male          Sales
3        Sarah     Jones  female           HHRR
4        David     Brown    male     Operations
5      Jessica     Davis  female           HHRR
6       Daniel    Miller    male          Sales
7       Olivia    Wilson  female          Sales
8  Christopher     Moore    male      Marketing
9       Sophia    Taylor  female           HHRR


# Let's create to function to get the Id
def generate_id():
    letters = string.ascii_letters.upper()
    numbers = string.digits
    random_letters = ''.join(random.choice(letters) for i in range(2))
    random_numbers = ''.join(random.choice(numbers) for i in range(4))
    return random_letters + random_numbers

# Our ID column with their random values
colacho_df['ID'] = colacho_df.apply(lambda row: generate_id(), axis=1)

# Checking if works fine
# print(colacho_df.head(10))
colacho_df


# Generate the fake address for the employees
Faker.seed(0)
address = [fake.street_address() for i in range(100)]
print(address)

colacho_df['Address'] = address
colacho_df['Address']

['0487 Hull Village Suite 759', '242 Christine Glen', '1157 Michael Island', '778 Brown Plaza', '60975 Jessica Squares', '93328 Davis Island', '48418 Olsen Plains Apt. 989', '1965 Kelly Field Apt. 094', '12201 Massey Pine Suite 833', '477 Miller Ridge Apt. 795', '04135 Marvin Via', '23098 Anthony Roads', '916 Mitchell Crescent', '032 Timothy Stream', '086 Mary Cliff', '45620 Williams Courts', '45792 Tammy Centers Apt. 258', '97207 Mccullough Well Suite 564', '071 Stevenson Plains', '37594 Garza Roads Apt. 466', '93523 Dana Lane', '69602 Brown Squares Apt. 787', '75470 Lopez Roads', '206 Stewart Forest', '089 James Rest', '4217 Burton Brooks', '1428 Wilson Drives Suite 000', '485 Mays Vista', '977 Watson Plain', '69402 Joseph Junction', '159 Blake Estates Apt. 945', '241 Cross Causeway Suite 281', '5461 Ann Orchard Suite 551', '76045 Samantha Road Suite 111', '306 Corey Point', '7936 Micheal Green Apt. 635', '08731 Sanders Fords', '92137 Ward Views', '97296 Rich Park', '7389 Alec Squares Suite 508', '9269 Robbins Valley', '18013 Billy Bridge Suite 522', '8688 Audrey Springs Apt. 634', '6993 Diane Alley Apt. 489', '1744 Cruz Lights Apt. 223', '500 Butler Overpass Apt. 256', '767 Craig Tunnel', '09925 Isabel Run', '67109 Mendez Junction Apt. 942', '1830 Wallace Throughway Apt. 751', '74089 Jerry Trail', '412 Snow Manors Apt. 161', '2076 Johnson Way', '415 David Square Apt. 552', '51923 Jamie Spring', '045 Sarah Fort', '71754 Anthony Fords', '7448 White Common Apt. 894', '2235 Joel Ferry', '6029 Phillip Squares Apt. 264', '296 Walsh Corner Apt. 758', '6065 Harris Hill', '21418 Laura Mission Suite 266', '75825 Welch Corners', '40797 Jeffery Crescent Suite 892', '717 Amy Lodge Suite 014', '46896 Garcia Glen', '3910 Laura Inlet Suite 183', '64482 Amanda Loop', '447 Carroll Dam Apt. 116', '728 Gomez Mountains Suite 377', '0742 Williams Road Apt. 057', '81160 Williams River Apt. 443', '431 Murray Isle', '56627 Alexandria Curve Apt. 275', '7633 Bentley Radial Apt. 603', '50239 Salazar Squares', '6708 Carpenter Overpass Suite 735', '00250 Pena Dam Apt. 639', '373 Franklin Rest Apt. 558', '4080 Miguel Fords Suite 334', '8647 Wiggins Garden Apt. 481', '7706 Stevens Crest', '31263 Salazar Viaduct', '602 Tracy Crossroad Suite 556', '498 Jennifer Tunnel', '978 Nelson Brook Apt. 912', '34088 Trevino Crossing Suite 419', '84331 Leonard Fort Suite 749', '08855 Lisa Wells', '28123 Hudson Square Apt. 323', '9927 Christina Burg Suite 774', '82778 Padilla Common', '96354 Acevedo Fords Apt. 535', '345 Kevin Knolls Apt. 250', '003 Alexander Shoal Suite 105', '37625 Thompson Isle Suite 606', '870 Robert Loaf Apt. 082', '53230 Julia Villages', '169 Christine Mount']

0       0487 Hull Village Suite 759
1                242 Christine Glen
2               1157 Michael Island
3                   778 Brown Plaza
4             60975 Jessica Squares
                  ...              
95    003 Alexander Shoal Suite 105
96    37625 Thompson Isle Suite 606
97         870 Robert Loaf Apt. 082
98             53230 Julia Villages
99              169 Christine Mount
Name: Address, Length: 100, dtype: object


# Let's put the values into a variable
birthday = [fake.date_of_birth(minimum_age=20, maximum_age=68) for i in range(100)]
print(birthday)

colacho_df['Birthday'] = birthday
colacho_df

[datetime.date(2003, 4, 8), datetime.date(1965, 2, 27), datetime.date(1996, 10, 16), datetime.date(1995, 11, 29), datetime.date(1957, 9, 2), datetime.date(1972, 2, 16), datetime.date(1978, 2, 5), datetime.date(1980, 7, 26), datetime.date(1956, 10, 19), datetime.date(1997, 7, 12), datetime.date(1957, 6, 1), datetime.date(1988, 9, 20), datetime.date(1961, 1, 16), datetime.date(1979, 5, 2), datetime.date(1974, 11, 12), datetime.date(2000, 8, 25), datetime.date(1965, 3, 17), datetime.date(1986, 2, 13), datetime.date(1971, 1, 6), datetime.date(1989, 6, 23), datetime.date(1979, 2, 27), datetime.date(1965, 12, 27), datetime.date(1982, 6, 27), datetime.date(1977, 12, 3), datetime.date(1973, 4, 22), datetime.date(1988, 6, 12), datetime.date(1981, 8, 29), datetime.date(1955, 12, 14), datetime.date(1976, 2, 4), datetime.date(1991, 1, 13), datetime.date(1974, 7, 23), datetime.date(1992, 5, 27), datetime.date(1986, 11, 9), datetime.date(1957, 4, 28), datetime.date(1991, 2, 18), datetime.date(1993, 10, 7), datetime.date(1992, 7, 7), datetime.date(1972, 10, 21), datetime.date(2001, 9, 7), datetime.date(1957, 7, 20), datetime.date(1985, 12, 20), datetime.date(1981, 10, 30), datetime.date(1963, 2, 11), datetime.date(1982, 6, 3), datetime.date(1978, 7, 13), datetime.date(1988, 9, 12), datetime.date(1958, 6, 5), datetime.date(1956, 5, 9), datetime.date(1973, 7, 11), datetime.date(1957, 4, 11), datetime.date(1972, 3, 23), datetime.date(2001, 4, 8), datetime.date(2001, 3, 31), datetime.date(1994, 7, 20), datetime.date(2002, 10, 6), datetime.date(1974, 8, 31), datetime.date(2001, 8, 20), datetime.date(1969, 1, 14), datetime.date(1990, 12, 18), datetime.date(1990, 1, 28), datetime.date(1978, 2, 6), datetime.date(1981, 3, 20), datetime.date(1972, 1, 1), datetime.date(1969, 3, 1), datetime.date(1962, 11, 3), datetime.date(1993, 6, 18), datetime.date(1977, 5, 13), datetime.date(1971, 6, 17), datetime.date(1994, 10, 31), datetime.date(2000, 10, 19), datetime.date(1991, 3, 15), datetime.date(2001, 4, 23), datetime.date(1978, 12, 28), datetime.date(1966, 1, 3), datetime.date(1965, 5, 12), datetime.date(1977, 6, 12), datetime.date(1955, 7, 25), datetime.date(1994, 10, 1), datetime.date(1958, 6, 7), datetime.date(1993, 5, 20), datetime.date(1965, 7, 28), datetime.date(1978, 5, 24), datetime.date(1979, 8, 30), datetime.date(1974, 10, 5), datetime.date(1997, 7, 11), datetime.date(1974, 11, 30), datetime.date(1976, 12, 4), datetime.date(1988, 8, 14), datetime.date(1982, 5, 31), datetime.date(1995, 10, 28), datetime.date(1984, 4, 10), datetime.date(1966, 7, 8), datetime.date(1955, 1, 6), datetime.date(1964, 7, 10), datetime.date(1993, 8, 21), datetime.date(1957, 11, 29), datetime.date(1985, 1, 6), datetime.date(1963, 7, 5), datetime.date(1978, 3, 5), datetime.date(1955, 7, 31)]


# Convert 'birthday' column to datetime
colacho_df['Birthday'] = pd.to_datetime(colacho_df['Birthday'])

# Calculate the age based on today's date
today = datetime.now()
# age = np.random.randint(20, 68, size=100)
colacho_df['Age'] = today.year - colacho_df['Birthday'].dt.year - ((today.month < colacho_df['Birthday'].dt.month) | ((today.month == colacho_df['Birthday'].dt.month) & (today.day < colacho_df['Birthday'].dt.day)))

# Checking if everything is OK
colacho_df


# Before saving our data, let's check our data
colacho_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Name            100 non-null    object        
 1   Surname         100 non-null    object        
 2   Gender          100 non-null    object        
 3   WorkDepartment  100 non-null    object        
 4   ID              100 non-null    object        
 5   Address         100 non-null    object        
 6   Birthday        100 non-null    datetime64[ns]
 7   Age             100 non-null    int64         
dtypes: datetime64[ns](1), int64(1), object(6)
memory usage: 6.4+ KB


# Checking for missing values
colacho_df.isna()


# Let's save the data in a .csv file
colacho_df.to_csv('colacho_database.csv', index=False)


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# Let's import our data frame
cdf_rawdata = pd.read_csv('colacho_database.csv')
cdf_rawdata


# Create the new data frame in a new variable
cdf_salary = cdf_rawdata[['ID', 'Name', 'Surname', 'WorkDepartment']].copy()

# Checking if the data frame has the information needed
cdf_salary


# Salaries
salary = np.random.randint(18000, 75000, size = 100)
cdf_salary['Salary'] = salary

cdf_salary


# Create the random number generator
exp_years = np.random.uniform(0, 8, size=100)

# Convert the years of experience into months
exp_months = exp_years * 12

# Round our numbers into one decimal
exp_months_rounded = np.round(exp_months)

# Put the values into the column 'Experience'
cdf_salary['Experience'] = exp_months_rounded

# Check our data frame
cdf_salary


# Basic statistics from 'Salary' and 'Experience' columns
cdf_salary[['Salary', 'Experience']].describe()


# Visualizing our data with a scatterplot
x = cdf_salary['Experience'];
y = cdf_salary['Salary'];
plt.scatter(x = x, y = y, color='#9467bd')
#obtain m (slope) and b(intercept) of linear regression line
m, b = np.polyfit(x, y, 1)
lreg = np.corrcoef(x, y)
plt.plot(x, m*x+b, color='red')
# Labels and title
plt.xlabel('Experience')
plt.ylabel('Salary')
plt.title('Correlation between Salary and Experience ')
print(lreg)

[[ 1.         -0.02052559]
 [-0.02052559  1.        ]]


import statsmodels.formula.api as smf
model = smf.ols('Salary ~ Experience', data = cdf_salary).fit()
model.summary()


# Let's predict our salaries for each experience years
pred1 = model.predict(pd.DataFrame(cdf_salary['Experience']))
print(pred1)

0     47739.737776
1     47500.314019
2     47340.698180
3     47154.479702
4     47912.654934
          ...     
95    47487.012699
96    47633.327217
97    47540.217978
98    47367.300820
99    47287.492901
Length: 100, dtype: float64


# Regression Line
plt.scatter(x, y)
plt.plot(x, pred1, 'r')
plt.legend(['Predicted Line', 'Observed Data'])
plt.show()


# Error Calculation
res1 = y - pred1
res_sqr1 = res1 * res1
mse1 = np.mean(res_sqr1)
rmse1 = np.sqrt(mse1)
print(rmse1)

17035.522328550174


# Transformed data
# Log Transformation
plt.scatter(x = np.log(x), y = x)
np.corrcoef(np.log(x), y)
model2 = smf.ols('Salary ~ Experience', data = cdf_salary)


# Counting employees per department
cdf_salary_count = cdf_salary['WorkDepartment'].value_counts()
cdf_salary_count

HHRR          24
Marketing     24
Operations    22
Sales         17
Account       13
Name: WorkDepartment, dtype: int64


# Creating a plot to visualize the data
explode = [0.1] * len(cdf_salary_count)
cdf_salary['WorkDepartment'].value_counts().plot(
    kind='pie',
    explode=explode,
    title='Employees per department',
    autopct='%1.1f%%',
    shadow=True,
    startangle=90,
    ylabel=''
)

<Axes: title={'center': 'Employees per department'}>


cdf_salary.sort_values(['Experience', 'Salary'], ascending=False)


# Creating a dictionary to store the data from 'WorkDepartment'
grouped_departments = {}

# Group the data by each department, we also need to copy the data
for department, data in cdf_salary.groupby('WorkDepartment'):
    grouped_departments[department] = data.copy()
    
# Checking if the data is ok
print(grouped_departments)

{'Account':         ID      Name   Surname WorkDepartment  Salary  Experience
0   SZ2174      John     Smith        Account   58694        28.0
1   ZG8243     Emily   Johnson        Account   68734        46.0
19  RH0053   Abigail  Robinson        Account   20378        96.0
24  VI4934     David    Walker        Account   44257        35.0
53  FQ5081    Aubrey    Rogers        Account   24076        59.0
56  ET9072  Nicholas    Morgan        Account   23835         8.0
57  BH7650      Zoey      Bell        Account   19824        44.0
60  UM4299     Aiden    Rivera        Account   33860        80.0
61  CC6460  Brooklyn    Cooper        Account   42127        34.0
64  AM0081     Dylan    Howard        Account   30985        61.0
70  LD5111    Nathan     James        Account   72621        61.0
94  ID3572     Aaron    Bryant        Account   52413        29.0
96  NU0471   Brayden   Russell        Account   63372        36.0, 'HHRR':         ID      Name     Surname WorkDepartment  Salary  Experience
3   ZG2248     Sarah       Jones           HHRR   22344        72.0
5   BB3609   Jessica       Davis           HHRR   59021        71.0
9   PH5293    Sophia      Taylor           HHRR   73316        64.0
13  MZ3212      Emma       White           HHRR   42158         3.0
15  CF9510  Isabella      Martin           HHRR   36044        82.0
17  OE3003       Mia      Garcia           HHRR   41203         9.0
20  GL5268    Robert       Clark           HHRR   65214        37.0
25  SL6383    Evelyn        Hall           HHRR   57049        52.0
29  TG4843     Grace        King           HHRR   40898        19.0
30  QA5493     Ethan      Wright           HHRR   48410        67.0
31  JZ5680     Chloe       Lopez           HHRR   66649        32.0
32  ZR8804    Steven        Hill           HHRR   21436        34.0
43  WL5614   Madison      Turner           HHRR   20503        90.0
49  IT0440   Lillian     Collins           HHRR   73589        45.0
51  TZ8429   Addison     Sanchez           HHRR   58526        53.0
54  AB2695  Benjamin        Reed           HHRR   72186        86.0
62  VI0208      Adam  Richardson           HHRR   47708        28.0
68  AK0475     Caleb        Gray           HHRR   37027        38.0
74  LX8932   Zachary     Sanders           HHRR   19656        35.0
78  IO9348      Luke      Barnes           HHRR   41401        45.0
86  BD4756    Julian   Patterson           HHRR   56282        49.0
88  LO3156   Gabriel      Flores           HHRR   22806        91.0
98  QH3190    Oliver        Diaz           HHRR   54154        56.0
99  JM2926      Anna       Hayes           HHRR   53523        62.0, 'Marketing':         ID         Name    Surname WorkDepartment  Salary  Experience
8   KP5874  Christopher      Moore      Marketing   57277        10.0
10  AA4082      Matthew   Anderson      Marketing   36958        22.0
11  JL5409          Ava     Thomas      Marketing   37018        63.0
28  EL3126       Thomas  Hernandez      Marketing   60532        22.0
33  IO2124         Ella      Scott      Marketing   70426        58.0
36  TT6646         Ryan      Baker      Marketing   22786        13.0
40  KR3331       Edward   Mitchell      Marketing   66309        91.0
42  TR6921       George    Roberts      Marketing   53399        85.0
44  YQ4003       Samuel   Phillips      Marketing   32291        73.0
45  XT7996         Lily   Campbell      Marketing   67620        74.0
46  PK4672      Jackson     Parker      Marketing   62556        22.0
50  YN0851         Owen    Stewart      Marketing   60790        84.0
52  XQ0649    Sebastian     Morris      Marketing   56539        43.0
55  EL9384      Natalie       Cook      Marketing   43830        74.0
58  UG0788       Daniel     Murphy      Marketing   21697        92.0
59  OD1273       Hannah     Bailey      Marketing   43673        26.0
65  PA5532         Nora       Ward      Marketing   67424        12.0
69  SH1351       Audrey    Ramirez      Marketing   22675        24.0
71  TE5901       Claire     Watson      Marketing   51629        83.0
83  HV9147     Penelope      Perry      Marketing   42683        56.0
84  EO8653         Liam     Powell      Marketing   28088        81.0
90  VP4724         Levi     Butler      Marketing   66137        83.0
91  PL0038         Maya    Simmons      Marketing   43512        40.0
92  WV2546        Jaxon     Foster      Marketing   58383        87.0, 'Operations':         ID       Name     Surname WorkDepartment  Salary  Experience
4   JE3473      David       Brown     Operations   21531        15.0
14  HI1614      James      Harris     Operations   31624        77.0
18  LB3872    Anthony    Martinez     Operations   53429        74.0
21  FW2732  Charlotte   Rodriguez     Operations   70769        13.0
23  JW8178     Amelia         Lee     Operations   54090        23.0
26  VH1780  Alexander       Allen     Operations   71166        58.0
27  GF2719     Harper       Young     Operations   50153        11.0
38  VY9916    Charles      Nelson     Operations   40021        72.0
41  BZ1996   Scarlett       Perez     Operations   55770        38.0
47  AM7483       Aria       Evans     Operations   49301        33.0
63  JN5295     Stella         Cox     Operations   63938        79.0
66  BV8467      Wyatt      Torres     Operations   26346        53.0
72  WL2565     Jayden      Brooks     Operations   57930        62.0
73  XQ3462      Ellie       Kelly     Operations   32464        90.0
75  BV1209   Savannah       Price     Operations   20713         6.0
76  NA2508     Carter     Bennett     Operations   48031        59.0
77  IW6082     Skylar        Wood     Operations   69755        83.0
85  MU4083     Camila        Long     Operations   48793        39.0
87  WG6354     Ariana      Hughes     Operations   60065        20.0
89  NW0984    Kennedy  Washington     Operations   26405        81.0
95  SU8337    Madelyn   Alexander     Operations   19536        47.0
97  HT8585      Elena     Griffin     Operations   49178        43.0, 'Sales':         ID       Name    Surname WorkDepartment  Salary  Experience
2   LB2444    Michael   Williams          Sales   72195        58.0
6   GD4300     Daniel     Miller          Sales   64451        11.0
7   NK9946     Olivia     Wilson          Sales   66589        49.0
12  EF3588     Andrew    Jackson          Sales   26255         5.0
16  QT6110     Joseph   Thompson          Sales   55506        84.0
22  ME7089    William      Lewis          Sales   67646        37.0
34  ER4294    Richard      Green          Sales   70584        79.0
35  WR3782   Victoria      Adams          Sales   48703        41.0
37  MW3541      Avery   Gonzalez          Sales   59561        17.0
39  BU9606      Sofia     Carter          Sales   58297        50.0
48  FY3880      Henry    Edwards          Sales   29514        87.0
67  HX0309       Leah   Peterson          Sales   18183        45.0
79  VC2960      Riley       Ross          Sales   72681        31.0
80  FV7585      Isaac  Henderson          Sales   27565         8.0
81  YB9747    Aaliyah    Coleman          Sales   34310        85.0
82  SB3766  Christian    Jenkins          Sales   60564        47.0
93  JI1684  Gabriella   Gonzales          Sales   29843        91.0}


# Now we can create our data frame for each department by accessing our dictionary
sales_df = grouped_departments['Sales']
account_df = grouped_departments['Account']
hhrr_df = grouped_departments['HHRR']
operations_df = grouped_departments['Operations']
marketing_df = grouped_departments['Marketing']
sales_df


# Let's count the employees per department using a function
def countEmployees(df):
    return df.shape[0]

sales_count = countEmployees(sales_df)
account_count = countEmployees(account_df)
hhrr_count = countEmployees(hhrr_df)
operations_count = countEmployees(operations_df)
marketing_count = countEmployees(marketing_df)

print(
    "Number of employees in Sales: ", sales_count, "\n",
    "Number of employees in HHRR: ", hhrr_count, "\n",
    "Number of employees in Account: ", account_count, "\n",
    "Number of employees in Marketing: ", marketing_count, "\n",
    "Number of employees in Operations: ", operations_count,
    )

Number of employees in Sales:  17 
 Number of employees in HHRR:  24 
 Number of employees in Account:  13 
 Number of employees in Marketing:  24 
 Number of employees in Operations:  22


# Using a scatter plot to visualize the correlation between Salary and Experience
x = sales_df['Experience']
y = sales_df['Salary']
color = '#9ba64b'
plot = plt.scatter(x = x, y = y, color=color)
#obtain m (slope) and b(intercept) of linear regression line
m, b = np.polyfit(x, y, 1)
lreg = np.corrcoef(x, y)
plt.plot(x, m*x+b, color='red')
# Labels and title
plt.xlabel('Experience')
plt.ylabel('Salary')
plt.suptitle('Sales Department', style='italic', fontsize=14, color='red')
plt.title('Correlation between Salary and Experience ')
print(lreg)

[[ 1.         -0.06433463]
 [-0.06433463  1.        ]]


# Using a scatter plot to visualize the correlation between Salary and Experience
x = hhrr_df['Experience']
y = hhrr_df['Salary']
color = '#9ba64b'
plot = plt.scatter(x = x, y = y, color=color)
#obtain m (slope) and b(intercept) of linear regression line
m, b = np.polyfit(x, y, 1)
lreg = np.corrcoef(x, y)
plt.plot(x, m*x+b, color='red')
# Labels and title
plt.xlabel('Experience')
plt.ylabel('Salary')
plt.suptitle('HHRR Department', style='italic', fontsize=14, color='red')
plt.title('Correlation between Salary and Experience ')
print(lreg)

[[ 1.         -0.03556674]
 [-0.03556674  1.        ]]


x = account_df['Experience']
y = account_df['Salary']
color = '#9ba64b'
plot = plt.scatter(x = x, y = y, color=color)
#obtain m (slope) and b(intercept) of linear regression line
m, b = np.polyfit(x, y, 1)
lreg = np.corrcoef(x, y)
plt.plot(x, m*x+b, color='red')
# Labels and title
plt.xlabel('Experience')
plt.ylabel('Salary')
plt.suptitle('Account Department', style='italic', fontsize=14, color='red')
plt.title('Correlation between Salary and Experience ')
print(lreg)

[[ 1.         -0.24160341]
 [-0.24160341  1.        ]]


x = marketing_df['Experience']
y = marketing_df['Salary']
color = '#9ba64b'
plot = plt.scatter(x = x, y = y, color=color)
#obtain m (slope) and b(intercept) of linear regression line
m, b = np.polyfit(x, y, 1)
lreg = np.corrcoef(x, y)
plt.plot(x, m*x+b, color='red')
# Labels and title
plt.xlabel('Experience')
plt.ylabel('Salary')
plt.suptitle('Marketing Department', style='italic', fontsize=14, color='red')
plt.title('Correlation between Salary and Experience ')
print(lreg)

[[1.         0.07352061]
 [0.07352061 1.        ]]


x = operations_df['Experience']
y = operations_df['Salary']
color = '#9ba64b'
plot = plt.scatter(x = x, y = y, color=color)
#obtain m (slope) and b(intercept) of linear regression line
m, b = np.polyfit(x, y, 1)
lreg = np.corrcoef(x, y)
plt.plot(x, m*x+b, color='red')
# Labels and title
plt.xlabel('Experience')
plt.ylabel('Salary')
plt.suptitle('Operations Department', style='italic', fontsize=14, color='red')
plt.title('Correlation between Salary and Experience ')
print(lreg)

[[1.         0.02472314]
 [0.02472314 1.        ]]

	Salary	Experience
count	100.000000	100.000000
mean	47439.660000	50.560000
std	17124.951892	26.425928
min	18183.000000	3.000000
25%	32124.250000	30.500000
50%	49239.500000	49.000000
75%	60620.500000	74.000000
max	73589.000000	96.000000

Dep. Variable:	Salary	R-squared:	0.000
Model:	OLS	Adj. R-squared:	-0.010
Method:	Least Squares	F-statistic:	0.04130
Date:	Wed, 13 Dec 2023	Prob (F-statistic):	0.839
Time:	14:42:02	Log-Likelihood:	-1116.2
No. Observations:	100	AIC:	2236.
Df Residuals:	98	BIC:	2242.
Df Model:	1
Covariance Type:	nonrobust

	coef	std err	t	P>\|t\|	[0.025	0.975]
Intercept	4.811e+04	3729.751	12.900	0.000	4.07e+04	5.55e+04
Experience	-13.3013	65.448	-0.203	0.839	-143.180	116.578

Omnibus:	41.015	Durbin-Watson:	1.868
Prob(Omnibus):	0.000	Jarque-Bera (JB):	7.046
Skew:	-0.198	Prob(JB):	0.0295
Kurtosis:	1.762	Cond. No.	124.

People Analytic Project - Colacho Company, Inc¶

1. Import Libraries¶

2. Create the data frame¶

2.1 Create new columns¶

2.1.a Work Departments Column¶

2.1.b ID column¶

2.1.c Address Column¶

2.1.d Birthday Column¶

2.1.e Age column¶

2.1.f Saving our dataset¶

People Analytic Project - Colacho Company, Inc (Part 2)¶

Salary Database¶

3. Import libraries¶

4. Import our database¶

4.1 Salaries data frame¶

4.2 Create the new columns¶

4.2.a Salary¶

4.2.b Experience¶

4.3 Some numbers¶

4.3.a Correlation between Salary and Experience¶

4.3.b Import the statmodels formula¶

4.3.c Creating our prediction model¶

4.4 Separating data¶

4.5 Correlation between Salary and Experience per department¶

	Name	Surname	Gender	WorkDepartment	ID
0	John	Smith	male	Account	SZ2174
1	Emily	Johnson	female	Account	ZG8243
2	Michael	Williams	male	Sales	LB2444
3	Sarah	Jones	female	HHRR	ZG2248
4	David	Brown	male	Operations	JE3473
...	...	...	...	...	...
95	Madelyn	Alexander	female	Operations	SU8337
96	Brayden	Russell	male	Account	NU0471
97	Elena	Griffin	female	Operations	HT8585
98	Oliver	Diaz	male	HHRR	QH3190
99	Anna	Hayes	female	HHRR	JM2926

	Name	Surname	Gender	WorkDepartment	ID	Address	Birthday	Age
0	False	False	False	False	False	False	False	False
1	False	False	False	False	False	False	False	False
2	False	False	False	False	False	False	False	False
3	False	False	False	False	False	False	False	False
4	False	False	False	False	False	False	False	False
...	...	...	...	...	...	...	...	...
95	False	False	False	False	False	False	False	False
96	False	False	False	False	False	False	False	False
97	False	False	False	False	False	False	False	False
98	False	False	False	False	False	False	False	False
99	False	False	False	False	False	False	False	False

	ID	Name	Surname	WorkDepartment	Salary	Experience
19	RH0053	Abigail	Robinson	Account	20378	96.0
58	UG0788	Daniel	Murphy	Marketing	21697	92.0
40	KR3331	Edward	Mitchell	Marketing	66309	91.0
93	JI1684	Gabriella	Gonzales	Sales	29843	91.0
88	LO3156	Gabriel	Flores	HHRR	22806	91.0
...	...	...	...	...	...	...
80	FV7585	Isaac	Henderson	Sales	27565	8.0
56	ET9072	Nicholas	Morgan	Account	23835	8.0
75	BV1209	Savannah	Price	Operations	20713	6.0
12	EF3588	Andrew	Jackson	Sales	26255	5.0
13	MZ3212	Emma	White	HHRR	42158	3.0