Create A Bigram From A Column In Pandas Df
I have this test table in a pandas DataFrame (columns: Leaf_category_id, session_id, product_id): 0 111 1 987 3 111 4 987 4
Solution 1:
Try this code:
from itertools import combinations
import pandas as pd
# DataFrame.from_csv was removed in pandas 1.0; read_csv with index_col=0
# keeps its default of using the first CSV column as the row index.
df = pd.read_csv("data.csv", index_col=0)


#consecutive
def _consecutive_pairs(product_ids):
    """Return the adjacent product-id pairs of one group as sorted tuples.

    Each pair is sorted so that (a, b) and (b, a) count as the same bigram.
    """
    # Work on a plain list so the [1:] slice is unambiguously positional.
    ids = product_ids.tolist()
    return [tuple(sorted(pair)) for pair in zip(ids, ids[1:])]


# One row per (Leaf_category_id, session_id) group, holding that group's
# list of consecutive bigrams in the product_id column.
grouped_consecutive_product_ids = (
    df.groupby(['Leaf_category_id', 'session_id'])['product_id']
    .apply(_consecutive_pairs)
    .reset_index()
)
df1 = pd.DataFrame(grouped_consecutive_product_ids)

# Spread each group's list over columns, then unstack into one long Series
# with a single bigram per entry; groups with fewer bigrams leave NaN
# padding, which dropna() removes.
s = df1.product_id.apply(lambda x: pd.Series(x)).unstack()
df2 = pd.DataFrame(s.reset_index(level=0, drop=True)).dropna()
df2.rename(columns={0: 'Bigram'}, inplace=True)

# Annotate every row with the total number of occurrences of its bigram.
df2["freq"] = df2.groupby('Bigram')['Bigram'].transform('count')

# Keep one row per distinct bigram, ordered by bigram, with a fresh 0..n-1
# index (drop=True avoids creating and then deleting an "index" column).
bigram_frequency_consecutive = (
    df2.drop_duplicates(keep="first")
    .sort_values("Bigram")
    .reset_index(drop=True)
)
For combinations (all possible bigrams within each group), use this instead:
from itertools import combinations
import pandas as pd
# DataFrame.from_csv was removed in pandas 1.0; read_csv with index_col=0
# keeps its default of using the first CSV column as the row index.
df = pd.read_csv("data.csv", index_col=0)


#combinations
def _all_pairs(product_ids):
    """Return every 2-combination of one group's product ids as sorted tuples.

    Each pair is sorted so that (a, b) and (b, a) count as the same bigram.
    """
    return [tuple(sorted(pair)) for pair in combinations(product_ids.tolist(), 2)]


# One row per (Leaf_category_id, session_id) group, holding that group's
# list of all possible bigrams in the product_id column.
grouped_combination_product_ids = (
    df.groupby(['Leaf_category_id', 'session_id'])['product_id']
    .apply(_all_pairs)
    .reset_index()
)
df1 = pd.DataFrame(grouped_combination_product_ids)

# Spread each group's list over columns, then unstack into one long Series
# with a single bigram per entry; groups with fewer bigrams leave NaN
# padding, which dropna() removes.
s = df1.product_id.apply(lambda x: pd.Series(x)).unstack()
df2 = pd.DataFrame(s.reset_index(level=0, drop=True)).dropna()
df2.rename(columns={0: 'Bigram'}, inplace=True)

# Annotate every row with the total number of occurrences of its bigram.
df2["freq"] = df2.groupby('Bigram')['Bigram'].transform('count')

# Keep one row per distinct bigram, ordered by bigram, with a fresh 0..n-1
# index (drop=True avoids creating and then deleting an "index" column).
bigram_frequency_combinations = (
    df2.drop_duplicates(keep="first")
    .sort_values("Bigram")
    .reset_index(drop=True)
)
where data.csv
contains
Leaf_category_id,session_id,product_id
0,111,1,111
3,111,4,987
4,111,1,741
1,222,2,654
2,333,3,321
5,111,1,87
6,111,1,34
7,111,1,12
8,111,1,987
9,111,4,1232
10,222,2,12
11,222,2,324
12,222,2,465
13,222,2,342
14,222,2,32
15,333,3,321
16,333,3,741
17,333,3,987
18,333,3,324
19,333,3,654
20,333,3,862
21,222,1,123
22,222,1,987
23,222,1,741
24,222,1,34
25,222,1,12
The resultant bigram_frequency_consecutive
will be
    Bigram        freq
0   (12, 34)      2
1   (12, 324)     1
2   (12, 654)     1
3   (12, 987)     1
4   (32, 342)     1
5   (34, 87)      1
6   (34, 741)     1
7   (87, 741)     1
8   (111, 741)    1
9   (123, 987)    1
10  (321, 321)    1
11  (321, 741)    1
12  (324, 465)    1
13  (324, 654)    1
14  (324, 987)    1
15  (342, 465)    1
16  (654, 862)    1
17  (741, 987)    2
18  (987, 1232)   1
The resultant bigram_frequency_combinations
will be
    Bigram        freq
0   (12, 32)      1
1   (12, 34)      2
2   (12, 87)      1
3   (12, 111)     1
4   (12, 123)     1
5   (12, 324)     1
6   (12, 342)     1
7   (12, 465)     1
8   (12, 654)     1
9   (12, 741)     2
10  (12, 987)     2
11  (32, 324)     1
12  (32, 342)     1
13  (32, 465)     1
14  (32, 654)     1
15  (34, 87)      1
16  (34, 111)     1
17  (34, 123)     1
18  (34, 741)     2
19  (34, 987)     2
20  (87, 111)     1
21  (87, 741)     1
22  (87, 987)     1
23  (111, 741)    1
24  (111, 987)    1
25  (123, 741)    1
26  (123, 987)    1
27  (321, 321)    1
28  (321, 324)    2
29  (321, 654)    2
30  (321, 741)    2
31  (321, 862)    2
32  (321, 987)    2
33  (324, 342)    1
34  (324, 465)    1
35  (324, 654)    2
36  (324, 741)    1
37  (324, 862)    1
38  (324, 987)    1
39  (342, 465)    1
40  (342, 654)    1
41  (465, 654)    1
42  (654, 741)    1
43  (654, 862)    1
44  (654, 987)    1
45  (741, 862)    1
46  (741, 987)    3
47  (862, 987)    1
48  (987, 1232)   1
In the above case the bigrams are built within groups defined by both Leaf_category_id and session_id.
Solution 2:
We are going to pull out the values from product_id, create bigrams that are sorted and thus deduplicated, count them to get the frequency, and then populate a data frame.
from collections import Counter
# assuming your data frame is called 'df'
# NOTE(review): assumes product_id holds one scalar id per row, so bigrams
# are formed from neighbouring rows of the column. Calling zip(x, x[1:]) on
# each individual id raises TypeError because a scalar is not subscriptable.
product_ids = df.product_id.values.tolist()
bigrams = list(zip(product_ids, product_ids[1:]))

# Sort each pair so (a, b) and (b, a) are counted as the same bigram.
bigram_set = [tuple(sorted(pair)) for pair in bigrams]
freq_dict = Counter(bigram_set)

# Iterate .items() to get (bigram, count) rows; iterating the Counter itself
# yields only the keys, which would put a product id in the 'freq' column.
df_freq = pd.DataFrame(
    [list(item) for item in freq_dict.items()],
    columns=['bigram', 'freq'],
)
Post a Comment for "Create A Bigram From A Column In Pandas Df"