Create A Bigram From A Column In Pandas Df

January 04, 2024 Post a Comment

I have this test table in a pandas DataFrame:

   Leaf_category_id  session_id  product_id
0               111           1         987
3               111           4         987
4               ... (excerpt truncated)

Solution 1:

Try this code, which builds bigrams from consecutive product IDs within each group:

from itertools import combinations
import pandas as pd

from collections import Counter


def consecutive_bigram_frequency(df: pd.DataFrame) -> pd.DataFrame:
    """Count bigrams of consecutive product IDs within each group.

    Rows are grouped by ``Leaf_category_id`` and ``session_id``. Inside each
    group, every pair of adjacent ``product_id`` values (in row order) forms
    a bigram. Each bigram is sorted, so ``(a, b)`` and ``(b, a)`` are counted
    as the same pair.

    Args:
        df: Frame with ``Leaf_category_id``, ``session_id`` and
            ``product_id`` columns.

    Returns:
        A frame with columns ``Bigram`` (tuple) and ``freq`` (count), one
        row per distinct bigram, sorted by ``Bigram``.
    """
    counts = Counter()
    grouped = df.groupby(['Leaf_category_id', 'session_id'])['product_id']
    for _, product_ids in grouped:
        ids = product_ids.tolist()
        # zip(ids, ids[1:]) pairs each element with its successor; a group
        # with fewer than two rows contributes no bigrams.
        counts.update(tuple(sorted(pair)) for pair in zip(ids, ids[1:]))
    # Bigrams are unique keys, so sorting the items sorts by bigram.
    ordered = sorted(counts.items())
    return pd.DataFrame({
        'Bigram': [bigram for bigram, _ in ordered],
        'freq': [freq for _, freq in ordered],
    })


if __name__ == "__main__":
    # DataFrame.from_csv was removed in pandas 1.0; read_csv with
    # index_col=0 keeps the first column as the row index.
    df = pd.read_csv("data.csv", index_col=0)
    # consecutive
    bigram_frequency_consecutive = consecutive_bigram_frequency(df)

For combinations (all possible bigrams within each group):

from itertools import combinations
import pandas as pd

from collections import Counter


def combination_bigram_frequency(df: pd.DataFrame) -> pd.DataFrame:
    """Count every unordered pair of product IDs within each group.

    Rows are grouped by ``Leaf_category_id`` and ``session_id``. Inside each
    group, all 2-element combinations of the ``product_id`` values form
    bigrams. Each bigram is sorted, so ``(a, b)`` and ``(b, a)`` are counted
    as the same pair.

    Args:
        df: Frame with ``Leaf_category_id``, ``session_id`` and
            ``product_id`` columns.

    Returns:
        A frame with columns ``Bigram`` (tuple) and ``freq`` (count), one
        row per distinct bigram, sorted by ``Bigram``.
    """
    counts = Counter()
    grouped = df.groupby(['Leaf_category_id', 'session_id'])['product_id']
    for _, product_ids in grouped:
        # combinations() yields nothing for groups with fewer than two rows.
        counts.update(
            tuple(sorted(pair)) for pair in combinations(product_ids.tolist(), 2)
        )
    # Bigrams are unique keys, so sorting the items sorts by bigram.
    ordered = sorted(counts.items())
    return pd.DataFrame({
        'Bigram': [bigram for bigram, _ in ordered],
        'freq': [freq for _, freq in ordered],
    })


if __name__ == "__main__":
    # DataFrame.from_csv was removed in pandas 1.0; read_csv with
    # index_col=0 keeps the first column as the row index.
    df = pd.read_csv("data.csv", index_col=0)
    # combinations
    bigram_frequency_combinations = combination_bigram_frequency(df)

where data.csv contains (the first, unnamed column of each row is the row index):

Leaf_category_id,session_id,product_id
0,111,1,111
3,111,4,987
4,111,1,741
1,222,2,654
2,333,3,321
5,111,1,87
6,111,1,34
7,111,1,12
8,111,1,987
9,111,4,1232
10,222,2,12
11,222,2,324
12,222,2,465
13,222,2,342
14,222,2,32
15,333,3,321
16,333,3,741
17,333,3,987
18,333,3,324
19,333,3,654
20,333,3,862
21,222,1,123
22,222,1,987
23,222,1,741
24,222,1,34
25,222,1,12

The resultant bigram_frequency_consecutive will be

         Bigram  freq
0      (12, 34)     2
1     (12, 324)     1
2     (12, 654)     1
3     (12, 987)     1
4     (32, 342)     1
5      (34, 87)     1
6     (34, 741)     1
7     (87, 741)     1
8    (111, 741)     1
9    (123, 987)     1
10   (321, 321)     1
11   (321, 741)     1
12   (324, 465)     1
13   (324, 654)     1
14   (324, 987)     1
15   (342, 465)     1
16   (654, 862)     1
17   (741, 987)     2
18  (987, 1232)     1

The resultant bigram_frequency_combinations will be

Baca Juga

         Bigram  freq
0      (12, 32)     1
1      (12, 34)     2
2      (12, 87)     1
3     (12, 111)     1
4     (12, 123)     1
5     (12, 324)     1
6     (12, 342)     1
7     (12, 465)     1
8     (12, 654)     1
9     (12, 741)     2
10    (12, 987)     2
11    (32, 324)     1
12    (32, 342)     1
13    (32, 465)     1
14    (32, 654)     1
15     (34, 87)     1
16    (34, 111)     1
17    (34, 123)     1
18    (34, 741)     2
19    (34, 987)     2
20    (87, 111)     1
21    (87, 741)     1
22    (87, 987)     1
23   (111, 741)     1
24   (111, 987)     1
25   (123, 741)     1
26   (123, 987)     1
27   (321, 321)     1
28   (321, 324)     2
29   (321, 654)     2
30   (321, 741)     2
31   (321, 862)     2
32   (321, 987)     2
33   (324, 342)     1
34   (324, 465)     1
35   (324, 654)     2
36   (324, 741)     1
37   (324, 862)     1
38   (324, 987)     1
39   (342, 465)     1
40   (342, 654)     1
41   (465, 654)     1
42   (654, 741)     1
43   (654, 862)     1
44   (654, 987)     1
45   (741, 862)     1
46   (741, 987)     3
47   (862, 987)     1
48  (987, 1232)     1

In the above case, the rows are grouped by both Leaf_category_id and session_id before the bigrams are built.

Solution 2:

We are going to pull out the values from product_id, create bigrams that are sorted and thus deduplicated, and count them to get the frequency, and then populate a data frame.

from collections import Counter

# assuming pandas is imported as pd and your data frame is called 'df'

# Pair every product id with its successor over the whole column (no
# grouping); product_id holds scalar ids, so the pairing must run across
# the list rather than inside each element.
product_ids = df.product_id.values.tolist()
bigrams = list(zip(product_ids, product_ids[1:]))
# Sort each pair so that (a, b) and (b, a) count as the same bigram.
bigram_set = [tuple(sorted(pair)) for pair in bigrams]
freq_dict = Counter(bigram_set)
# Iterating a Counter yields keys only, so build the two columns from its
# keys and values explicitly to get the counts into 'freq'.
df_freq = pd.DataFrame({
    'bigram': list(freq_dict.keys()),
    'freq': list(freq_dict.values()),
})

Python Playground

Create A Bigram From A Column In Pandas Df

Solution 1:

Solution 2:

Post a Comment for "Create A Bigram From A Column In Pandas Df"