23. wordcount output
CREATE TABLE producer (word STRING , freq INT) PARTITIONED BY (dt STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY 't' STORED AS TEXTFILE ;
SHOW TABLES ;
DESCRIBE producer ;
LOAD DATA INPATH '/user/hamburgerkid/hive/output/producer/part*'
INTO TABLE producer PARTITION (dt='20100526') ;
SELECT * FROM producer
WHERE LENGTH(word) > 3 AND freq > 1 SORT BY freq DESC LIMIT 10 ;
EXPLAIN SELECT * FROM producer
WHERE LENGTH(word) > 3 AND freq > 1 SORT BY freq DESC LIMIT 10 ;
hadoop fs -ls /user/hive/warehouse/producer/dt=20100526/
hadoop fs -ls /user/hamburgerkid/hive/output/producer/
24. wordcount output
CREATE TABLE app_eng (word STRING , freq INT) PARTITIONED BY (dt STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY 't' STORED AS TEXTFILE ;
LOAD DATA INPATH '/user/hamburgerkid/hive/output/app_eng/part*'
INTO TABLE app_eng PARTITION (dt='20100526') ;
25. Table (JOIN)
CREATE TABLE develop (word STRING, p_freq INT, e_freq INT) ;
INSERT OVERWRITE TABLE develop
SELECT p.word, p.freq, e.freq FROM producer p
JOIN app_eng e ON (p.word = e.word)
WHERE p.freq > 1 AND e.freq > 1 ;
SELECT word, p_freq, e_freq, (p_freq + e_freq) AS ttl FROM develop
WHERE LENGTH(word) > 3 SORT BY ttl DESC LIMIT 10 ;
26. (OUTER JOIN)
SELECT e.word, e.freq, p.freq FROM app_eng e
LEFT OUTER JOIN producer p ON (e.word = p.word)
WHERE LENGTH(e.word) > 3 AND p.freq IS NULL
SORT BY e.freq DESC LIMIT 10 ;
34. wordcount
log = LOAD 'pig/input/app_eng' USING TextLoader() ;
flatd = FOREACH log GENERATE FLATTEN(TOKENIZE((chararray)$0)) AS word ;
grpd = GROUP flatd BY word ;
cntd = FOREACH grpd GENERATE COUNT(flatd) , group ;
STORE cntd INTO 'pig/output/app_eng' ;
hadoop fs -ls /user/hamburgerkid/pig/output/app_eng
hadoop fs -cat /user/hamburgerkid/pig/output/app_eng/part*
35. wordcount
log = LOAD 'pig/input/producer' USING TextLoader() ;
flatd = FOREACH log GENERATE FLATTEN(TOKENIZE((chararray)$0)) AS word ;
grpd = GROUP flatd BY word ;
cntd = FOREACH grpd GENERATE COUNT(flatd) , group ;
STORE cntd INTO 'pig/output/producer' ;
hadoop fs -ls /user/hamburgerkid/pig/output/producer
hadoop fs -cat /user/hamburgerkid/pig/output/producer/part*
36. (JOIN)
eng
= LOAD 'pig/output/app_eng' AS (freq , word) ;
pro
= LOAD 'pig/output/producer' AS (freq , word) ;
cg = COGROUP eng BY word , pro BY word ;
flatd = FOREACH cg GENERATE FLATTEN(eng) , FLATTEN(pro.freq) AS freq2 ;
ttld = FOREACH flatd GENERATE word , SIZE(word) AS size , freq , freq2 , (freq + freq2) AS total ;
fltrd = FILTER ttld BY freq > 1 AND freq2 > 1 AND size > 3L ;
odrd = LIMIT (ORDER fltrd BY total DESC) 10 ;
DUMP odrd ;
STORE odrd INTO 'pig/output/develop' ;
hadoop fs -ls pig/output/develop
hadoop fs -cat pig/output/develop/part*
37. (OUTER JOIN)
eng
= LOAD 'pig/output/app_eng' AS (freq , word) ;
pro
= LOAD 'pig/output/producer' AS (freq , word) ;
cg = COGROUP eng BY word , pro BY word ;
outrd = FILTER cg BY COUNT(eng) == 0 ;
flatd = FOREACH outrd GENERATE FLATTEN(pro) ;
szd = FOREACH flatd GENERATE word , SIZE(word) AS size , freq ;
fltrd = FILTER szd BY size > 3L ;
odrd = LIMIT (ORDER fltrd BY freq DESC) 10 ;
DUMP odrd ;