1.一個文件中,存儲有10億個單詞(數(shù)字+字母組成,每個單詞小于16Byte),每行一個,求出現(xiàn)頻率最高的10個單詞。
算法一:分而治之 + Hash
- 10億個單詞,不能完全加載到內存中處理
- 采用“分而治之”的思想,按照單詞的hash值,將單詞存儲在多個文件中
- 對于每一個小文件,可以構建一個單詞為key,出現(xiàn)次數(shù)為value的Hash map,同時記錄當前出現(xiàn)次數(shù)最高的10個單詞
- 可以得到多個小文件中的出現(xiàn)次數(shù)最多的10個單詞,再依據(jù)常規(guī)的排序算法得到總體上出現(xiàn)次數(shù)最多的10個單詞
//算法一:“分而治之 + Hash”的實現(xiàn)
package com.yunfeng;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.*;
public class BigDataSort {
    /** Number of partition files the big input file is split into. */
    public final static int FILE_NUM = 10;

    /**
     * Returns the (up to) 10 most frequent words in a single partition file.
     * Builds a word -> occurrence-count map, pours it into a TreeSet ordered by
     * WordEntity's comparator (count descending, then word ascending), and takes
     * the first 10 entries.
     *
     * @param file partition file, one word per line
     * @return up to 10 WordEntity objects with the highest counts; empty on I/O error
     */
    public static Set<WordEntity> getTop10Word(File file) {
        Map<String, Integer> map = new HashMap<String, Integer>();
        Set<WordEntity> setTop10 = new TreeSet<WordEntity>();
        // try-with-resources: the original leaked the reader on every call
        try (BufferedReader br = new BufferedReader(new FileReader(file))) {
            String s;
            while ((s = br.readLine()) != null) {
                // single lookup instead of the original's two map.get(s) calls
                Integer prev = map.get(s);
                map.put(s, prev == null ? 1 : prev + 1);
            }
            // TreeSet sorts entities most-frequent-first (see WordEntity.compareTo)
            Set<WordEntity> sorted = new TreeSet<WordEntity>();
            for (Map.Entry<String, Integer> e : map.entrySet()) {
                sorted.add(new WordEntity(e.getKey(), e.getValue()));
            }
            int taken = 0;
            for (WordEntity w : sorted) {
                setTop10.add(w);
                if (++taken == 10) {
                    break;
                }
            }
        } catch (IOException e) {
            // FileNotFoundException is an IOException; one catch covers both
            e.printStackTrace();
        }
        return setTop10;
    }

    /**
     * Splits the big input file into FILE_NUM partition files ("file_0".."file_9")
     * by word hash, so all occurrences of a word land in the same partition and
     * per-partition counts are globally exact.
     *
     * @param file name of the big input file, one word per line
     * @return the FILE_NUM partition files (entries may be null on early I/O failure)
     */
    public static File[] cutFileByHash(String file) {
        File[] files = new File[FILE_NUM];
        // try-with-resources: the original never closed the input reader
        try (BufferedReader br = new BufferedReader(new FileReader(file))) {
            BufferedWriter[] brs = new BufferedWriter[FILE_NUM];
            try {
                for (int i = 0; i < FILE_NUM; i++) {
                    files[i] = new File("file_" + i);
                    brs[i] = new BufferedWriter(new FileWriter(files[i]));
                }
                String s;
                while ((s = br.readLine()) != null) {
                    // modulo first keeps the value in (-FILE_NUM, FILE_NUM), so
                    // Math.abs is safe even for Integer.MIN_VALUE hash codes
                    int j = Math.abs(s.hashCode() % FILE_NUM);
                    brs[j].write(s);
                    brs[j].newLine();
                }
            } finally {
                for (BufferedWriter b : brs) {
                    if (b != null) { // slot stays null if an earlier open threw
                        try {
                            b.close();
                        } catch (IOException e) {
                            e.printStackTrace();
                        }
                    }
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return files;
    }

    /**
     * Entry point: partitions "123.txt" by hash, takes the top 10 of each
     * partition, merges them, and prints the overall top 10.
     */
    public static void main(String[] args) {
        File[] files = cutFileByHash("123.txt"); // name of the big input file
        // merged candidates stay sorted most-frequent-first
        Set<WordEntity> set = new TreeSet<WordEntity>();
        for (File file : files) {
            set.addAll(getTop10Word(file));
        }
        int count = 1;
        for (Iterator<WordEntity> it = set.iterator(); it.hasNext(); ) {
            WordEntity wordEntity = it.next();
            System.out.println("The top ten word is :" + wordEntity.getKey()
                    + ";frequence is " + wordEntity.getCount());
            if (count == 10) {
                break;
            }
            count++;
        }
    }
}
class WordEntity implements Comparable<WordEntity> {//單詞實體
private String key;
private Integer count;
public WordEntity (String key, Integer count) {
this.key = key;
this.count = count;
}
public int compareTo(WordEntity o) {
int cmp = count.intValue() - o.count.intValue();
return (cmp == 0 ? key.compareTo(o.key) : -cmp);
}
public String getKey(){
return key;
}
public Integer getCount(){
return count;
}
}