線程監控 - 死鎖、存活周期與 CPU 占用率

寫在前面:

大家學習知識不用死抓怎么實現,很多同學認為學了套路能做到舉一反三就不錯了,這其實還是停留在“術”的層面。大家要學會了解底層的原理自己去折騰,所以這也是為什么我們要花將近一年左右的時間,去學 NDK 去學 Linux 內核,因為很多東西網上也是搜索不到的。

監控死鎖:

主線程死鎖容易 ANR ,其他線程死鎖容易引起異常(不是閃退但會引起用戶殺死或卸載 App)。開發需求的時候我們其實很少會自己寫出死鎖( sdk 開發的除外),很多情況下都是不小心調用了第三方的或者系統的一些 API 導致的。那我們有沒有辦法把線上死鎖引起的 ANR 上報到服務器呢?或者說有沒有什么方法可以及時的監控到死鎖?先來看一個死鎖的例子:

       // First half of the deadlock demo: this thread locks deadLock1 first and then
       // asks for deadLock2, the reverse of the order used by "testThread2".
       Thread thread1 = new Thread(new Runnable() {
            @Override
            public void run() {
                synchronized (deadLock1) {
                    try {
                        // sleep_ is defined elsewhere; presumably it pauses long enough for
                        // the other thread to take its first lock (confirm).
                        sleep_(1);
                    } catch (Exception e) {
                        e.printStackTrace();
                    }
                    // Second lock is requested while deadLock1 is still held.
                    synchronized (deadLock2) {
                        Log.e("TAG","thread1");
                    }
                }
            }
        }, "testThread1");

        // Second half of the deadlock demo: this thread locks deadLock2 first and then
        // asks for deadLock1, the reverse of the order used by "testThread1".
        Thread thread2 = new Thread(new Runnable() {
            @Override
            public void run() {
                synchronized (deadLock2) {
                    try {
                        // sleep_ is defined elsewhere; presumably it pauses long enough for
                        // the other thread to take its first lock (confirm).
                        sleep_(1);
                    } catch (Exception e) {
                        e.printStackTrace();
                    }
                    // Second lock is requested while deadLock2 is still held.
                    synchronized (deadLock1) {
                        Log.e("TAG","thread2");
                    }
                }
            }
        }, "testThread2");

這是一個比較典型的死鎖例子,很多同學肉眼一般能看出來,但是到了線上我們就得做個自動分析,首先如果在本地排查,我們最好的方法是先 dump 到線程的信息:

"testThread1@5890" prio=5 tid=0x5210 nid=NA waiting for monitor entry
  java.lang.Thread.State: BLOCKED
     waiting for testThread2@5889 to release lock on <0x1709> (a java.lang.Object)
      at com.darren.optimize.day13.MainActivity$3.run(MainActivity.java:195)
      - locked <0x1708> (a java.lang.Object)
      at java.lang.Thread.run(Thread.java:784)

"testThread2@5889" prio=5 tid=0x5211 nid=NA waiting for monitor entry
  java.lang.Thread.State: BLOCKED
     waiting for testThread1@5890 to release lock on <0x1708> (a java.lang.Object)
      at com.darren.optimize.day13.MainActivity$4.run(MainActivity.java:212)
      - locked <0x1709> (a java.lang.Object)
      at java.lang.Thread.run(Thread.java:784)

如果我們能拿到線程在等待哪個鎖釋放,當前持有哪個鎖這兩個信息的話,那么一切就能迎刃而解了。上期有說的在 java 層是無法做到的,但是我們分析了線程創建的底層原理后在 Native 層找到了答案:

http://androidxref.com/9.0.0_r3/xref/art/runtime/monitor.cc

// Which lock the given thread is currently contending for.
//
// Returns the object from thread->GetMonitorEnterObject() when that is non-null;
// otherwise, under the thread's wait mutex, the object of thread->GetWaitMonitor().
// Returns nullptr when neither is set.
mirror::Object* Monitor::GetContendedMonitor(Thread* thread) {
    // This is used to implement JDWP's ThreadReference.CurrentContendedMonitor, and has a bizarre
    // definition of contended that includes a monitor a thread is trying to enter...
    mirror::Object* result = thread->GetMonitorEnterObject();
    if (result == nullptr) {
        // ...but also a monitor that the thread is waiting on.
        MutexLock mu(Thread::Current(), *thread->GetWaitMutex());
        Monitor* monitor = thread->GetWaitMonitor();
        if (monitor != nullptr) {
            result = monitor->GetObject();
        }
    }
    return result;
}

// Which thread currently holds the lock of the given object.
//
// Dispatches on the state of the object's lock word:
//   - kHashCode / kUnlocked: nobody owns it, returns ThreadList::kInvalidThreadId.
//   - kThinLocked: the owner id comes straight from the lock word.
//   - kFatLocked: the owner id comes from the Monitor the lock word refers to.
// `obj` must not be null (checked with DCHECK).
uint32_t Monitor::GetLockOwnerThreadId(mirror::Object* obj) {
  DCHECK(obj != nullptr);
  LockWord lock_word = obj->GetLockWord(true);
  switch (lock_word.GetState()) {
    case LockWord::kHashCode:
      // Fall-through.
    case LockWord::kUnlocked:
      return ThreadList::kInvalidThreadId;
    case LockWord::kThinLocked:
      return lock_word.ThinLockOwner();
    case LockWord::kFatLocked: {
      Monitor* mon = lock_word.FatLockMonitor();
      return mon->GetOwnerThreadId();
    }
    default: {
      // Any other lock-word state is treated as a fatal error.
      LOG(FATAL) << "Unreachable";
      UNREACHABLE();
    }
  }
}

有了這兩個方法,代碼實現起來就比較簡單了:

  • 獲取所有的線程,判斷是不是 BLOCKED 狀態
  • 調用 GetContendedMonitor 與 GetLockOwnerThreadId 獲取到被鎖住的線程
  • 對死鎖進行分組,輸出死鎖對應的位置
// Initialization: resolves the two ART Monitor entry points used by the deadlock checks.
//
// Stores `level` in api_level, opens libart.so through ndk_dlopen, then looks up
// Monitor::GetContendedMonitor and Monitor::GetLockOwnerThreadId through ndk_dlsym.
// The second symbol name is chosen by get_lock_owner_symbol_name(api_level).
//
// Returns 0 on success, 1 when libart.so could not be opened, 2 when
// GetContendedMonitor was not found and 3 when GetLockOwnerThreadId was not found.
extern "C"
JNIEXPORT jint JNICALL
Java_com_darren_optimize_day13_NativeThreadMonitor_nativeInit(JNIEnv *env, jclass clazz, jint level) {
    api_level = level;

    void *art_handle = ndk_dlopen("libart.so", RTLD_LAZY);
    if (!art_handle) {
        return 1;
    }

    // Monitor::GetContendedMonitor
    get_contended_monitor = ndk_dlsym(art_handle, "_ZN3art7Monitor19GetContendedMonitorEPNS_6ThreadE");
    if (!get_contended_monitor) {
        return 2;
    }

    // Monitor::GetLockOwnerThreadId
    get_lock_owner_thread = ndk_dlsym(art_handle, get_lock_owner_symbol_name(api_level));
    return get_lock_owner_thread ? 0 : 3;
}

// Finds which thread holds the lock that the given native thread is contending for.
//
// Calls the resolved Monitor::GetContendedMonitor with `native_thread` (the address of
// the native Thread object) and, when that yields an object, passes the object to the
// resolved Monitor::GetLockOwnerThreadId.
//
// Returns the owner thread id, or 0 when the entry points have not been resolved,
// when `native_thread` is 0, or when no contended monitor is reported.
//
// The monitor object is a pointer, so it is kept in a pointer-sized type end to end;
// receiving it as `int` cuts off the upper half of the address on 64-bit builds.
extern "C"
JNIEXPORT jint JNICALL
Java_com_darren_optimize_day13_NativeThreadMonitor_getContentThreadIdArt(JNIEnv *env, jclass clazz,
                                                                         jlong native_thread) {
    using GetContendedMonitorFn = void *(*)(void *);
    using GetLockOwnerThreadIdFn = unsigned int (*)(void *);

    if (get_contended_monitor == nullptr || get_lock_owner_thread == nullptr) {
        return 0;
    }
    if (native_thread == 0) {
        // A null Thread* would be dereferenced inside GetContendedMonitor.
        return 0;
    }

    // `long` has pointer width on both the 32-bit and the 64-bit Android ABIs.
    void *thread_ptr = reinterpret_cast<void *>(static_cast<long>(native_thread));
    void *monitor_obj = reinterpret_cast<GetContendedMonitorFn>(get_contended_monitor)(thread_ptr);
    if (monitor_obj == nullptr) {
        LOGD("GetContendedMonitor return null");
        return 0;
    }
    return static_cast<jint>(
            reinterpret_cast<GetLockOwnerThreadIdFn>(get_lock_owner_thread)(monitor_obj));
}

// Gets the thread id stored inside a native Thread object.
//
// `nativeThread` is the address of the native Thread object. When api_level is above
// 20 (later than Android 5.0) the function returns the int located three ints past
// that address, which is the Thread id used by the monitor.
// NOTE(review): the offset of 3 ints is a hard-coded layout assumption; confirm it
// for every ART version this runs on.
//
// Returns 0 when `nativeThread` is 0 (after logging) or when api_level is 20 or lower.
extern "C"
JNIEXPORT jint JNICALL
Java_com_darren_optimize_day13_NativeThreadMonitor_getThreadIdFromThreadPtr(JNIEnv *env, jclass clazz,
                                                                            jlong nativeThread) {
    if (nativeThread == 0) {
        LOGE("suspendThreadArt failed");
        return 0;
    }
    if (api_level <= 20) {
        return 0;
    }
    const int *thread_words = reinterpret_cast<const int *>(nativeThread);
    return thread_words[3];
}

// Scan every thread and record, for each BLOCKED one, which thread id it is waiting on.
NativeThreadMonitor.nativeInit(Build.VERSION.SDK_INT);
Set<Thread> threads = NativeThreadMonitor.getAllThreads();
for (Thread thread : threads) {
  if (thread.getState() == Thread.State.BLOCKED) {
    // Read the "nativePeer" field reflectively; it is passed on below as the native thread address.
    long threadAddress = (long) ReflectUtil.getFieldObject(thread, "nativePeer");
    // Note: if the address cannot be found, or the thread has already died, the value may be 0 or -1.
    if (threadAddress <= 0) {
      continue;
    }
    // Id of the thread holding the lock this thread is blocked on, and this thread's own id.
    int blockThreadId = NativeThreadMonitor.getContentThreadIdArt(threadAddress);
    int curThreadId = NativeThreadMonitor.getThreadIdFromThreadPtr(threadAddress);
    // Both native calls return 0 on failure, so only keep entries where both ids are known.
    if (blockThreadId != 0 && curThreadId != 0) {
      deadLock.put(curThreadId, new DeadLockThread(curThreadId, blockThreadId, thread));
    }
  }
}

// Build a JSON report with one array per deadlock group and log it.
try {
  // Split all recorded cases into groups.
  ArrayList<HashMap<Integer, Thread>> deadLockThreadGroup = deadLockThreadGroup();
  // Then look for the deadlocks inside each group.
  JSONObject objectGroup = new JSONObject();
  for (int i = 0; i < deadLockThreadGroup.size(); i++) {
    // Take out one group at a time.
    HashMap<Integer, Thread> group = deadLockThreadGroup.get(i);
    JSONArray array = new JSONArray();
    for (int curId : group.keySet()) {
      // Look up the DeadLockThread record for this thread id.
      DeadLockThread deadLockThread = deadLock.get(curId);
      if (deadLockThread == null) {
        continue;
      }
      // Look up the thread this one is waiting on; skip the entry if it is not in the same group.
      Thread waitThread = group.get(deadLockThread.blockId);
      if (waitThread == null) {
        continue;
      }
      Thread deadThread = group.get(curId);
      JSONObject temp = new JSONObject();
      JSONArray stacks = new JSONArray();
      temp.put("thread_name", deadThread.getName());
      temp.put("thread_id", deadThread.getId());
      temp.put("wait_thread", waitThread.getName());
      temp.put("wait_id", waitThread.getId());
      // Attach the blocked thread's stack trace, one string per frame.
      StackTraceElement[] stackTraceElements = deadThread.getStackTrace();
      for (StackTraceElement stackTraceElement : stackTraceElements) {
        stacks.put(stackTraceElement.toString());
      }
      temp.put("thread_stack", stacks);
      array.put(temp);
    }
    objectGroup.put("dead_lock_group_" + i, array);
  }
  Log.e("TAG", objectGroup.toString());
} catch (Exception e) {
    e.printStackTrace();
}

監控存活周期:

有些場景下我們想監控線程的存活周期,也就是說線程從開始啟動到運行結束總共存活了多長時間,占了多少內存,占了多少 CPU 等等,異常的情況下我們線下要給出警告,線上要上報到服務器。目前我們能想到兩種方案:一種是采用之前講的 ASM 插樁的方式,但是這種方案很多場景不適用;還有一種是今天要講到的 Native 插樁。插樁點依舊是之前的線程創建的底層原理:

http://androidxref.com/9.0.0_r3/xref/art/runtime/thread.cc
// This is the method we ultimately want to monitor (body elided in this excerpt).
void* Thread::CreateCallback(void* arg) {
    // ...
}

// Pointer through which create_call_back invokes the original function. Its address is
// handed to registerInlineHook, which presumably stores the trampoline here (confirm
// against the hook library). It stays NULL until then.
void *(*old_create_call_back)(void *) = NULL;

void *create_call_back(void *args) {
    // 記錄開始時(shí)間
    long startTime = time(NULL);
    // 調(diào)用原始方法
    void *result = old_create_call_back(args);
    // 獲取當(dāng)前線程信息,計(jì)算輸出存活時(shí)間
    int tid = gettid();
    const char *thread_name = getThreadName(gettid());
    long alive_time = time(NULL) - startTime;
    LOGE("線程信息:thread_id = %d, thread_name = %s, alive_time = %lds", tid, thread_name, alive_time);
    // 獲取內(nèi)存占用,獲取 cpu 占用率,異常情況輸出警告
    return result;
}

// Installs an inline hook on art::Thread::CreateCallback so that create_call_back runs
// in its place.
//
// Steps: open libart.so, resolve the mangled CreateCallback symbol, register the hook
// (passing the address of old_create_call_back), then activate it. Every step is
// checked and the function returns early with an error log on the first failure, so a
// missing library or symbol never reaches the hook library as address 0, and
// inlineHook is never called for a hook that failed to register.
//
// NOTE(review): the hook API takes uint32_t addresses, so this only suits 32-bit builds.
extern "C"
JNIEXPORT void JNICALL
Java_com_darren_optimize_day13_NativeThreadMonitor_monitoringThread(JNIEnv *env, jclass clazz) {
    void *so_addr = ndk_dlopen("libart.so", RTLD_LAZY);
    if (so_addr == nullptr) {
        LOGE("monitoringThread ndk_dlopen libart.so error");
        return;
    }

    void *thread_create_call_back = ndk_dlsym(so_addr, "_ZN3art6Thread14CreateCallbackEPv");
    if (thread_create_call_back == nullptr) {
        LOGE("monitoringThread ndk_dlsym CreateCallback error");
        return;
    }

    if (registerInlineHook((uint32_t) thread_create_call_back, (uint32_t) create_call_back,
                           (uint32_t **) &old_create_call_back) != ELE7EN_OK) {
        LOGE("monitoringThread registerInlineHook error");
        return;
    }
    LOGE("monitoringThread registerInlineHook ok");

    if (inlineHook((uint32_t) thread_create_call_back) != ELE7EN_OK) {
        LOGE("monitoringThread inlineHook error");
    } else {
        LOGE("monitoringThread inlineHook ok");
    }
}

監控 CPU 占用率:

cpu 占用率比較簡單,我們只需要解析到 /proc/pid/task/tid/stat 與 /proc/pid/stat 即可。

// Process stat information.
//
// Reads the first line of /proc/<pid>/stat for the current process, strips one trailing
// newline, logs it and returns it.
//
// Returns a calloc'ed buffer of THREAD_NAME_LENGTH bytes that the caller must free().
// The buffer holds an empty string when the file could not be opened or read. Returns
// NULL only when the buffer itself could not be allocated.
extern const char *getProgressInfo() {
    char *line = static_cast<char *>(calloc(1, THREAD_NAME_LENGTH));
    if (line == nullptr) {
        LOGE("progress info -> allocation failed");
        return nullptr;
    }

    // The path is only needed inside this function, so it lives on the stack.
    char path[PATH_MAX];
    snprintf(path, sizeof(path), "/proc/%d/stat", getpid());

    FILE *stat_file = fopen(path, "r");
    if (stat_file != nullptr) {
        if (fgets(line, THREAD_NAME_LENGTH, stat_file) == nullptr) {
            // Nothing was read; make sure the buffer is a valid empty string.
            line[0] = '\0';
        }
        fclose(stat_file);
    }

    // Check the length first: indexing length - 1 on an empty string reads before the buffer.
    const size_t length = strlen(line);
    if (length > 0 && line[length - 1] == '\n') {
        line[length - 1] = '\0';
    }
    LOGE("progress info ->%s", line);
    return line;
}
// Thread stat information.
//
// Reads the first line of /proc/<pid>/task/<tid>/stat for the calling thread, strips one
// trailing newline, logs it and returns it.
//
// Returns a calloc'ed buffer of THREAD_NAME_LENGTH bytes that the caller must free().
// The buffer holds an empty string when the file could not be opened or read. Returns
// NULL only when the buffer itself could not be allocated.
extern const char *getThreadInfo() {
    char *line = static_cast<char *>(calloc(1, THREAD_NAME_LENGTH));
    if (line == nullptr) {
        LOGE("thread info -> allocation failed");
        return nullptr;
    }

    // The path is only needed inside this function, so it lives on the stack.
    char path[PATH_MAX];
    snprintf(path, sizeof(path), "/proc/%d/task/%d/stat", getpid(), gettid());

    FILE *stat_file = fopen(path, "r");
    if (stat_file != nullptr) {
        if (fgets(line, THREAD_NAME_LENGTH, stat_file) == nullptr) {
            // Nothing was read; make sure the buffer is a valid empty string.
            line[0] = '\0';
        }
        fclose(stat_file);
    }

    // Check the length first: indexing length - 1 on an empty string reads before the buffer.
    const size_t length = strlen(line);
    if (length > 0 && line[length - 1] == '\n') {
        line[length - 1] = '\0';
    }
    LOGE("thread info ->%s", line);
    return line;
}

寫在最后:

效能優化這東西其實可做可不做,不像需求能快速的看到收益和效果,所以這也是很多同學比較缺失的一個部分。為什么我們要看重這點,因為今天市場上比較成功的公司基本都做到了"一拖三"。首先,是團隊很強 - 創始人和團隊很強,在一個比較強的團隊帶領下,需要做到另外三點,要么是把用戶體驗提升了、要么能降低成本、要么能提升效率,有的時候我們的成本也沒下降,效率也沒提升,但是如果能把用戶體驗做得極致,也可以。總之,在一個優秀的、成功的團隊基礎之上,我們只要能夠把用戶體驗、能夠把成本或者效率這三者至少做到一點,同時另外兩點又沒有減損的話,基本上就可以成了。

最后編輯于
©著作權歸作者所有,轉載或內容合作請聯系作者
【社區內容提示】社區部分內容疑似由AI輔助生成,瀏覽時請結合常識與多方信息審慎甄別。
平臺聲明:文章內容(如有圖片或視頻亦包括在內)由作者上傳并發布,文章內容僅代表作者本人觀點,簡書系信息發布平臺,僅提供信息存儲服務。

相關(guān)閱讀更多精彩內(nèi)容

友情鏈接更多精彩內(nèi)容